diff --git a/flash_attn/impls/artifacts/benchmark/attn.jsonl b/flash_attn/impls/artifacts/benchmark/attn.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..281422152b7165845e3f3f86639f60f2bca83f93
--- /dev/null
+++ b/flash_attn/impls/artifacts/benchmark/attn.jsonl
@@ -0,0 +1,6 @@
+{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4453760087490082, "p50": 0.45241600275039673, "p90": 0.45257601141929626, "mean": 0.4501312017440796, "reps": 5, "warmup": 2}, "compile_ms": 1.8144960403442383, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4647679924964905, "p50": 0.46665599942207336, "p90": 0.47142401337623596, "mean": 0.46863360404968263, "reps": 5, "warmup": 2}, "compile_ms": 0.3614720106124878, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.597823977470398, "p50": 0.6007360219955444, "p90": 0.6015999913215637, "mean": 0.6010496020317078, "reps": 5, "warmup": 2}, "compile_ms": 0.4886080026626587, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6014080047607422, "p50": 0.6025919914245605, "p90": 0.6026239991188049, "mean": 0.6072191953659057, "reps": 5, "warmup": 2}, "compile_ms": 0.4956800043582916, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6389120221138, "p50": 0.6423360109329224, "p90": 0.6447039842605591, "mean": 0.6453696012496948, "reps": 5, "warmup": 2}, "compile_ms": 0.532256007194519, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6434879899024963, "p50": 0.6450560092926025, "p90": 0.6518719792366028, "mean": 0.6475072026252746, "reps": 5, "warmup": 2}, "compile_ms": 0.535040020942688, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
diff --git a/flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl b/flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..7a01a71a0c21cca52d23c3bc8bc60dddce52a94a
--- /dev/null
+++ b/flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl
@@ -0,0 +1,6 @@
+{"ts": "2025-10-02T15:02:05Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5145599842071533, "p50": 0.5220479965209961, "p90": 0.5232319831848145, "mean": 0.5199103951454163, "reps": 5, "warmup": 2}, "compile_ms": 3343.085205078125, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:02:06Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5552319884300232, "p50": 0.5602560043334961, "p90": 0.5604159832000732, "mean": 0.5585088014602662, "reps": 5, "warmup": 2}, "compile_ms": 471.8746032714844, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:02:06Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6817600131034851, "p50": 0.6845120191574097, "p90": 0.6866880059242249, "mean": 0.6862144112586975, "reps": 5, "warmup": 2}, "compile_ms": 469.6441650390625, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:02:07Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7130560278892517, "p50": 0.7160000205039978, "p90": 0.7172480225563049, "mean": 0.7158400177955627, "reps": 5, "warmup": 2}, "compile_ms": 471.8545227050781, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:02:07Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7383360266685486, "p50": 0.746783971786499, "p90": 0.7520319819450378, "mean": 0.7461183905601502, "reps": 5, "warmup": 2}, "compile_ms": 473.72625732421875, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:02:08Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7711359858512878, "p50": 0.7734079957008362, "p90": 0.7748159766197205, "mean": 0.7733887910842896, "reps": 5, "warmup": 2}, "compile_ms": 476.75982666015625, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
diff --git a/flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl b/flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..5a92dadfb15cb32438e155409a450a70fb71dccf
--- /dev/null
+++ b/flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl
@@ -0,0 +1,6 @@
+{"ts": "2025-10-02T15:02:53Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6211519837379456, "p50": 0.6424639821052551, "p90": 0.6726719737052917, "mean": 0.6559999942779541, "reps": 5, "warmup": 2}, "compile_ms": 4537.6962890625, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:02:55Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.665503978729248, "p50": 0.6812480092048645, "p90": 0.7109439969062805, "mean": 0.7009024024009705, "reps": 5, "warmup": 2}, "compile_ms": 1491.3409423828125, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:02:56Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8014079928398132, "p50": 0.8136320114135742, "p90": 0.8414080142974854, "mean": 0.8342463970184326, "reps": 5, "warmup": 2}, "compile_ms": 1269.2235107421875, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:02:58Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8420799970626831, "p50": 0.8514879941940308, "p90": 0.8752319812774658, "mean": 0.8708159923553467, "reps": 5, "warmup": 2}, "compile_ms": 1631.2921142578125, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:03:00Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8840640187263489, "p50": 0.8960639834403992, "p90": 0.9062719941139221, "mean": 0.9071423888206482, "reps": 5, "warmup": 2}, "compile_ms": 1919.3294677734375, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:03:02Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9141759872436523, "p50": 0.9165440201759338, "p90": 0.9380800127983093, "mean": 0.9373440027236939, "reps": 5, "warmup": 2}, "compile_ms": 1484.717529296875, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..12c02a967606bf5974e797deac59e0a0af2e284e
--- /dev/null
+++ b/flash_attn/impls/cells/benchmark.py
@@ -0,0 +1,68 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch",
+# "kernels-benchmark-tools",
+# "xformers",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+import xformers.ops as xops
+
+
+def xformers_attention(q, k, v):
+ """xFormers memory efficient attention"""
+ # xFormers expects [batch, seq_len, heads, head_dim]
+ return xops.memory_efficient_attention(q, k, v)
+
+
+kbt.add(
+ "xformers_meff",
+ xformers_attention,
+ tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
+)
+
+if __name__ == "__main__":
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ dtype = "float32" if device == "cpu" else "bfloat16"
+
+ # Flux-like workloads
+ base = 1024 if device == "cuda" else 512
+ flux_sizes = (
+ [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+ )
+ heads = 24 if device == "cuda" else 8
+ head_dim = 128 if device == "cuda" else 64
+
+ wl = []
+ for L in flux_sizes:
+ wl.append(
+ {
+ "name": f"flux_L{L}",
+ "batch": 1,
+ "seq_len": base + L,
+ "heads": heads,
+ "head_dim": head_dim,
+ "dtype": dtype,
+ "device": device,
+ "seed": 0,
+ }
+ )
+
+ kbt.run(
+ wl,
+ jsonl="attn.jsonl",
+ reps=5,
+ warmup=2,
+ gen=kbt.attn.gen_qkv,
+ ref=kbt.attn.ref_math,
+ cmp=kbt.attn.cmp_allclose,
+ )
+ kbt.summarize(["attn.jsonl"])
\ No newline at end of file
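The layout comment in xformers_attention is the reason this cell needs no transposes: xops.memory_efficient_attention consumes and produces [batch, seq_len, heads, head_dim], while torch's scaled_dot_product_attention wants [batch, heads, seq_len, head_dim]. A quick equivalence sketch, assuming a CUDA device and loose tolerances on account of bfloat16:

import torch
import xformers.ops as xops

q = torch.randn(1, 1152, 24, 128, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)

out_x = xops.memory_efficient_attention(q, k, v)  # [B, S, H, D] in and out
out_t = torch.nn.functional.scaled_dot_product_attention(
    q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)  # SDPA takes [B, H, S, D]
).transpose(1, 2)
assert torch.allclose(out_x, out_t, rtol=2e-2, atol=2e-2)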
diff --git a/flash_attn/impls/cells/benchmark_default.py b/flash_attn/impls/cells/benchmark_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc2fd06ac69ffe1f5bc88d1821b17447dc90c846
--- /dev/null
+++ b/flash_attn/impls/cells/benchmark_default.py
@@ -0,0 +1,70 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+
+
+def torch_flash_base(q, k, v):
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+ with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+ return o.transpose(1, 2).contiguous()
+
+
+# Compile with default mode
+compiled_flash_default = torch.compile(torch_flash_base, mode="default", fullgraph=True, dynamic=False)
+
+kbt.add(
+ "torch_flash_compiled_default",
+ compiled_flash_default,
+ tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "default"},
+)
+
+if __name__ == "__main__":
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ dtype = "float32" if device == "cpu" else "bfloat16"
+
+ # Flux-like workloads
+ base = 1024 if device == "cuda" else 512
+ flux_sizes = (
+ [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+ )
+ heads = 24 if device == "cuda" else 8
+ head_dim = 128 if device == "cuda" else 64
+
+ wl = []
+ for L in flux_sizes:
+ wl.append(
+ {
+ "name": f"flux_L{L}",
+ "batch": 1,
+ "seq_len": base + L,
+ "heads": heads,
+ "head_dim": head_dim,
+ "dtype": dtype,
+ "device": device,
+ "seed": 0,
+ }
+ )
+
+ kbt.run(
+ wl,
+ jsonl="attn_default.jsonl",
+ reps=5,
+ warmup=2,
+ gen=kbt.attn.gen_qkv,
+ ref=kbt.attn.ref_math,
+ cmp=kbt.attn.cmp_allclose,
+ )
+ kbt.summarize(["attn_default.jsonl"])
\ No newline at end of file
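The compile_ms column in attn_default.jsonl puts numbers on this cell's one-time cost: the first workload pays about 3.3 s for Dynamo/Inductor compilation, and each later workload still pays roughly 470 ms, presumably a recompile per new seq_len since the function is compiled with dynamic=False. A rough sketch for observing the cold/warm split directly; it reuses torch_flash_base from the cell above:

import time
import torch

fn = torch.compile(torch_flash_base, mode="default", fullgraph=True, dynamic=False)
q = torch.randn(1, 1152, 24, 128, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)

for label in ("cold", "warm"):
    t0 = time.perf_counter()
    fn(q, k, v)
    torch.cuda.synchronize()  # make sure the kernels finish before stopping the clock
    print(f"{label}: {(time.perf_counter() - t0) * 1e3:.1f} ms")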
diff --git a/flash_attn/impls/cells/benchmark_max_autotune.py b/flash_attn/impls/cells/benchmark_max_autotune.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd96e676c4d9ebdf709b701a7b9a71b9d51774fd
--- /dev/null
+++ b/flash_attn/impls/cells/benchmark_max_autotune.py
@@ -0,0 +1,70 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+
+
+def torch_flash_base(q, k, v):
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+ with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+ return o.transpose(1, 2).contiguous()
+
+
+# Compile with max-autotune mode
+compiled_flash_max_autotune = torch.compile(torch_flash_base, mode="max-autotune", fullgraph=True, dynamic=False)
+
+kbt.add(
+ "torch_flash_compiled_max_autotune",
+ compiled_flash_max_autotune,
+ tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
+)
+
+if __name__ == "__main__":
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ dtype = "float32" if device == "cpu" else "bfloat16"
+
+ # Flux-like workloads
+ base = 1024 if device == "cuda" else 512
+ flux_sizes = (
+ [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+ )
+ heads = 24 if device == "cuda" else 8
+ head_dim = 128 if device == "cuda" else 64
+
+ wl = []
+ for L in flux_sizes:
+ wl.append(
+ {
+ "name": f"flux_L{L}",
+ "batch": 1,
+ "seq_len": base + L,
+ "heads": heads,
+ "head_dim": head_dim,
+ "dtype": dtype,
+ "device": device,
+ "seed": 0,
+ }
+ )
+
+ kbt.run(
+ wl,
+ jsonl="attn_max_autotune.jsonl",
+ reps=5,
+ warmup=2,
+ gen=kbt.attn.gen_qkv,
+ ref=kbt.attn.ref_math,
+ cmp=kbt.attn.cmp_allclose,
+ )
+ kbt.summarize(["attn_max_autotune.jsonl"])
\ No newline at end of file
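Comparing this run's artifact with the default-mode one is instructive: on this A10G, max-autotune never wins on latency at these shapes (0.64 ms vs 0.52 ms p50 at flux_L128, 0.92 ms vs 0.77 ms at flux_L512) and compiles more slowly, but it does report lower peak memory (about 70.8 MB vs 87.4 MB at flux_L128). A small sketch to line the two JSONL files up, assuming both sit in the working directory:

import json

def by_workload(path):
    out = {}
    with open(path) as f:
        for line in f:
            r = json.loads(line)
            out[r["wl"]["name"]] = r
    return out

default = by_workload("attn_default.jsonl")
autotune = by_workload("attn_max_autotune.jsonl")
for wl, d in default.items():
    a = autotune[wl]
    print(f'{wl}: p50 {d["lat_ms"]["p50"]:.2f} -> {a["lat_ms"]["p50"]:.2f} ms, '
          f'peak {d["peak_bytes"] / 1e6:.1f} -> {a["peak_bytes"] / 1e6:.1f} MB')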
diff --git a/flash_attn/impls/cells/nv.py b/flash_attn/impls/cells/nv.py
new file mode 100644
index 0000000000000000000000000000000000000000..80eef60a7536ed875fb21731ab2d059458bd20b4
--- /dev/null
+++ b/flash_attn/impls/cells/nv.py
@@ -0,0 +1,3 @@
+import subprocess
+
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
\ No newline at end of file
diff --git a/flash_attn/impls/compiled_variants.html b/flash_attn/impls/compiled_variants.html
new file mode 100644
index 0000000000000000000000000000000000000000..aaf874764371833762d91aff9aa3101cdcc9a8f7
--- /dev/null
+++ b/flash_attn/impls/compiled_variants.html
@@ -0,0 +1,4028 @@
+compiled_variants
+Linux x86_64 | Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36
+
+Torch Compile Variants
+
+This file benchmarks Flash Attention with different torch.compile modes.
+
+Flash Attention with torch.compile(mode="default")
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+
+
+def torch_flash_base(q, k, v):
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+ with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+ return o.transpose(1, 2).contiguous()
+
+
+# Compile with default mode
+compiled_flash_default = torch.compile(torch_flash_base, mode="default", fullgraph=True, dynamic=False)
+
+kbt.add(
+ "torch_flash_compiled_default",
+ compiled_flash_default,
+ tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "default"},
+)
+
+if __name__ == "__main__":
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ dtype = "float32" if device == "cpu" else "bfloat16"
+
+ # Flux-like workloads
+ base = 1024 if device == "cuda" else 512
+ flux_sizes = (
+ [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+ )
+ heads = 24 if device == "cuda" else 8
+ head_dim = 128 if device == "cuda" else 64
+
+ wl = []
+ for L in flux_sizes:
+ wl.append(
+ {
+ "name": f"flux_L{L}",
+ "batch": 1,
+ "seq_len": base + L,
+ "heads": heads,
+ "head_dim": head_dim,
+ "dtype": dtype,
+ "device": device,
+ "seed": 0,
+ }
+ )
+
+ kbt.run(
+ wl,
+ jsonl="attn_default.jsonl",
+ reps=5,
+ warmup=2,
+ gen=kbt.attn.gen_qkv,
+ ref=kbt.attn.ref_math,
+ cmp=kbt.attn.cmp_allclose,
+ )
+ kbt.summarize(["attn_default.jsonl"])
+
+impl                           wl         p50(ms)  ok
+torch_flash_compiled_default   flux_L128  0.52     True
+torch_flash_compiled_default   flux_L256  0.56     True
+torch_flash_compiled_default   flux_L320  0.68     True
+torch_flash_compiled_default   flux_L384  0.72     True
+torch_flash_compiled_default   flux_L448  0.75     True
+torch_flash_compiled_default   flux_L512  0.77     True
+
+ Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
+Downloading triton (148.3MiB)
+Downloading numpy (16.2MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading sympy (6.0MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading pillow (6.3MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading setuptools (1.1MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading matplotlib (8.3MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading torch (846.9MiB)
+Downloading kiwisolver (1.4MiB)
+ Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
+ Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading setuptools
+ Downloading fonttools
+ Downloading networkx
+ Downloading pillow
+ Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading numpy
+ Downloading sympy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+Installed 37 packages in 545ms
+
+Flash Attention with torch.compile(mode="max-autotune")
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+
+
+def torch_flash_base(q, k, v):
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+ with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+ return o.transpose(1, 2).contiguous()
+
+
+# Compile with max-autotune mode
+compiled_flash_max_autotune = torch.compile(torch_flash_base, mode="max-autotune", fullgraph=True, dynamic=False)
+
+kbt.add(
+ "torch_flash_compiled_max_autotune",
+ compiled_flash_max_autotune,
+ tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
+)
+
+if __name__ == "__main__":
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ dtype = "float32" if device == "cpu" else "bfloat16"
+
+ # Flux-like workloads
+ base = 1024 if device == "cuda" else 512
+ flux_sizes = (
+ [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+ )
+ heads = 24 if device == "cuda" else 8
+ head_dim = 128 if device == "cuda" else 64
+
+ wl = []
+ for L in flux_sizes:
+ wl.append(
+ {
+ "name": f"flux_L{L}",
+ "batch": 1,
+ "seq_len": base + L,
+ "heads": heads,
+ "head_dim": head_dim,
+ "dtype": dtype,
+ "device": device,
+ "seed": 0,
+ }
+ )
+
+ kbt.run(
+ wl,
+ jsonl="attn_max_autotune.jsonl",
+ reps=5,
+ warmup=2,
+ gen=kbt.attn.gen_qkv,
+ ref=kbt.attn.ref_math,
+ cmp=kbt.attn.cmp_allclose,
+ )
+ kbt.summarize(["attn_max_autotune.jsonl"])
+
+impl                                wl         p50(ms)  ok
+torch_flash_compiled_max_autotune   flux_L128  0.64     True
+torch_flash_compiled_max_autotune   flux_L256  0.68     True
+torch_flash_compiled_max_autotune   flux_L320  0.81     True
+torch_flash_compiled_max_autotune   flux_L384  0.85     True
+torch_flash_compiled_max_autotune   flux_L448  0.90     True
+torch_flash_compiled_max_autotune   flux_L512  0.92     True
+
+ Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading numpy (16.2MiB)
+Downloading matplotlib (8.3MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading setuptools (1.1MiB)
+Downloading pillow (6.3MiB)
+Downloading sympy (6.0MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading triton (148.3MiB)
+Downloading torch (846.9MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+ Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
+ Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading setuptools
+ Downloading networkx
+ Downloading fonttools
+ Downloading pillow
+ Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading numpy
+ Downloading sympy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+Installed 37 packages in 526ms
+
\ No newline at end of file
diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html
new file mode 100644
index 0000000000000000000000000000000000000000..0aebfc1053c7654f5de03a6500533d74fabdd85e
--- /dev/null
+++ b/flash_attn/impls/flash_attention.html
@@ -0,0 +1,3924 @@
+flash_attention
+Linux x86_64 | Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36
+
+Flash Attention Implementation
+
+GPU Info
+
+import subprocess
+
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+Thu Oct 2 15:03:41 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
+|-----------------------------------------+------------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+========================+======================|
+| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
+| 0% 31C P0 87W / 300W | 0MiB / 23028MiB | 0% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
+| 0% 25C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
+| 0% 26C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
+| 0% 25C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=========================================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------------------+
+
+Flash Attention Benchmark
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+
+
+def torch_flash(q, k, v):
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+ with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+ return o.transpose(1, 2).contiguous()
+
+kbt.add(
+ "torch_flash_ma",
+ torch_flash,
+    tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "none"},
+)
+
+if __name__ == "__main__":
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ dtype = "float32" if device == "cpu" else "bfloat16"
+
+ # Flux-like workloads scaled down for CPU testing
+ base = 1024 if device == "cuda" else 512
+ flux_sizes = (
+ [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+ )
+ heads = 24 if device == "cuda" else 8
+ head_dim = 128 if device == "cuda" else 64
+
+ wl = []
+ for L in flux_sizes:
+ wl.append(
+ {
+ "name": f"flux_L{L}",
+ "batch": 1,
+ "seq_len": base + L,
+ "heads": heads,
+ "head_dim": head_dim,
+ "dtype": dtype,
+ "device": device,
+ "seed": 0,
+ }
+ )
+
+ kbt.run(
+ wl,
+ jsonl="attn.jsonl",
+ reps=5,
+ warmup=2,
+ gen=kbt.attn.gen_qkv,
+ ref=kbt.attn.ref_math,
+ cmp=kbt.attn.cmp_allclose,
+ )
+ kbt.summarize(["attn.jsonl"])
+
+impl             wl         p50(ms)  ok
+torch_flash_ma   flux_L128  0.48     True
+torch_flash_ma   flux_L256  0.53     True
+torch_flash_ma   flux_L320  0.65     True
+torch_flash_ma   flux_L384  0.68     True
+torch_flash_ma   flux_L448  0.71     True
+torch_flash_ma   flux_L512  0.74     True
+
+ Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
+Downloading networkx (1.9MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading setuptools (1.1MiB)
+Downloading sympy (6.0MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading pillow (6.3MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading numpy (16.2MiB)
+Downloading matplotlib (8.3MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading torch (846.9MiB)
+Downloading triton (148.3MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+ Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
+ Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading setuptools
+ Downloading fonttools
+ Downloading networkx
+ Downloading pillow
+ Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading numpy
+ Downloading sympy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+Installed 37 packages in 560ms
+
\ No newline at end of file
diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html
new file mode 100644
index 0000000000000000000000000000000000000000..0c4b9c8387e21235521f9378733f6d05c113b924
--- /dev/null
+++ b/flash_attn/impls/hf_kernels_flash_attn.html
@@ -0,0 +1,3875 @@
+hf_kernels_flash_attn
+Linux x86_64 | Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36
+
+HF Kernels - Flash Attention
+
+HuggingFace Kernels Flash Attention Benchmark
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch",
+# "kernels-benchmark-tools",
+# "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+from kernels import get_kernel
+
+hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn")
+
+
+def hf_flash_attention(query, key, value):
+ """HuggingFace Kernels Flash Attention"""
+ return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0]
+
+
+kbt.add(
+ "hf_kernels_flash_attn",
+ hf_flash_attention,
+ tags={"family": "hf-kernels", "backend": "flash-attn", "compile": "none"},
+)
+
+if __name__ == "__main__":
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ if device == "cpu":
+ print("HF Kernels Flash Attention requires CUDA - skipping benchmark")
+ sys.exit(0)
+
+ dtype = "bfloat16"
+
+ # Flux-like workloads
+ base = 1024
+ flux_sizes = [128, 256, 320, 384, 448, 512]
+ heads = 24
+ head_dim = 128
+
+ wl = []
+ for L in flux_sizes:
+ wl.append(
+ {
+ "name": f"flux_L{L}",
+ "batch": 1,
+ "seq_len": base + L,
+ "heads": heads,
+ "head_dim": head_dim,
+ "dtype": dtype,
+ "device": device,
+ "seed": 0,
+ }
+ )
+
+ kbt.run(
+ wl,
+ jsonl="attn.jsonl",
+ reps=5,
+ warmup=2,
+ gen=kbt.attn.gen_qkv,
+ ref=kbt.attn.ref_math,
+ cmp=kbt.attn.cmp_allclose,
+ )
+ kbt.summarize(["attn.jsonl"])
+
+impl                    wl         p50(ms)  ok
+hf_kernels_flash_attn   flux_L128  0.35     True
+hf_kernels_flash_attn   flux_L256  0.38     True
+hf_kernels_flash_attn   flux_L320  0.50     True
+hf_kernels_flash_attn   flux_L384  0.52     True
+hf_kernels_flash_attn   flux_L448  0.54     True
+hf_kernels_flash_attn   flux_L512  0.56     True
+
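One detail worth noting in the cell above: fwd returns a tuple, and the benchmarked callable takes element [0], the attention output. The flash-attn3 page below follows the same convention with flash_attn_func.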
+ Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
+Downloading sympy (6.0MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading pillow (6.3MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading triton (148.3MiB)
+Downloading setuptools (1.1MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading matplotlib (8.3MiB)
+Downloading torch (846.9MiB)
+Downloading hf-xet (3.0MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading numpy (16.2MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+ Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
+ Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading hf-xet
+ Downloading setuptools
+ Downloading fonttools
+ Downloading networkx
+ Downloading pillow
+ Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading numpy
+ Downloading sympy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+Installed 47 packages in 457ms
+
+Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 19.65it/s]
+
\ No newline at end of file
diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html
new file mode 100644
index 0000000000000000000000000000000000000000..00f11b620393adb189b56816c91fed1733267bbf
--- /dev/null
+++ b/flash_attn/impls/hf_kernels_flash_attn3.html
@@ -0,0 +1,3874 @@
+hf_kernels_flash_attn3
+Linux x86_64 | Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36
+
+HF Kernels - Flash Attention 3
+
+HuggingFace Kernels Flash Attention 3 Benchmark
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch",
+# "kernels-benchmark-tools",
+# "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+from kernels import get_kernel
+
+hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
+
+
+def hf_flash_attention3(query, key, value):
+ return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
+
+
+kbt.add(
+ "hf_kernels_flash_attn3",
+ hf_flash_attention3,
+ tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
+)
+
+if __name__ == "__main__":
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ if device == "cpu":
+ print("HF Kernels Flash Attention 3 requires CUDA - skipping benchmark")
+ sys.exit(0)
+
+ dtype = "bfloat16"
+
+ # Flux-like workloads
+ base = 1024
+ flux_sizes = [128, 256, 320, 384, 448, 512]
+ heads = 24
+ head_dim = 128
+
+ wl = []
+ for L in flux_sizes:
+ wl.append(
+ {
+ "name": f"flux_L{L}",
+ "batch": 1,
+ "seq_len": base + L,
+ "heads": heads,
+ "head_dim": head_dim,
+ "dtype": dtype,
+ "device": device,
+ "seed": 0,
+ }
+ )
+
+ kbt.run(
+ wl,
+ jsonl="attn.jsonl",
+ reps=5,
+ warmup=2,
+ gen=kbt.attn.gen_qkv,
+ ref=kbt.attn.ref_math,
+ cmp=kbt.attn.cmp_allclose,
+ )
+ kbt.summarize(["attn.jsonl"])
+
+impl                     wl         p50(ms)  ok
+hf_kernels_flash_attn3   flux_L128  0.36     True
+hf_kernels_flash_attn3   flux_L256  0.39     True
+hf_kernels_flash_attn3   flux_L320  0.52     True
+hf_kernels_flash_attn3   flux_L384  0.53     True
+hf_kernels_flash_attn3   flux_L448  0.57     True
+hf_kernels_flash_attn3   flux_L512  0.57     True
+
+ Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
+Downloading sympy (6.0MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading numpy (16.2MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading torch (846.9MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading matplotlib (8.3MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading hf-xet (3.0MiB)
+Downloading fonttools (4.7MiB)
+Downloading pillow (6.3MiB)
+Downloading triton (148.3MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading setuptools (1.1MiB)
+ Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
+ Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading hf-xet
+ Downloading setuptools
+ Downloading fonttools
+ Downloading networkx
+ Downloading pillow
+ Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading numpy
+ Downloading sympy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+Installed 47 packages in 453ms
+
+Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.56it/s]
+
\ No newline at end of file
diff --git a/flash_attn/impls/index.html b/flash_attn/impls/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..16f13ae51cc53a9df48c5e0d99ab1e6ea0aeb5ce
--- /dev/null
+++ b/flash_attn/impls/index.html
@@ -0,0 +1,30 @@
+Directory Index
+
+Index of /flash_attn/impls
+
\ No newline at end of file
diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html
new file mode 100644
index 0000000000000000000000000000000000000000..b0b24198c46517be49a3d083f94c569ed35c387d
--- /dev/null
+++ b/flash_attn/impls/mem_efficient_attention.html
@@ -0,0 +1,3865 @@
+mem_efficient_attention
+Linux x86_64 | Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36
+
+Memory Efficient Attention Implementation
+
+Memory Efficient SDPA Benchmark
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+
+
+def torch_mem_eff(q, k, v):
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+ with torch.nn.attention.sdpa_kernel(
+ torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION
+ ):
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+ return o.transpose(1, 2).contiguous()
+
+kbt.add(
+ "torch_mem_eff",
+ torch_mem_eff,
+ tags={"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"},
+)
+
+if __name__ == "__main__":
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ dtype = "float32" if device == "cpu" else "bfloat16"
+
+ # Flux-like workloads scaled down for CPU testing
+ base = 1024 if device == "cuda" else 512
+ flux_sizes = (
+ [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+ )
+ heads = 24 if device == "cuda" else 8
+ head_dim = 128 if device == "cuda" else 64
+
+ wl = []
+ for L in flux_sizes:
+ wl.append(
+ {
+ "name": f"flux_L{L}",
+ "batch": 1,
+ "seq_len": base + L,
+ "heads": heads,
+ "head_dim": head_dim,
+ "dtype": dtype,
+ "device": device,
+ "seed": 0,
+ }
+ )
+
+ kbt.run(
+ wl,
+ jsonl="attn.jsonl",
+ reps=5,
+ warmup=2,
+ gen=kbt.attn.gen_qkv,
+ ref=kbt.attn.ref_math,
+ cmp=kbt.attn.cmp_allclose,
+ )
+ kbt.summarize(["attn.jsonl"])
+
+impl            wl         p50(ms)  ok
+torch_mem_eff   flux_L128  0.59     True
+torch_mem_eff   flux_L256  0.65     True
+torch_mem_eff   flux_L320  0.77     True
+torch_mem_eff   flux_L384  0.79     True
+torch_mem_eff   flux_L448  0.84     True
+torch_mem_eff   flux_L512  0.95     True
+
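Read together with the other pages, the ordering on this A10G is consistent: the HF flash-attn kernels lead (about 0.35 to 0.57 ms p50), then xFormers (0.45 to 0.65 ms), then torch SDPA FLASH (0.48 to 0.77 ms), with this EFFICIENT backend slowest (0.59 to 0.95 ms); at these sizes the compiled SDPA variants add overhead rather than removing it.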
+ Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
+Downloading matplotlib (8.3MiB)
+Downloading pillow (6.3MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading numpy (16.2MiB)
+Downloading setuptools (1.1MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading torch (846.9MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading sympy (6.0MiB)
+Downloading fonttools (4.7MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading triton (148.3MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+ Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
+ Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading setuptools
+ Downloading networkx
+ Downloading fonttools
+ Downloading pillow
+ Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading numpy
+ Downloading sympy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+Installed 37 packages in 453ms
+
\ No newline at end of file
diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html
new file mode 100644
index 0000000000000000000000000000000000000000..0c201326d1815fac1626c0f03854be0562e68cce
--- /dev/null
+++ b/flash_attn/impls/sage_attention.html
@@ -0,0 +1,3890 @@
+sage_attention
+Linux x86_64 | Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36
+
+SageAttention Implementation
+
+SageAttention Benchmark (INT8 Quantized)
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch",
+# "kernels",
+# "kernels-benchmark-tools",
+# "sageattention",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+# from sageattention import sageattn_qk_int8_pv_fp16_cuda
+
+
+# def sage_attention(q, k, v):
+# """SageAttention with INT8 Q/K quantization and FP16 P/V"""
+# return sageattn_qk_int8_pv_fp16_cuda(q, k, v, tensor_layout="NHD")
+
+from kernels import get_kernel
+
+hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention")
+
+
+def sage_attention(query, key, value):
+    """SageAttention via the HuggingFace kernels hub"""
+    return hf_kernels_sage_attn.fwd(query, key, value, is_causal=False)[0]
+
+kbt.add(
+ "sage_int8_fp16",
+ sage_attention,
+ tags={"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"},
+)
+
+if __name__ == "__main__":
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ if device == "cpu":
+ print("SageAttention requires CUDA - skipping benchmark")
+ sys.exit(0)
+
+ dtype = "bfloat16"
+
+ # Flux-like workloads
+ base = 1024
+ flux_sizes = [128, 256, 320, 384, 448, 512]
+ heads = 24
+ head_dim = 128
+
+ wl = []
+ for L in flux_sizes:
+ wl.append(
+ {
+ "name": f"flux_L{L}",
+ "batch": 1,
+ "seq_len": base + L,
+ "heads": heads,
+ "head_dim": head_dim,
+ "dtype": dtype,
+ "device": device,
+ "seed": 0,
+ }
+ )
+
+ kbt.run(
+ wl,
+ jsonl="attn.jsonl",
+ reps=5,
+ warmup=2,
+ gen=kbt.attn.gen_qkv,
+ ref=kbt.attn.ref_math,
+ cmp=kbt.attn.cmp_allclose,
+ )
+ kbt.summarize(["attn.jsonl"])
+
+impl             wl         p50(ms)  ok
+sage_int8_fp16   flux_L128  FAIL     False
+  Error: module 'sage_attention_a39c012a73160148' has no attribute 'fwd'
+sage_int8_fp16   flux_L256  FAIL     False
+  Error: module 'sage_attention_a39c012a73160148' has no attribute 'fwd'
+sage_int8_fp16   flux_L320  FAIL     False
+  Error: module 'sage_attention_a39c012a73160148' has no attribute 'fwd'
+sage_int8_fp16   flux_L384  FAIL     False
+  Error: module 'sage_attention_a39c012a73160148' has no attribute 'fwd'
+sage_int8_fp16   flux_L448  FAIL     False
+  Error: module 'sage_attention_a39c012a73160148' has no attribute 'fwd'
+sage_int8_fp16   flux_L512  FAIL     False
+  Error: module 'sage_attention_a39c012a73160148' has no attribute 'fwd'
+
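All six workloads fail identically: the module that kernels resolves for kernels-community/sage_attention evidently does not export a fwd entry point the way the flash-attn kernel does, so the flash-attn-style call above never reaches a kernel. The commented-out sageattention import at the top of the cell (sageattn_qk_int8_pv_fp16_cuda) is the package's own entry point; either that path or the hub kernel's actual exported function would be needed before these rows produce numbers.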
+ Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading setuptools (1.1MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading triton (148.3MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading torch (846.9MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading pillow (6.3MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading hf-xet (3.0MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading sympy (6.0MiB)
+Downloading numpy (16.2MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading matplotlib (8.3MiB)
+ Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
+ Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading hf-xet
+ Downloading setuptools
+ Downloading fonttools
+ Downloading networkx
+ Downloading pillow
+ Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading numpy
+ Downloading sympy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+Installed 48 packages in 513ms
+
+Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 15.59it/s]
+
\ No newline at end of file
diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html
new file mode 100644
index 0000000000000000000000000000000000000000..995ddf881ce661d53a84caf98ed15ea35a01c2ee
--- /dev/null
+++ b/flash_attn/impls/xformers.html
@@ -0,0 +1,3867 @@
+xformers
+Linux x86_64 | Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36
+
+xFormers Memory Efficient Attention
+
+xFormers Benchmark
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch",
+# "kernels-benchmark-tools",
+# "xformers",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+import xformers.ops as xops
+
+
+def xformers_attention(q, k, v):
+ """xFormers memory efficient attention"""
+ # xFormers expects [batch, seq_len, heads, head_dim]
+ return xops.memory_efficient_attention(q, k, v)
+
+
+kbt.add(
+ "xformers_meff",
+ xformers_attention,
+ tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
+)
+
+if __name__ == "__main__":
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ dtype = "float32" if device == "cpu" else "bfloat16"
+
+ # Flux-like workloads
+ base = 1024 if device == "cuda" else 512
+ flux_sizes = (
+ [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+ )
+ heads = 24 if device == "cuda" else 8
+ head_dim = 128 if device == "cuda" else 64
+
+ wl = []
+ for L in flux_sizes:
+ wl.append(
+ {
+ "name": f"flux_L{L}",
+ "batch": 1,
+ "seq_len": base + L,
+ "heads": heads,
+ "head_dim": head_dim,
+ "dtype": dtype,
+ "device": device,
+ "seed": 0,
+ }
+ )
+
+ kbt.run(
+ wl,
+ jsonl="attn.jsonl",
+ reps=5,
+ warmup=2,
+ gen=kbt.attn.gen_qkv,
+ ref=kbt.attn.ref_math,
+ cmp=kbt.attn.cmp_allclose,
+ )
+ kbt.summarize(["attn.jsonl"])
+
+impl            wl         p50(ms)  ok
+xformers_meff   flux_L128  0.45     True
+xformers_meff   flux_L256  0.47     True
+xformers_meff   flux_L320  0.60     True
+xformers_meff   flux_L384  0.60     True
+xformers_meff   flux_L448  0.64     True
+xformers_meff   flux_L512  0.65     True
+
+ Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
+Downloading numpy (16.2MiB)
+Downloading xformers (111.8MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading setuptools (1.1MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading pillow (6.3MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading sympy (6.0MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading fonttools (4.7MiB)
+Downloading triton (148.3MiB)
+Downloading torch (846.9MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading matplotlib (8.3MiB)
+ Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
+ Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading setuptools
+ Downloading networkx
+ Downloading fonttools
+ Downloading pillow
+ Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading sympy
+ Downloading numpy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
+ Downloading xformers
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading torch
+Installed 38 packages in 522ms
+
\ No newline at end of file
diff --git a/flash_attn/index.html b/flash_attn/index.html
index 398172379434c672102a2bd0e4175dcb8e06f75e..9f7dc880eb75f8403035bd95857991f89fe04872 100644
--- a/flash_attn/index.html
+++ b/flash_attn/index.html
@@ -18,7 +18,8 @@
Index of /flash_attn