drbh (HF Staff) committed
Commit 0cce993 · verified · 1 parent: 6c5c584

Upload folder using huggingface_hub
flash_attn/impls/artifacts/benchmark/attn.jsonl ADDED
@@ -0,0 +1,6 @@
+ {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4453760087490082, "p50": 0.45241600275039673, "p90": 0.45257601141929626, "mean": 0.4501312017440796, "reps": 5, "warmup": 2}, "compile_ms": 1.8144960403442383, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4647679924964905, "p50": 0.46665599942207336, "p90": 0.47142401337623596, "mean": 0.46863360404968263, "reps": 5, "warmup": 2}, "compile_ms": 0.3614720106124878, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.597823977470398, "p50": 0.6007360219955444, "p90": 0.6015999913215637, "mean": 0.6010496020317078, "reps": 5, "warmup": 2}, "compile_ms": 0.4886080026626587, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6014080047607422, "p50": 0.6025919914245605, "p90": 0.6026239991188049, "mean": 0.6072191953659057, "reps": 5, "warmup": 2}, "compile_ms": 0.4956800043582916, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6389120221138, "p50": 0.6423360109329224, "p90": 0.6447039842605591, "mean": 0.6453696012496948, "reps": 5, "warmup": 2}, "compile_ms": 0.532256007194519, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6434879899024963, "p50": 0.6450560092926025, "p90": 0.6518719792366028, "mean": 0.6475072026252746, "reps": 5, "warmup": 2}, "compile_ms": 0.535040020942688, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl ADDED
@@ -0,0 +1,6 @@
+ {"ts": "2025-10-02T15:02:05Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5145599842071533, "p50": 0.5220479965209961, "p90": 0.5232319831848145, "mean": 0.5199103951454163, "reps": 5, "warmup": 2}, "compile_ms": 3343.085205078125, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:06Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5552319884300232, "p50": 0.5602560043334961, "p90": 0.5604159832000732, "mean": 0.5585088014602662, "reps": 5, "warmup": 2}, "compile_ms": 471.8746032714844, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:06Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6817600131034851, "p50": 0.6845120191574097, "p90": 0.6866880059242249, "mean": 0.6862144112586975, "reps": 5, "warmup": 2}, "compile_ms": 469.6441650390625, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:07Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7130560278892517, "p50": 0.7160000205039978, "p90": 0.7172480225563049, "mean": 0.7158400177955627, "reps": 5, "warmup": 2}, "compile_ms": 471.8545227050781, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:07Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7383360266685486, "p50": 0.746783971786499, "p90": 0.7520319819450378, "mean": 0.7461183905601502, "reps": 5, "warmup": 2}, "compile_ms": 473.72625732421875, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:08Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7711359858512878, "p50": 0.7734079957008362, "p90": 0.7748159766197205, "mean": 0.7733887910842896, "reps": 5, "warmup": 2}, "compile_ms": 476.75982666015625, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl ADDED
@@ -0,0 +1,6 @@
+ {"ts": "2025-10-02T15:02:53Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6211519837379456, "p50": 0.6424639821052551, "p90": 0.6726719737052917, "mean": 0.6559999942779541, "reps": 5, "warmup": 2}, "compile_ms": 4537.6962890625, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:55Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.665503978729248, "p50": 0.6812480092048645, "p90": 0.7109439969062805, "mean": 0.7009024024009705, "reps": 5, "warmup": 2}, "compile_ms": 1491.3409423828125, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:56Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8014079928398132, "p50": 0.8136320114135742, "p90": 0.8414080142974854, "mean": 0.8342463970184326, "reps": 5, "warmup": 2}, "compile_ms": 1269.2235107421875, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:58Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8420799970626831, "p50": 0.8514879941940308, "p90": 0.8752319812774658, "mean": 0.8708159923553467, "reps": 5, "warmup": 2}, "compile_ms": 1631.2921142578125, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:03:00Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8840640187263489, "p50": 0.8960639834403992, "p90": 0.9062719941139221, "mean": 0.9071423888206482, "reps": 5, "warmup": 2}, "compile_ms": 1919.3294677734375, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:03:02Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9141759872436523, "p50": 0.9165440201759338, "p90": 0.9380800127983093, "mean": 0.9373440027236939, "reps": 5, "warmup": 2}, "compile_ms": 1484.717529296875, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py ADDED
@@ -0,0 +1,68 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch",
+ #     "kernels-benchmark-tools",
+ #     "xformers",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+ # ///
+ import torch
+ import sys
+ import os
+ import kernels_benchmark_tools as kbt
+ import xformers.ops as xops
+
+
+ def xformers_attention(q, k, v):
+     """xFormers memory efficient attention"""
+     # xFormers expects [batch, seq_len, heads, head_dim]
+     return xops.memory_efficient_attention(q, k, v)
+
+
+ kbt.add(
+     "xformers_meff",
+     xformers_attention,
+     tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
+ )
+
+ if __name__ == "__main__":
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     dtype = "float32" if device == "cpu" else "bfloat16"
+
+     # Flux-like workloads
+     base = 1024 if device == "cuda" else 512
+     flux_sizes = (
+         [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+     )
+     heads = 24 if device == "cuda" else 8
+     head_dim = 128 if device == "cuda" else 64
+
+     wl = []
+     for L in flux_sizes:
+         wl.append(
+             {
+                 "name": f"flux_L{L}",
+                 "batch": 1,
+                 "seq_len": base + L,
+                 "heads": heads,
+                 "head_dim": head_dim,
+                 "dtype": dtype,
+                 "device": device,
+                 "seed": 0,
+             }
+         )
+
+     kbt.run(
+         wl,
+         jsonl="attn.jsonl",
+         reps=5,
+         warmup=2,
+         gen=kbt.attn.gen_qkv,
+         ref=kbt.attn.ref_math,
+         cmp=kbt.attn.cmp_allclose,
+     )
+     kbt.summarize(["attn.jsonl"])
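Each of these cell scripts carries inline script metadata (the "# /// script" block, PEP 723), so it can be run directly with "uv run cells/benchmark.py": uv resolves kernels-benchmark-tools from the pinned git branch without a pre-built project environment.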
flash_attn/impls/cells/benchmark_default.py ADDED
@@ -0,0 +1,70 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch",
+ #     "kernels-benchmark-tools",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+ # ///
+ import torch
+ import sys
+ import os
+ import kernels_benchmark_tools as kbt
+
+
+ def torch_flash_base(q, k, v):
+     qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+     with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+         o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+     return o.transpose(1, 2).contiguous()
+
+
+ # Compile with default mode
+ compiled_flash_default = torch.compile(torch_flash_base, mode="default", fullgraph=True, dynamic=False)
+
+ kbt.add(
+     "torch_flash_compiled_default",
+     compiled_flash_default,
+     tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "default"},
+ )
+
+ if __name__ == "__main__":
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     dtype = "float32" if device == "cpu" else "bfloat16"
+
+     # Flux-like workloads
+     base = 1024 if device == "cuda" else 512
+     flux_sizes = (
+         [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+     )
+     heads = 24 if device == "cuda" else 8
+     head_dim = 128 if device == "cuda" else 64
+
+     wl = []
+     for L in flux_sizes:
+         wl.append(
+             {
+                 "name": f"flux_L{L}",
+                 "batch": 1,
+                 "seq_len": base + L,
+                 "heads": heads,
+                 "head_dim": head_dim,
+                 "dtype": dtype,
+                 "device": device,
+                 "seed": 0,
+             }
+         )
+
+     kbt.run(
+         wl,
+         jsonl="attn_default.jsonl",
+         reps=5,
+         warmup=2,
+         gen=kbt.attn.gen_qkv,
+         ref=kbt.attn.ref_math,
+         cmp=kbt.attn.cmp_allclose,
+     )
+     kbt.summarize(["attn_default.jsonl"])
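This variant is identical to the xFormers cell above except for the kernel under test: the inputs are transposed to the [batch, heads, seq_len, head_dim] layout that scaled_dot_product_attention expects, pinned to the FLASH_ATTENTION backend via sdpa_kernel, and wrapped in torch.compile with mode="default", fullgraph=True, and dynamic=False.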
flash_attn/impls/cells/benchmark_max_autotune.py ADDED
@@ -0,0 +1,70 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch",
+ #     "kernels-benchmark-tools",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+ # ///
+ import torch
+ import sys
+ import os
+ import kernels_benchmark_tools as kbt
+
+
+ def torch_flash_base(q, k, v):
+     qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+     with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+         o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+     return o.transpose(1, 2).contiguous()
+
+
+ # Compile with max-autotune mode
+ compiled_flash_max_autotune = torch.compile(torch_flash_base, mode="max-autotune", fullgraph=True, dynamic=False)
+
+ kbt.add(
+     "torch_flash_compiled_max_autotune",
+     compiled_flash_max_autotune,
+     tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
+ )
+
+ if __name__ == "__main__":
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     dtype = "float32" if device == "cpu" else "bfloat16"
+
+     # Flux-like workloads
+     base = 1024 if device == "cuda" else 512
+     flux_sizes = (
+         [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+     )
+     heads = 24 if device == "cuda" else 8
+     head_dim = 128 if device == "cuda" else 64
+
+     wl = []
+     for L in flux_sizes:
+         wl.append(
+             {
+                 "name": f"flux_L{L}",
+                 "batch": 1,
+                 "seq_len": base + L,
+                 "heads": heads,
+                 "head_dim": head_dim,
+                 "dtype": dtype,
+                 "device": device,
+                 "seed": 0,
+             }
+         )
+
+     kbt.run(
+         wl,
+         jsonl="attn_max_autotune.jsonl",
+         reps=5,
+         warmup=2,
+         gen=kbt.attn.gen_qkv,
+         ref=kbt.attn.ref_math,
+         cmp=kbt.attn.cmp_allclose,
+     )
+     kbt.summarize(["attn_max_autotune.jsonl"])
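The max-autotune cell differs from the default cell only in the torch.compile mode, and the compile_ms column in the artifacts above shows what that costs here: the first compile takes about 3.3 s in default mode versus about 4.5 s with max-autotune, and each new seq_len triggers a further recompile (roughly 0.47 s per shape in default mode versus 1.3–1.9 s with max-autotune), presumably because dynamic=False specializes the graph per shape. Steady-state p50 latency is nonetheless slightly higher for max-autotune on these A10G workloads.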
flash_attn/impls/cells/nv.py ADDED
@@ -0,0 +1,3 @@
+ import subprocess
+
+ print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
flash_attn/impls/compiled_variants.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/flash_attention.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/hf_kernels_flash_attn.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/hf_kernels_flash_attn3.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/index.html ADDED
@@ -0,0 +1,30 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /flash_attn/impls</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='compiled_variants.html' class='file'>compiled_variants.html</a></li>
+ <li><a href='flash_attention.html' class='file'>flash_attention.html</a></li>
+ <li><a href='hf_kernels_flash_attn.html' class='file'>hf_kernels_flash_attn.html</a></li>
+ <li><a href='hf_kernels_flash_attn3.html' class='file'>hf_kernels_flash_attn3.html</a></li>
+ <li><a href='mem_efficient_attention.html' class='file'>mem_efficient_attention.html</a></li>
+ <li><a href='sage_attention.html' class='file'>sage_attention.html</a></li>
+ <li><a href='xformers.html' class='file'>xformers.html</a></li>
+ </ul>
+ </body>
+ </html>
flash_attn/impls/mem_efficient_attention.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/sage_attention.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/xformers.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/index.html CHANGED
@@ -18,7 +18,8 @@
  <h1>Index of /flash_attn</h1>
  <ul>
  <li><a href='../index.html' class='dir'>../</a></li>
- <li><a href='benchmark.html' class='file'>benchmark.html</a></li>
+ <li><a href='impls/index.html' class='dir'>impls/</a></li>
+ <li><a href='results/index.html' class='dir'>results/</a></li>
  </ul>
  </body>
  </html>
flash_attn/results/index.html ADDED
@@ -0,0 +1,24 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /flash_attn/results</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
+ </ul>
+ </body>
+ </html>
index.html CHANGED
@@ -18,7 +18,6 @@
  <h1>Index of /</h1>
  <ul>
  <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
- <li><a href='moe_benchmarks/index.html' class='dir'>moe_benchmarks/</a></li>
  </ul>
  </body>
  </html>