drbh (HF Staff) committed
Commit 0cce993 · verified · 1 parent: 6c5c584

Upload folder using huggingface_hub
flash_attn/impls/artifacts/benchmark/attn.jsonl ADDED
@@ -0,0 +1,6 @@
+ {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4453760087490082, "p50": 0.45241600275039673, "p90": 0.45257601141929626, "mean": 0.4501312017440796, "reps": 5, "warmup": 2}, "compile_ms": 1.8144960403442383, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4647679924964905, "p50": 0.46665599942207336, "p90": 0.47142401337623596, "mean": 0.46863360404968263, "reps": 5, "warmup": 2}, "compile_ms": 0.3614720106124878, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.597823977470398, "p50": 0.6007360219955444, "p90": 0.6015999913215637, "mean": 0.6010496020317078, "reps": 5, "warmup": 2}, "compile_ms": 0.4886080026626587, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6014080047607422, "p50": 0.6025919914245605, "p90": 0.6026239991188049, "mean": 0.6072191953659057, "reps": 5, "warmup": 2}, "compile_ms": 0.4956800043582916, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6389120221138, "p50": 0.6423360109329224, "p90": 0.6447039842605591, "mean": 0.6453696012496948, "reps": 5, "warmup": 2}, "compile_ms": 0.532256007194519, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6434879899024963, "p50": 0.6450560092926025, "p90": 0.6518719792366028, "mean": 0.6475072026252746, "reps": 5, "warmup": 2}, "compile_ms": 0.535040020942688, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl ADDED
@@ -0,0 +1,6 @@
+ {"ts": "2025-10-02T15:02:05Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5145599842071533, "p50": 0.5220479965209961, "p90": 0.5232319831848145, "mean": 0.5199103951454163, "reps": 5, "warmup": 2}, "compile_ms": 3343.085205078125, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:06Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5552319884300232, "p50": 0.5602560043334961, "p90": 0.5604159832000732, "mean": 0.5585088014602662, "reps": 5, "warmup": 2}, "compile_ms": 471.8746032714844, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:06Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6817600131034851, "p50": 0.6845120191574097, "p90": 0.6866880059242249, "mean": 0.6862144112586975, "reps": 5, "warmup": 2}, "compile_ms": 469.6441650390625, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:07Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7130560278892517, "p50": 0.7160000205039978, "p90": 0.7172480225563049, "mean": 0.7158400177955627, "reps": 5, "warmup": 2}, "compile_ms": 471.8545227050781, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:07Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7383360266685486, "p50": 0.746783971786499, "p90": 0.7520319819450378, "mean": 0.7461183905601502, "reps": 5, "warmup": 2}, "compile_ms": 473.72625732421875, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:08Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7711359858512878, "p50": 0.7734079957008362, "p90": 0.7748159766197205, "mean": 0.7733887910842896, "reps": 5, "warmup": 2}, "compile_ms": 476.75982666015625, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl ADDED
@@ -0,0 +1,6 @@
+ {"ts": "2025-10-02T15:02:53Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6211519837379456, "p50": 0.6424639821052551, "p90": 0.6726719737052917, "mean": 0.6559999942779541, "reps": 5, "warmup": 2}, "compile_ms": 4537.6962890625, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:55Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.665503978729248, "p50": 0.6812480092048645, "p90": 0.7109439969062805, "mean": 0.7009024024009705, "reps": 5, "warmup": 2}, "compile_ms": 1491.3409423828125, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:56Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8014079928398132, "p50": 0.8136320114135742, "p90": 0.8414080142974854, "mean": 0.8342463970184326, "reps": 5, "warmup": 2}, "compile_ms": 1269.2235107421875, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:02:58Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8420799970626831, "p50": 0.8514879941940308, "p90": 0.8752319812774658, "mean": 0.8708159923553467, "reps": 5, "warmup": 2}, "compile_ms": 1631.2921142578125, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:03:00Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8840640187263489, "p50": 0.8960639834403992, "p90": 0.9062719941139221, "mean": 0.9071423888206482, "reps": 5, "warmup": 2}, "compile_ms": 1919.3294677734375, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T15:03:02Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9141759872436523, "p50": 0.9165440201759338, "p90": 0.9380800127983093, "mean": 0.9373440027236939, "reps": 5, "warmup": 2}, "compile_ms": 1484.717529296875, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py ADDED
@@ -0,0 +1,68 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch",
+ #     "kernels-benchmark-tools",
+ #     "xformers",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+ # ///
+ import torch
+ import sys
+ import os
+ import kernels_benchmark_tools as kbt
+ import xformers.ops as xops
+
+
+ def xformers_attention(q, k, v):
+     """xFormers memory efficient attention"""
+     # xFormers expects [batch, seq_len, heads, head_dim]
+     return xops.memory_efficient_attention(q, k, v)
+
+
+ kbt.add(
+     "xformers_meff",
+     xformers_attention,
+     tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
+ )
+
+ if __name__ == "__main__":
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     dtype = "float32" if device == "cpu" else "bfloat16"
+
+     # Flux-like workloads
+     base = 1024 if device == "cuda" else 512
+     flux_sizes = (
+         [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+     )
+     heads = 24 if device == "cuda" else 8
+     head_dim = 128 if device == "cuda" else 64
+
+     wl = []
+     for L in flux_sizes:
+         wl.append(
+             {
+                 "name": f"flux_L{L}",
+                 "batch": 1,
+                 "seq_len": base + L,
+                 "heads": heads,
+                 "head_dim": head_dim,
+                 "dtype": dtype,
+                 "device": device,
+                 "seed": 0,
+             }
+         )
+
+     kbt.run(
+         wl,
+         jsonl="attn.jsonl",
+         reps=5,
+         warmup=2,
+         gen=kbt.attn.gen_qkv,
+         ref=kbt.attn.ref_math,
+         cmp=kbt.attn.cmp_allclose,
+     )
+     kbt.summarize(["attn.jsonl"])
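Each of these cell scripts carries inline script metadata (the "# /// script" block, PEP 723), so it can be run directly with "uv run cells/benchmark.py": uv resolves kernels-benchmark-tools from the pinned git branch without a pre-built project environment.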
flash_attn/impls/cells/benchmark_default.py ADDED
@@ -0,0 +1,70 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch",
+ #     "kernels-benchmark-tools",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+ # ///
+ import torch
+ import sys
+ import os
+ import kernels_benchmark_tools as kbt
+
+
+ def torch_flash_base(q, k, v):
+     qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+     with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+         o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+     return o.transpose(1, 2).contiguous()
+
+
+ # Compile with default mode
+ compiled_flash_default = torch.compile(torch_flash_base, mode="default", fullgraph=True, dynamic=False)
+
+ kbt.add(
+     "torch_flash_compiled_default",
+     compiled_flash_default,
+     tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "default"},
+ )
+
+ if __name__ == "__main__":
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     dtype = "float32" if device == "cpu" else "bfloat16"
+
+     # Flux-like workloads
+     base = 1024 if device == "cuda" else 512
+     flux_sizes = (
+         [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+     )
+     heads = 24 if device == "cuda" else 8
+     head_dim = 128 if device == "cuda" else 64
+
+     wl = []
+     for L in flux_sizes:
+         wl.append(
+             {
+                 "name": f"flux_L{L}",
+                 "batch": 1,
+                 "seq_len": base + L,
+                 "heads": heads,
+                 "head_dim": head_dim,
+                 "dtype": dtype,
+                 "device": device,
+                 "seed": 0,
+             }
+         )
+
+     kbt.run(
+         wl,
+         jsonl="attn_default.jsonl",
+         reps=5,
+         warmup=2,
+         gen=kbt.attn.gen_qkv,
+         ref=kbt.attn.ref_math,
+         cmp=kbt.attn.cmp_allclose,
+     )
+     kbt.summarize(["attn_default.jsonl"])
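This variant is identical to the xFormers cell above except for the kernel under test: the inputs are transposed to the [batch, heads, seq_len, head_dim] layout that scaled_dot_product_attention expects, pinned to the FLASH_ATTENTION backend via sdpa_kernel, and wrapped in torch.compile with mode="default", fullgraph=True, and dynamic=False.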
flash_attn/impls/cells/benchmark_max_autotune.py ADDED
@@ -0,0 +1,70 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch",
+ #     "kernels-benchmark-tools",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+ # ///
+ import torch
+ import sys
+ import os
+ import kernels_benchmark_tools as kbt
+
+
+ def torch_flash_base(q, k, v):
+     qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+     with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+         o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+     return o.transpose(1, 2).contiguous()
+
+
+ # Compile with max-autotune mode
+ compiled_flash_max_autotune = torch.compile(torch_flash_base, mode="max-autotune", fullgraph=True, dynamic=False)
+
+ kbt.add(
+     "torch_flash_compiled_max_autotune",
+     compiled_flash_max_autotune,
+     tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
+ )
+
+ if __name__ == "__main__":
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     dtype = "float32" if device == "cpu" else "bfloat16"
+
+     # Flux-like workloads
+     base = 1024 if device == "cuda" else 512
+     flux_sizes = (
+         [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+     )
+     heads = 24 if device == "cuda" else 8
+     head_dim = 128 if device == "cuda" else 64
+
+     wl = []
+     for L in flux_sizes:
+         wl.append(
+             {
+                 "name": f"flux_L{L}",
+                 "batch": 1,
+                 "seq_len": base + L,
+                 "heads": heads,
+                 "head_dim": head_dim,
+                 "dtype": dtype,
+                 "device": device,
+                 "seed": 0,
+             }
+         )
+
+     kbt.run(
+         wl,
+         jsonl="attn_max_autotune.jsonl",
+         reps=5,
+         warmup=2,
+         gen=kbt.attn.gen_qkv,
+         ref=kbt.attn.ref_math,
+         cmp=kbt.attn.cmp_allclose,
+     )
+     kbt.summarize(["attn_max_autotune.jsonl"])
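The max-autotune cell differs from the default cell only in the torch.compile mode, and the compile_ms column in the artifacts above shows what that costs here: the first compile takes about 3.3 s in default mode versus about 4.5 s with max-autotune, and each new seq_len triggers a further recompile (roughly 0.47 s per shape in default mode versus 1.3–1.9 s with max-autotune), presumably because dynamic=False specializes the graph per shape. Steady-state p50 latency is nonetheless slightly higher for max-autotune on these A10G workloads.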
flash_attn/impls/cells/nv.py ADDED
@@ -0,0 +1,3 @@
+ import subprocess
+
+ print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
flash_attn/impls/compiled_variants.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/flash_attention.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/hf_kernels_flash_attn.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/hf_kernels_flash_attn3.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/index.html ADDED
@@ -0,0 +1,30 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /flash_attn/impls</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='compiled_variants.html' class='file'>compiled_variants.html</a></li>
+ <li><a href='flash_attention.html' class='file'>flash_attention.html</a></li>
+ <li><a href='hf_kernels_flash_attn.html' class='file'>hf_kernels_flash_attn.html</a></li>
+ <li><a href='hf_kernels_flash_attn3.html' class='file'>hf_kernels_flash_attn3.html</a></li>
+ <li><a href='mem_efficient_attention.html' class='file'>mem_efficient_attention.html</a></li>
+ <li><a href='sage_attention.html' class='file'>sage_attention.html</a></li>
+ <li><a href='xformers.html' class='file'>xformers.html</a></li>
+ </ul>
+ </body>
+ </html>
flash_attn/impls/mem_efficient_attention.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/sage_attention.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/xformers.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/index.html CHANGED
@@ -18,7 +18,8 @@
  <h1>Index of /flash_attn</h1>
  <ul>
  <li><a href='../index.html' class='dir'>../</a></li>
- <li><a href='benchmark.html' class='file'>benchmark.html</a></li>
+ <li><a href='impls/index.html' class='dir'>impls/</a></li>
+ <li><a href='results/index.html' class='dir'>results/</a></li>
  </ul>
  </body>
  </html>
flash_attn/results/index.html ADDED
@@ -0,0 +1,24 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <title>Directory Index</title>
+ <style>
+ body { font-family: monospace; margin: 20px; }
+ h1 { font-size: 1.5em; }
+ ul { list-style-type: none; padding-left: 20px; }
+ li { margin: 5px 0; }
+ .dir { font-weight: bold; }
+ .file { color: #0066cc; }
+ a { text-decoration: none; }
+ a:hover { text-decoration: underline; }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /flash_attn/results</h1>
+ <ul>
+ <li><a href='../index.html' class='dir'>../</a></li>
+ <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
+ </ul>
+ </body>
+ </html>
index.html CHANGED
@@ -18,7 +18,6 @@
  <h1>Index of /</h1>
  <ul>
  <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
- <li><a href='moe_benchmarks/index.html' class='dir'>moe_benchmarks/</a></li>
  </ul>
  </body>
  </html>