Upload folder using huggingface_hub
- flash_attn/impls/artifacts/benchmark/attn.jsonl +6 -0
- flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl +6 -0
- flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl +6 -0
- flash_attn/impls/cells/benchmark.py +68 -0
- flash_attn/impls/cells/benchmark_default.py +70 -0
- flash_attn/impls/cells/benchmark_max_autotune.py +70 -0
- flash_attn/impls/cells/nv.py +3 -0
- flash_attn/impls/compiled_variants.html +0 -0
- flash_attn/impls/flash_attention.html +0 -0
- flash_attn/impls/hf_kernels_flash_attn.html +0 -0
- flash_attn/impls/hf_kernels_flash_attn3.html +0 -0
- flash_attn/impls/index.html +30 -0
- flash_attn/impls/mem_efficient_attention.html +0 -0
- flash_attn/impls/sage_attention.html +0 -0
- flash_attn/impls/xformers.html +0 -0
- flash_attn/index.html +2 -1
- flash_attn/results/index.html +24 -0
- index.html +0 -1
flash_attn/impls/artifacts/benchmark/attn.jsonl
ADDED
@@ -0,0 +1,6 @@
{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4453760087490082, "p50": 0.45241600275039673, "p90": 0.45257601141929626, "mean": 0.4501312017440796, "reps": 5, "warmup": 2}, "compile_ms": 1.8144960403442383, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4647679924964905, "p50": 0.46665599942207336, "p90": 0.47142401337623596, "mean": 0.46863360404968263, "reps": 5, "warmup": 2}, "compile_ms": 0.3614720106124878, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.597823977470398, "p50": 0.6007360219955444, "p90": 0.6015999913215637, "mean": 0.6010496020317078, "reps": 5, "warmup": 2}, "compile_ms": 0.4886080026626587, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6014080047607422, "p50": 0.6025919914245605, "p90": 0.6026239991188049, "mean": 0.6072191953659057, "reps": 5, "warmup": 2}, "compile_ms": 0.4956800043582916, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6389120221138, "p50": 0.6423360109329224, "p90": 0.6447039842605591, "mean": 0.6453696012496948, "reps": 5, "warmup": 2}, "compile_ms": 0.532256007194519, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6434879899024963, "p50": 0.6450560092926025, "p90": 0.6518719792366028, "mean": 0.6475072026252746, "reps": 5, "warmup": 2}, "compile_ms": 0.535040020942688, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl
ADDED
@@ -0,0 +1,6 @@
{"ts": "2025-10-02T15:02:05Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5145599842071533, "p50": 0.5220479965209961, "p90": 0.5232319831848145, "mean": 0.5199103951454163, "reps": 5, "warmup": 2}, "compile_ms": 3343.085205078125, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:02:06Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5552319884300232, "p50": 0.5602560043334961, "p90": 0.5604159832000732, "mean": 0.5585088014602662, "reps": 5, "warmup": 2}, "compile_ms": 471.8746032714844, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:02:06Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6817600131034851, "p50": 0.6845120191574097, "p90": 0.6866880059242249, "mean": 0.6862144112586975, "reps": 5, "warmup": 2}, "compile_ms": 469.6441650390625, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:02:07Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7130560278892517, "p50": 0.7160000205039978, "p90": 0.7172480225563049, "mean": 0.7158400177955627, "reps": 5, "warmup": 2}, "compile_ms": 471.8545227050781, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:02:07Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7383360266685486, "p50": 0.746783971786499, "p90": 0.7520319819450378, "mean": 0.7461183905601502, "reps": 5, "warmup": 2}, "compile_ms": 473.72625732421875, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:02:08Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7711359858512878, "p50": 0.7734079957008362, "p90": 0.7748159766197205, "mean": 0.7733887910842896, "reps": 5, "warmup": 2}, "compile_ms": 476.75982666015625, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl
ADDED
@@ -0,0 +1,6 @@
{"ts": "2025-10-02T15:02:53Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6211519837379456, "p50": 0.6424639821052551, "p90": 0.6726719737052917, "mean": 0.6559999942779541, "reps": 5, "warmup": 2}, "compile_ms": 4537.6962890625, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:02:55Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.665503978729248, "p50": 0.6812480092048645, "p90": 0.7109439969062805, "mean": 0.7009024024009705, "reps": 5, "warmup": 2}, "compile_ms": 1491.3409423828125, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:02:56Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8014079928398132, "p50": 0.8136320114135742, "p90": 0.8414080142974854, "mean": 0.8342463970184326, "reps": 5, "warmup": 2}, "compile_ms": 1269.2235107421875, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:02:58Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8420799970626831, "p50": 0.8514879941940308, "p90": 0.8752319812774658, "mean": 0.8708159923553467, "reps": 5, "warmup": 2}, "compile_ms": 1631.2921142578125, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:03:00Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8840640187263489, "p50": 0.8960639834403992, "p90": 0.9062719941139221, "mean": 0.9071423888206482, "reps": 5, "warmup": 2}, "compile_ms": 1919.3294677734375, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-02T15:03:02Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9141759872436523, "p50": 0.9165440201759338, "p90": 0.9380800127983093, "mean": 0.9373440027236939, "reps": 5, "warmup": 2}, "compile_ms": 1484.717529296875, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py
ADDED
@@ -0,0 +1,68 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch",
#     "kernels-benchmark-tools",
#     "xformers",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
# ///
import torch
import sys
import os
import kernels_benchmark_tools as kbt
import xformers.ops as xops


def xformers_attention(q, k, v):
    """xFormers memory efficient attention"""
    # xFormers expects [batch, seq_len, heads, head_dim]
    return xops.memory_efficient_attention(q, k, v)


kbt.add(
    "xformers_meff",
    xformers_attention,
    tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
)

if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = "float32" if device == "cpu" else "bfloat16"

    # Flux-like workloads
    base = 1024 if device == "cuda" else 512
    flux_sizes = (
        [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
    )
    heads = 24 if device == "cuda" else 8
    head_dim = 128 if device == "cuda" else 64

    wl = []
    for L in flux_sizes:
        wl.append(
            {
                "name": f"flux_L{L}",
                "batch": 1,
                "seq_len": base + L,
                "heads": heads,
                "head_dim": head_dim,
                "dtype": dtype,
                "device": device,
                "seed": 0,
            }
        )

    kbt.run(
        wl,
        jsonl="attn.jsonl",
        reps=5,
        warmup=2,
        gen=kbt.attn.gen_qkv,
        ref=kbt.attn.ref_math,
        cmp=kbt.attn.cmp_allclose,
    )
    kbt.summarize(["attn.jsonl"])
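The # /// script header is PEP 723 inline metadata: the file declares its own dependencies, including kernels-benchmark-tools pinned to a git source, so it can be executed directly with a PEP 723-aware runner such as uv (uv run benchmark.py) without a separate requirements file. The two compiled variants below follow the same pattern.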
flash_attn/impls/cells/benchmark_default.py
ADDED
@@ -0,0 +1,70 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch",
#     "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
# ///
import torch
import sys
import os
import kernels_benchmark_tools as kbt


def torch_flash_base(q, k, v):
    qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
    with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
        o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
    return o.transpose(1, 2).contiguous()


# Compile with default mode
compiled_flash_default = torch.compile(torch_flash_base, mode="default", fullgraph=True, dynamic=False)

kbt.add(
    "torch_flash_compiled_default",
    compiled_flash_default,
    tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "default"},
)

if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = "float32" if device == "cpu" else "bfloat16"

    # Flux-like workloads
    base = 1024 if device == "cuda" else 512
    flux_sizes = (
        [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
    )
    heads = 24 if device == "cuda" else 8
    head_dim = 128 if device == "cuda" else 64

    wl = []
    for L in flux_sizes:
        wl.append(
            {
                "name": f"flux_L{L}",
                "batch": 1,
                "seq_len": base + L,
                "heads": heads,
                "head_dim": head_dim,
                "dtype": dtype,
                "device": device,
                "seed": 0,
            }
        )

    kbt.run(
        wl,
        jsonl="attn_default.jsonl",
        reps=5,
        warmup=2,
        gen=kbt.attn.gen_qkv,
        ref=kbt.attn.ref_math,
        cmp=kbt.attn.cmp_allclose,
    )
    kbt.summarize(["attn_default.jsonl"])
flash_attn/impls/cells/benchmark_max_autotune.py
ADDED
@@ -0,0 +1,70 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch",
#     "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
# ///
import torch
import sys
import os
import kernels_benchmark_tools as kbt


def torch_flash_base(q, k, v):
    qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
    with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
        o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
    return o.transpose(1, 2).contiguous()


# Compile with max-autotune mode
compiled_flash_max_autotune = torch.compile(torch_flash_base, mode="max-autotune", fullgraph=True, dynamic=False)

kbt.add(
    "torch_flash_compiled_max_autotune",
    compiled_flash_max_autotune,
    tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
)

if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = "float32" if device == "cpu" else "bfloat16"

    # Flux-like workloads
    base = 1024 if device == "cuda" else 512
    flux_sizes = (
        [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
    )
    heads = 24 if device == "cuda" else 8
    head_dim = 128 if device == "cuda" else 64

    wl = []
    for L in flux_sizes:
        wl.append(
            {
                "name": f"flux_L{L}",
                "batch": 1,
                "seq_len": base + L,
                "heads": heads,
                "head_dim": head_dim,
                "dtype": dtype,
                "device": device,
                "seed": 0,
            }
        )

    kbt.run(
        wl,
        jsonl="attn_max_autotune.jsonl",
        reps=5,
        warmup=2,
        gen=kbt.attn.gen_qkv,
        ref=kbt.attn.ref_math,
        cmp=kbt.attn.cmp_allclose,
    )
    kbt.summarize(["attn_max_autotune.jsonl"])
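The two compiled variants differ only in the mode passed to torch.compile. In the records above, max-autotune pays a noticeably larger one-time compile cost (roughly 4.5s vs 3.3s on the first workload, with recompilation on each new shape since dynamic=False) and reports lower peak_bytes, but its steady-state p50 is higher than the default-mode build on every flux workload here, so the extra autotuning does not pay off at these sizes on this A10G.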
flash_attn/impls/cells/nv.py
ADDED
@@ -0,0 +1,3 @@
import subprocess

print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
flash_attn/impls/compiled_variants.html
ADDED
The diff for this file is too large to render.
flash_attn/impls/flash_attention.html
ADDED
The diff for this file is too large to render.
flash_attn/impls/hf_kernels_flash_attn.html
ADDED
The diff for this file is too large to render.
flash_attn/impls/hf_kernels_flash_attn3.html
ADDED
The diff for this file is too large to render.
flash_attn/impls/index.html
ADDED
@@ -0,0 +1,30 @@
<!DOCTYPE html>
<html>
<head>
    <meta charset='UTF-8'>
    <title>Directory Index</title>
    <style>
        body { font-family: monospace; margin: 20px; }
        h1 { font-size: 1.5em; }
        ul { list-style-type: none; padding-left: 20px; }
        li { margin: 5px 0; }
        .dir { font-weight: bold; }
        .file { color: #0066cc; }
        a { text-decoration: none; }
        a:hover { text-decoration: underline; }
    </style>
</head>
<body>
    <h1>Index of /flash_attn/impls</h1>
    <ul>
        <li><a href='../index.html' class='dir'>../</a></li>
        <li><a href='compiled_variants.html' class='file'>compiled_variants.html</a></li>
        <li><a href='flash_attention.html' class='file'>flash_attention.html</a></li>
        <li><a href='hf_kernels_flash_attn.html' class='file'>hf_kernels_flash_attn.html</a></li>
        <li><a href='hf_kernels_flash_attn3.html' class='file'>hf_kernels_flash_attn3.html</a></li>
        <li><a href='mem_efficient_attention.html' class='file'>mem_efficient_attention.html</a></li>
        <li><a href='sage_attention.html' class='file'>sage_attention.html</a></li>
        <li><a href='xformers.html' class='file'>xformers.html</a></li>
    </ul>
</body>
</html>
flash_attn/impls/mem_efficient_attention.html
ADDED
The diff for this file is too large to render.
flash_attn/impls/sage_attention.html
ADDED
The diff for this file is too large to render.
flash_attn/impls/xformers.html
ADDED
The diff for this file is too large to render.
flash_attn/index.html
CHANGED
@@ -18,7 +18,8 @@
 <h1>Index of /flash_attn</h1>
 <ul>
 <li><a href='../index.html' class='dir'>../</a></li>
-<li><a href='
+<li><a href='impls/index.html' class='dir'>impls/</a></li>
+<li><a href='results/index.html' class='dir'>results/</a></li>
 </ul>
 </body>
 </html>
flash_attn/results/index.html
ADDED
@@ -0,0 +1,24 @@
<!DOCTYPE html>
<html>
<head>
    <meta charset='UTF-8'>
    <title>Directory Index</title>
    <style>
        body { font-family: monospace; margin: 20px; }
        h1 { font-size: 1.5em; }
        ul { list-style-type: none; padding-left: 20px; }
        li { margin: 5px 0; }
        .dir { font-weight: bold; }
        .file { color: #0066cc; }
        a { text-decoration: none; }
        a:hover { text-decoration: underline; }
    </style>
</head>
<body>
    <h1>Index of /flash_attn/results</h1>
    <ul>
        <li><a href='../index.html' class='dir'>../</a></li>
        <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
    </ul>
</body>
</html>
index.html
CHANGED
@@ -18,7 +18,6 @@
 <h1>Index of /</h1>
 <ul>
 <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
-<li><a href='moe_benchmarks/index.html' class='dir'>moe_benchmarks/</a></li>
 </ul>
 </body>
 </html>