Upload folder using huggingface_hub
- flash_attn/impls/artifacts/benchmark/attn.jsonl +6 -0
- flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl +6 -0
- flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl +6 -0
- flash_attn/impls/cells/benchmark.py +72 -0
- flash_attn/impls/cells/benchmark_default.py +70 -0
- flash_attn/impls/cells/benchmark_max_autotune.py +70 -0
- flash_attn/impls/cells/nv.py +3 -0
- flash_attn/impls/compiled_variants.html +0 -0
- flash_attn/impls/flash_attention.html +0 -0
- flash_attn/impls/hf_kernels_flash_attn.html +0 -0
- flash_attn/impls/hf_kernels_flash_attn3.html +0 -0
- flash_attn/impls/index.html +94 -0
- flash_attn/impls/mem_efficient_attention.html +0 -0
- flash_attn/impls/sage_attention.html +0 -0
- flash_attn/impls/xformers.html +0 -0
- flash_attn/index.html +89 -0
- flash_attn/results/artifacts/combine/latency.csv +43 -0
- flash_attn/results/artifacts/combine/latency.svg +3 -0
- flash_attn/results/cells/combine.py +319 -0
- flash_attn/results/cells/csv_export.py +76 -0
- flash_attn/results/combined_results.html +0 -0
- flash_attn/results/index.html +88 -0
- index.html +85 -0
flash_attn/impls/artifacts/benchmark/attn.jsonl
ADDED
@@ -0,0 +1,6 @@
+{"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.3389439880847931, "p50": 0.3461120128631592, "p90": 0.3461120128631592, "mean": 0.3452928066253662, "reps": 5, "warmup": 2}, "compile_ms": 0.9463679790496826, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000362396240234375, "mse": 2.9206275939941406e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.40959998965263367, "p50": 0.41280001401901245, "p90": 0.41286399960517883, "mean": 0.41234560012817384, "reps": 5, "warmup": 2}, "compile_ms": 0.34329599142074585, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4310399889945984, "p50": 0.4331519901752472, "p90": 0.4362240135669708, "mean": 0.4366208016872406, "reps": 5, "warmup": 2}, "compile_ms": 0.35942399501800537, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4359680116176605, "p50": 0.44361600279808044, "p90": 0.447488009929657, "mean": 0.4450624048709869, "reps": 5, "warmup": 2}, "compile_ms": 0.3678080141544342, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4711039960384369, "p50": 0.47513601183891296, "p90": 0.4763199985027313, "mean": 0.4750400006771088, "reps": 5, "warmup": 2}, "compile_ms": 0.40857601165771484, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.49663999676704407, "p50": 0.4997119903564453, "p90": 0.5038080215454102, "mean": 0.5009407997131348, "reps": 5, "warmup": 2}, "compile_ms": 0.43724799156188965, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl
ADDED
@@ -0,0 +1,6 @@
+{"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.3563520014286041, "p50": 0.35942399501800537, "p90": 0.3624959886074066, "mean": 0.3856383919715881, "reps": 5, "warmup": 2}, "compile_ms": 2383.33544921875, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4926080107688904, "p50": 0.49663999676704407, "p90": 0.5017600059509277, "mean": 0.4982912003993988, "reps": 5, "warmup": 2}, "compile_ms": 76.60860443115234, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5335040092468262, "p50": 0.5366079807281494, "p90": 0.5386239886283875, "mean": 0.5369919896125793, "reps": 5, "warmup": 2}, "compile_ms": 74.49088287353516, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5775359869003296, "p50": 0.5868800282478333, "p90": 0.5877760052680969, "mean": 0.5841408014297486, "reps": 5, "warmup": 2}, "compile_ms": 72.97433471679688, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:11:56Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6072319746017456, "p50": 0.6113280057907104, "p90": 0.6144000291824341, "mean": 0.6184704065322876, "reps": 5, "warmup": 2}, "compile_ms": 215.12498474121094, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:11:56Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6399999856948853, "p50": 0.6430720090866089, "p90": 0.6430720090866089, "mean": 0.6428672075271606, "reps": 5, "warmup": 2}, "compile_ms": 71.8028793334961, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl
ADDED
@@ -0,0 +1,6 @@
+{"ts": "2025-10-02T16:11:08Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.3665919899940491, "p50": 0.3768320083618164, "p90": 0.41171199083328247, "mean": 0.40020479559898375, "reps": 5, "warmup": 2}, "compile_ms": 2910.97705078125, "peak_bytes": 85722112, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:11:08Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5160959959030151, "p50": 0.5489599704742432, "p90": 0.5631359815597534, "mean": 0.5535807967185974, "reps": 5, "warmup": 2}, "compile_ms": 85.84806060791016, "peak_bytes": 97387520, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:11:09Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.562175989151001, "p50": 0.6144000291824341, "p90": 0.6318079829216003, "mean": 0.6143999934196472, "reps": 5, "warmup": 2}, "compile_ms": 82.77401733398438, "peak_bytes": 99746816, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:11:09Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6512640118598938, "p50": 0.6584320068359375, "p90": 0.6799359917640686, "mean": 0.6754495978355408, "reps": 5, "warmup": 2}, "compile_ms": 81.94969940185547, "peak_bytes": 101843968, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:11:09Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6973119974136353, "p50": 0.7014080286026001, "p90": 0.7229440212249756, "mean": 0.7210752129554748, "reps": 5, "warmup": 2}, "compile_ms": 81.1141128540039, "peak_bytes": 103810048, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T16:11:10Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.7485439777374268, "p50": 0.7557439804077148, "p90": 0.7710719704627991, "mean": 0.7735359907150269, "reps": 5, "warmup": 2}, "compile_ms": 767.1397094726562, "peak_bytes": 106562560, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
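
The three JSONL artifacts above share one record schema ("impl", "tags", "wl", "env", "lat_ms", "compile_ms", "peak_bytes", "corr"). As a minimal sketch of how one of these files can be consumed downstream (the helper name and the choice of fields are illustrative, not part of this upload):

import json
from pathlib import Path

def load_latency_table(path: str):
    """Read one benchmark JSONL file and return (workload, impl, p50 ms, peak MB) tuples."""
    rows = []
    for line in Path(path).read_text().splitlines():
        rec = json.loads(line)
        rows.append((
            rec["wl"]["name"],                  # e.g. "flux_L128"
            rec["impl"],                        # e.g. "xformers_meff"
            rec["lat_ms"]["p50"],               # median latency in milliseconds
            rec["peak_bytes"] / (1024 * 1024),  # peak memory in MB
        ))
    return rows

# Example: print the median latency per workload for one artifact file.
for wl, impl, p50, peak_mb in load_latency_table("flash_attn/impls/artifacts/benchmark/attn.jsonl"):
    print(f"{impl:15s} {wl:10s} p50={p50:.3f} ms peak={peak_mb:.1f} MB")
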
flash_attn/impls/cells/benchmark.py
ADDED
@@ -0,0 +1,72 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#   "numpy",
+#   "torch",
+#   "kernels-benchmark-tools",
+#   "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+from kernels import get_kernel
+
+hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn", revision="v0.0.2")
+
+
+def hf_flash_attention(query, key, value):
+    """HuggingFace Kernels Flash Attention"""
+    return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0]
+
+
+kbt.add(
+    "hf_kernels_flash_attn",
+    hf_flash_attention,
+    tags={"family": "hf-kernels", "backend": "flash-attn", "compile": "none"},
+)
+
+if __name__ == "__main__":
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    if device == "cpu":
+        print("HF Kernels Flash Attention requires CUDA - skipping benchmark")
+        sys.exit(0)
+
+    dtype = "bfloat16"
+
+    # Flux-like workloads
+    base = 1024
+    flux_sizes = [128, 256, 320, 384, 448, 512]
+    heads = 24
+    head_dim = 128
+
+    wl = []
+    for L in flux_sizes:
+        wl.append(
+            {
+                "name": f"flux_L{L}",
+                "batch": 1,
+                "seq_len": base + L,
+                "heads": heads,
+                "head_dim": head_dim,
+                "dtype": dtype,
+                "device": device,
+                "seed": 0,
+            }
+        )
+
+    kbt.run(
+        wl,
+        jsonl="attn.jsonl",
+        reps=5,
+        warmup=2,
+        gen=kbt.attn.gen_qkv,
+        ref=kbt.attn.ref_math,
+        cmp=kbt.attn.cmp_allclose,
+    )
+    kbt.summarize(["attn.jsonl"])
flash_attn/impls/cells/benchmark_default.py
ADDED
@@ -0,0 +1,70 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#   "numpy",
+#   "torch",
+#   "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+
+
+def torch_flash_base(q, k, v):
+    qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+    with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+        o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+    return o.transpose(1, 2).contiguous()
+
+
+# Compile with default mode
+compiled_flash_default = torch.compile(torch_flash_base, mode="default", fullgraph=True, dynamic=False)
+
+kbt.add(
+    "torch_flash_compiled_default",
+    compiled_flash_default,
+    tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "default"},
+)
+
+if __name__ == "__main__":
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = "float32" if device == "cpu" else "bfloat16"
+
+    # Flux-like workloads
+    base = 1024 if device == "cuda" else 512
+    flux_sizes = (
+        [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+    )
+    heads = 24 if device == "cuda" else 8
+    head_dim = 128 if device == "cuda" else 64
+
+    wl = []
+    for L in flux_sizes:
+        wl.append(
+            {
+                "name": f"flux_L{L}",
+                "batch": 1,
+                "seq_len": base + L,
+                "heads": heads,
+                "head_dim": head_dim,
+                "dtype": dtype,
+                "device": device,
+                "seed": 0,
+            }
+        )
+
+    kbt.run(
+        wl,
+        jsonl="attn_default.jsonl",
+        reps=5,
+        warmup=2,
+        gen=kbt.attn.gen_qkv,
+        ref=kbt.attn.ref_math,
+        cmp=kbt.attn.cmp_allclose,
+    )
+    kbt.summarize(["attn_default.jsonl"])
flash_attn/impls/cells/benchmark_max_autotune.py
ADDED
@@ -0,0 +1,70 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#   "numpy",
+#   "torch",
+#   "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+
+
+def torch_flash_base(q, k, v):
+    qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+    with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+        o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+    return o.transpose(1, 2).contiguous()
+
+
+# Compile with max-autotune mode
+compiled_flash_max_autotune = torch.compile(torch_flash_base, mode="max-autotune", fullgraph=True, dynamic=False)
+
+kbt.add(
+    "torch_flash_compiled_max_autotune",
+    compiled_flash_max_autotune,
+    tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
+)
+
+if __name__ == "__main__":
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = "float32" if device == "cpu" else "bfloat16"
+
+    # Flux-like workloads
+    base = 1024 if device == "cuda" else 512
+    flux_sizes = (
+        [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+    )
+    heads = 24 if device == "cuda" else 8
+    head_dim = 128 if device == "cuda" else 64
+
+    wl = []
+    for L in flux_sizes:
+        wl.append(
+            {
+                "name": f"flux_L{L}",
+                "batch": 1,
+                "seq_len": base + L,
+                "heads": heads,
+                "head_dim": head_dim,
+                "dtype": dtype,
+                "device": device,
+                "seed": 0,
+            }
+        )
+
+    kbt.run(
+        wl,
+        jsonl="attn_max_autotune.jsonl",
+        reps=5,
+        warmup=2,
+        gen=kbt.attn.gen_qkv,
+        ref=kbt.attn.ref_math,
+        cmp=kbt.attn.cmp_allclose,
+    )
+    kbt.summarize(["attn_max_autotune.jsonl"])
flash_attn/impls/cells/nv.py
ADDED
@@ -0,0 +1,3 @@
+import subprocess
+
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
flash_attn/impls/compiled_variants.html
ADDED
The diff for this file is too large to render. See raw diff.
flash_attn/impls/flash_attention.html
ADDED
The diff for this file is too large to render. See raw diff.
flash_attn/impls/hf_kernels_flash_attn.html
ADDED
The diff for this file is too large to render. See raw diff.
flash_attn/impls/hf_kernels_flash_attn3.html
ADDED
The diff for this file is too large to render. See raw diff.
flash_attn/impls/index.html
ADDED
@@ -0,0 +1,94 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset='UTF-8'>
+<meta name='viewport' content='width=device-width, initial-scale=1.0'>
+<title>Index of /flash_attn/impls</title>
+<style>
+:root {
+    --bg-primary: #0a0a0a;
+    --bg-secondary: #121212;
+    --bg-tertiary: #181818;
+    --text-primary: #e0e0e0;
+    --text-secondary: #888888;
+    --text-link: #64b5f6;
+    --border-primary: #2a2a2a;
+}
+body {
+    font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+    background: var(--bg-primary);
+    color: var(--text-primary);
+    margin: 0;
+    padding: 16px;
+    max-width: 900px;
+    margin: 0 auto;
+}
+.controls {
+    display: flex;
+    justify-content: flex-end;
+    margin-bottom: 1rem;
+}
+.back-button {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 4px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.9rem;
+    text-decoration: none;
+    display: inline-block;
+}
+.back-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+h1 {
+    font-size: 1.5em;
+    margin: 1rem 0;
+    color: var(--text-primary);
+    border-bottom: 1px solid var(--border-primary);
+    padding-bottom: 0.5rem;
+}
+ul {
+    list-style-type: none;
+    padding: 0;
+}
+li {
+    margin: 0;
+    border-bottom: 1px solid var(--border-primary);
+}
+li:last-child {
+    border-bottom: none;
+}
+a {
+    display: block;
+    padding: 0.75rem 0.5rem;
+    text-decoration: none;
+    color: var(--text-link);
+    transition: background 0.2s ease;
+}
+a:hover {
+    background: var(--bg-secondary);
+}
+.dir {
+    font-weight: 500;
+}
+</style>
+</head>
+<body>
+<div class='controls'>
+    <a href='../index.html' class='back-button'>← back</a>
+</div>
+<h1>Index of /flash_attn/impls</h1>
+<ul>
+<li><a href='compiled_variants.html' class='file'>compiled_variants.html</a></li>
+<li><a href='flash_attention.html' class='file'>flash_attention.html</a></li>
+<li><a href='hf_kernels_flash_attn.html' class='file'>hf_kernels_flash_attn.html</a></li>
+<li><a href='hf_kernels_flash_attn3.html' class='file'>hf_kernels_flash_attn3.html</a></li>
+<li><a href='mem_efficient_attention.html' class='file'>mem_efficient_attention.html</a></li>
+<li><a href='sage_attention.html' class='file'>sage_attention.html</a></li>
+<li><a href='xformers.html' class='file'>xformers.html</a></li>
+</ul>
+</body>
+</html>
flash_attn/impls/mem_efficient_attention.html
ADDED
The diff for this file is too large to render. See raw diff.
flash_attn/impls/sage_attention.html
ADDED
The diff for this file is too large to render. See raw diff.
flash_attn/impls/xformers.html
ADDED
The diff for this file is too large to render. See raw diff.
flash_attn/index.html
ADDED
@@ -0,0 +1,89 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset='UTF-8'>
+<meta name='viewport' content='width=device-width, initial-scale=1.0'>
+<title>Index of /flash_attn</title>
+<style>
+:root {
+    --bg-primary: #0a0a0a;
+    --bg-secondary: #121212;
+    --bg-tertiary: #181818;
+    --text-primary: #e0e0e0;
+    --text-secondary: #888888;
+    --text-link: #64b5f6;
+    --border-primary: #2a2a2a;
+}
+body {
+    font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+    background: var(--bg-primary);
+    color: var(--text-primary);
+    margin: 0;
+    padding: 16px;
+    max-width: 900px;
+    margin: 0 auto;
+}
+.controls {
+    display: flex;
+    justify-content: flex-end;
+    margin-bottom: 1rem;
+}
+.back-button {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-primary);
+    padding: 8px 12px;
+    border-radius: 4px;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 0.9rem;
+    text-decoration: none;
+    display: inline-block;
+}
+.back-button:hover {
+    color: var(--text-primary);
+    background: var(--bg-tertiary);
+}
+h1 {
+    font-size: 1.5em;
+    margin: 1rem 0;
+    color: var(--text-primary);
+    border-bottom: 1px solid var(--border-primary);
+    padding-bottom: 0.5rem;
+}
+ul {
+    list-style-type: none;
+    padding: 0;
+}
+li {
+    margin: 0;
+    border-bottom: 1px solid var(--border-primary);
+}
+li:last-child {
+    border-bottom: none;
+}
+a {
+    display: block;
+    padding: 0.75rem 0.5rem;
+    text-decoration: none;
+    color: var(--text-link);
+    transition: background 0.2s ease;
+}
+a:hover {
+    background: var(--bg-secondary);
+}
+.dir {
+    font-weight: 500;
+}
+</style>
+</head>
+<body>
+<div class='controls'>
+    <a href='../index.html' class='back-button'>← back</a>
+</div>
+<h1>Index of /flash_attn</h1>
+<ul>
+<li><a href='impls/index.html' class='dir'>impls/</a></li>
+<li><a href='results/index.html' class='dir'>results/</a></li>
+</ul>
+</body>
+</html>
flash_attn/results/artifacts/combine/latency.csv
ADDED
@@ -0,0 +1,43 @@
+Implementation,Impl ID,Workload,Batch,Seq Length,Heads,Head Dim,Dtype,Mean (ms),P10 (ms),P50 (ms),P90 (ms),Reps,Peak Mem (MB),Backend,Family
+Flash (PyTorch SDPA),torch_flash_ma,flux_L128,1,1152,24,128,bfloat16,0.407123202085495,0.40537598729133606,0.40755200386047363,0.407584011554718,5,83.38,FLASH,torch-sdpa
+Flash (PyTorch SDPA),torch_flash_ma,flux_L256,1,1280,24,128,bfloat16,0.5235007882118226,0.5212159752845764,0.5232639908790588,0.523360013961792,5,90.62,FLASH,torch-sdpa
+Flash (PyTorch SDPA),torch_flash_ma,flux_L320,1,1344,24,128,bfloat16,0.545849597454071,0.5418559908866882,0.5468159914016724,0.5469120144844055,5,95.06,FLASH,torch-sdpa
+Flash (PyTorch SDPA),torch_flash_ma,flux_L384,1,1408,24,128,bfloat16,0.5892416119575501,0.5867519974708557,0.5888000130653381,0.5888000130653381,5,99.88,FLASH,torch-sdpa
+Flash (PyTorch SDPA),torch_flash_ma,flux_L448,1,1472,24,128,bfloat16,0.6449280023574829,0.6430720090866089,0.6442239880561829,0.6450240015983582,5,103.81,FLASH,torch-sdpa
+Flash (PyTorch SDPA),torch_flash_ma,flux_L512,1,1536,24,128,bfloat16,0.6823423862457275,0.6777600049972534,0.6809599995613098,0.6818559765815735,5,109.12,FLASH,torch-sdpa
+MemEff (PyTorch SDPA),torch_mem_eff,flux_L128,1,1152,24,128,bfloat16,0.48371200561523436,0.4821760058403015,0.4833280146121979,0.4853760004043579,5,83.38,EFFICIENT,torch-sdpa
+MemEff (PyTorch SDPA),torch_mem_eff,flux_L256,1,1280,24,128,bfloat16,0.6268800020217895,0.6246399879455566,0.6266880035400391,0.6286720037460327,5,90.62,EFFICIENT,torch-sdpa
+MemEff (PyTorch SDPA),torch_mem_eff,flux_L320,1,1344,24,128,bfloat16,0.699776005744934,0.6973440051078796,0.7004160284996033,0.7004479765892029,5,95.94,EFFICIENT,torch-sdpa
+MemEff (PyTorch SDPA),torch_mem_eff,flux_L384,1,1408,24,128,bfloat16,0.8333312034606933,0.8284159898757935,0.8325120210647583,0.8376320004463196,5,100.0,EFFICIENT,torch-sdpa
+MemEff (PyTorch SDPA),torch_mem_eff,flux_L448,1,1472,24,128,bfloat16,0.9533439993858337,0.9502720236778259,0.9512959718704224,0.9572479724884033,5,103.81,EFFICIENT,torch-sdpa
+MemEff (PyTorch SDPA),torch_mem_eff,flux_L512,1,1536,24,128,bfloat16,1.0066367864608765,1.0024960041046143,1.0045440196990967,1.0097919702529907,5,109.12,EFFICIENT,torch-sdpa
+xFormers,xformers_meff,flux_L128,1,1152,24,128,bfloat16,0.3452928066253662,0.3389439880847931,0.3461120128631592,0.3461120128631592,5,83.38,memory_efficient,xformers
+xFormers,xformers_meff,flux_L256,1,1280,24,128,bfloat16,0.41234560012817384,0.40959998965263367,0.41280001401901245,0.41286399960517883,5,90.62,memory_efficient,xformers
+xFormers,xformers_meff,flux_L320,1,1344,24,128,bfloat16,0.4366208016872406,0.4310399889945984,0.4331519901752472,0.4362240135669708,5,95.06,memory_efficient,xformers
+xFormers,xformers_meff,flux_L384,1,1408,24,128,bfloat16,0.4450624048709869,0.4359680116176605,0.44361600279808044,0.447488009929657,5,99.88,memory_efficient,xformers
+xFormers,xformers_meff,flux_L448,1,1472,24,128,bfloat16,0.4750400006771088,0.4711039960384369,0.47513601183891296,0.4763199985027313,5,103.81,memory_efficient,xformers
+xFormers,xformers_meff,flux_L512,1,1536,24,128,bfloat16,0.5009407997131348,0.49663999676704407,0.4997119903564453,0.5038080215454102,5,109.12,memory_efficient,xformers
+Compiled (default),torch_flash_compiled_default,flux_L128,1,1152,24,128,bfloat16,0.3856383919715881,0.3563520014286041,0.35942399501800537,0.3624959886074066,5,83.38,FLASH,torch-sdpa
+Compiled (default),torch_flash_compiled_default,flux_L256,1,1280,24,128,bfloat16,0.4982912003993988,0.4926080107688904,0.49663999676704407,0.5017600059509277,5,90.62,FLASH,torch-sdpa
+Compiled (default),torch_flash_compiled_default,flux_L320,1,1344,24,128,bfloat16,0.5369919896125793,0.5335040092468262,0.5366079807281494,0.5386239886283875,5,95.25,FLASH,torch-sdpa
+Compiled (default),torch_flash_compiled_default,flux_L384,1,1408,24,128,bfloat16,0.5841408014297486,0.5775359869003296,0.5868800282478333,0.5877760052680969,5,99.88,FLASH,torch-sdpa
+Compiled (default),torch_flash_compiled_default,flux_L448,1,1472,24,128,bfloat16,0.6184704065322876,0.6072319746017456,0.6113280057907104,0.6144000291824341,5,103.81,FLASH,torch-sdpa
+Compiled (default),torch_flash_compiled_default,flux_L512,1,1536,24,128,bfloat16,0.6428672075271606,0.6399999856948853,0.6430720090866089,0.6430720090866089,5,109.12,FLASH,torch-sdpa
+Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L128,1,1152,24,128,bfloat16,0.40020479559898375,0.3665919899940491,0.3768320083618164,0.41171199083328247,5,81.75,FLASH,torch-sdpa
+Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L256,1,1280,24,128,bfloat16,0.5535807967185974,0.5160959959030151,0.5489599704742432,0.5631359815597534,5,92.88,FLASH,torch-sdpa
+Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L320,1,1344,24,128,bfloat16,0.6143999934196472,0.562175989151001,0.6144000291824341,0.6318079829216003,5,95.13,FLASH,torch-sdpa
+Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L384,1,1408,24,128,bfloat16,0.6754495978355408,0.6512640118598938,0.6584320068359375,0.6799359917640686,5,97.13,FLASH,torch-sdpa
+Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L448,1,1472,24,128,bfloat16,0.7210752129554748,0.6973119974136353,0.7014080286026001,0.7229440212249756,5,99.0,FLASH,torch-sdpa
+Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L512,1,1536,24,128,bfloat16,0.7735359907150269,0.7485439777374268,0.7557439804077148,0.7710719704627991,5,101.63,FLASH,torch-sdpa
+HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L128,1,1152,24,128,bfloat16,0.2456959992647171,0.24371199309825897,0.24566400051116943,0.2457599937915802,5,83.38,flash-attn,hf-kernels
+HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L256,1,1280,24,128,bfloat16,0.3215551972389221,0.3164159953594208,0.319487988948822,0.32051199674606323,5,90.62,flash-attn,hf-kernels
+HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L320,1,1344,24,128,bfloat16,0.3384703993797302,0.33670398592948914,0.33792001008987427,0.33983999490737915,5,95.06,flash-attn,hf-kernels
+HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L384,1,1408,24,128,bfloat16,0.3510208010673523,0.3481599986553192,0.3491840064525604,0.35225600004196167,5,99.88,flash-attn,hf-kernels
+HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L448,1,1472,24,128,bfloat16,0.3829823970794678,0.38095998764038086,0.3829759955406189,0.3840000033378601,5,103.81,flash-attn,hf-kernels
+HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L512,1,1536,24,128,bfloat16,0.4259391903877258,0.4227519929409027,0.4249599874019623,0.4259839951992035,5,109.12,flash-attn,hf-kernels
+HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L128,1,1152,24,128,bfloat16,0.2755008041858673,0.26736000180244446,0.27561599016189575,0.27955201268196106,5,83.38,flash-attn3,hf-kernels
+HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L256,1,1280,24,128,bfloat16,0.3397440016269684,0.3368000090122223,0.3399679958820343,0.34191998839378357,5,90.62,flash-attn3,hf-kernels
+HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L320,1,1344,24,128,bfloat16,0.36019839644432067,0.3563520014286041,0.3604480028152466,0.36137598752975464,5,95.06,flash-attn3,hf-kernels
+HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L384,1,1408,24,128,bfloat16,0.37342079877853396,0.3718400001525879,0.37379199266433716,0.3746879994869232,5,99.88,flash-attn3,hf-kernels
+HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L448,1,1472,24,128,bfloat16,0.4024448037147522,0.3993600010871887,0.4014720022678375,0.4034560024738312,5,103.81,flash-attn3,hf-kernels
+HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L512,1,1536,24,128,bfloat16,0.4305088043212891,0.4270080029964447,0.4291520118713379,0.4331519901752472,5,109.12,flash-attn3,hf-kernels
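
A minimal sketch of reading this CSV back into a per-implementation latency table (the column names come from the header row above; the grouping itself is illustrative):

import csv
from collections import defaultdict

# Group mean latency (ms) by implementation and workload, using the header names above.
table = defaultdict(dict)
with open("flash_attn/results/artifacts/combine/latency.csv", newline="") as f:
    for row in csv.DictReader(f):
        table[row["Implementation"]][row["Workload"]] = float(row["Mean (ms)"])

for impl, by_wl in sorted(table.items()):
    print(impl, {wl: round(ms, 3) for wl, ms in sorted(by_wl.items())})
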
flash_attn/results/artifacts/combine/latency.svg
ADDED
Stored with Git LFS (pointer only; see Git LFS details).
flash_attn/results/cells/combine.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "numpy",
|
| 5 |
+
# "torch",
|
| 6 |
+
# "kernels-benchmark-tools",
|
| 7 |
+
# "matplotlib",
|
| 8 |
+
# ]
|
| 9 |
+
#
|
| 10 |
+
# [tool.uv.sources]
|
| 11 |
+
# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
|
| 12 |
+
# ///
|
| 13 |
+
import os
|
| 14 |
+
import sys
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
import json
|
| 17 |
+
import torch # noqa: F401 # imported because upstream may expect torch to be importable
|
| 18 |
+
import kernels_benchmark_tools as kbt
|
| 19 |
+
|
| 20 |
+
# --- Matplotlib setup and helpers ------------------------------------------------
|
| 21 |
+
import matplotlib as mpl
|
| 22 |
+
import matplotlib.pyplot as plt
|
| 23 |
+
import csv
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# Keep text as text (not paths) so CSS can style fonts, size, etc.
|
| 27 |
+
mpl.rcParams["svg.fonttype"] = "none"
|
| 28 |
+
# Make ids deterministic across builds
|
| 29 |
+
mpl.rcParams["svg.hashsalt"] = "latency-benchmark-combined"
|
| 30 |
+
# Avoid auto-closed figures interfering with our tagging
|
| 31 |
+
mpl.rcParams["figure.autolayout"] = True
|
| 32 |
+
# Make background transparent
|
| 33 |
+
mpl.rcParams["figure.facecolor"] = "none"
|
| 34 |
+
mpl.rcParams["axes.facecolor"] = "none"
|
| 35 |
+
mpl.rcParams["savefig.facecolor"] = "none"
|
| 36 |
+
mpl.rcParams["savefig.edgecolor"] = "none"
|
| 37 |
+
|
| 38 |
+
def _slugify(s: str) -> str:
|
| 39 |
+
s = (s or "").strip().lower()
|
| 40 |
+
keep = []
|
| 41 |
+
for ch in s:
|
| 42 |
+
if ch.isalnum():
|
| 43 |
+
keep.append(ch)
|
| 44 |
+
elif ch in (" ", "-", "_", "/", ".", ":"):
|
| 45 |
+
keep.append("-")
|
| 46 |
+
else:
|
| 47 |
+
keep.append("")
|
| 48 |
+
out = "".join(keep)
|
| 49 |
+
while "--" in out:
|
| 50 |
+
out = out.replace("--", "-")
|
| 51 |
+
return out.strip("-") or "unnamed"
|
| 52 |
+
|
| 53 |
+
def _tag_current_figure(default_series_prefix="series"):
|
| 54 |
+
"""Attach SVG ids (gid) to key artists so they can be targeted from CSS."""
|
| 55 |
+
fig = plt.gcf()
|
| 56 |
+
if fig is None:
|
| 57 |
+
return
|
| 58 |
+
|
| 59 |
+
# Tag the figure itself
|
| 60 |
+
fig.set_gid("figure--latency")
|
| 61 |
+
|
| 62 |
+
for ax_idx, ax in enumerate(fig.get_axes(), start=1):
|
| 63 |
+
ax.set_gid(f"axes--{ax_idx}")
|
| 64 |
+
|
| 65 |
+
# Axis labels & title
|
| 66 |
+
if ax.get_title():
|
| 67 |
+
for t in ax.texts:
|
| 68 |
+
if t.get_text() == ax.get_title():
|
| 69 |
+
t.set_gid("title--main")
|
| 70 |
+
if ax.xaxis and ax.xaxis.get_label():
|
| 71 |
+
ax.xaxis.label.set_gid("label--x")
|
| 72 |
+
if ax.yaxis and ax.yaxis.get_label():
|
| 73 |
+
ax.yaxis.label.set_gid("label--y")
|
| 74 |
+
|
| 75 |
+
# Gridlines
|
| 76 |
+
for i, gl in enumerate(ax.get_xgridlines(), start=1):
|
| 77 |
+
gl.set_gid(f"grid-x--{i}")
|
| 78 |
+
for i, gl in enumerate(ax.get_ygridlines(), start=1):
|
| 79 |
+
gl.set_gid(f"grid-y--{i}")
|
| 80 |
+
|
| 81 |
+
# Legend block & entries
|
| 82 |
+
leg = ax.get_legend()
|
| 83 |
+
if leg is not None:
|
| 84 |
+
leg.set_gid("legend")
|
| 85 |
+
for i, txt in enumerate(leg.get_texts(), start=1):
|
| 86 |
+
label_slug = _slugify(txt.get_text())
|
| 87 |
+
txt.set_gid(f"legend-label--{label_slug or i}")
|
| 88 |
+
|
| 89 |
+
# Series (lines, patches)
|
| 90 |
+
# Lines
|
| 91 |
+
line_seen = {}
|
| 92 |
+
for ln in getattr(ax, "lines", []):
|
| 93 |
+
raw_label = ln.get_label() or ""
|
| 94 |
+
# Matplotlib uses labels beginning with "_" for non-legendable items
|
| 95 |
+
label = raw_label if not raw_label.startswith("_") else f"{default_series_prefix}"
|
| 96 |
+
slug = _slugify(label)
|
| 97 |
+
line_seen[slug] = line_seen.get(slug, 0) + 1
|
| 98 |
+
suffix = "" if line_seen[slug] == 1 else f"-{line_seen[slug]}"
|
| 99 |
+
ln.set_gid(f"series--{slug}{suffix}")
|
| 100 |
+
|
| 101 |
+
# Patches (bars, areas)
|
| 102 |
+
patch_seen = {}
|
| 103 |
+
for pt in getattr(ax, "patches", []):
|
| 104 |
+
label = getattr(pt, "get_label", lambda: "")() or f"{default_series_prefix}"
|
| 105 |
+
if isinstance(label, str) and label.startswith("_"):
|
| 106 |
+
label = default_series_prefix
|
| 107 |
+
slug = _slugify(label)
|
| 108 |
+
patch_seen[slug] = patch_seen.get(slug, 0) + 1
|
| 109 |
+
suffix = "" if patch_seen[slug] == 1 else f"-{patch_seen[slug]}"
|
| 110 |
+
pt.set_gid(f"series--{slug}{suffix}")
|
| 111 |
+
|
| 112 |
+
def _postprocess_svg_add_classes(svg_path: Path):
|
| 113 |
+
"""Add convenient CSS classes alongside ids (e.g., class='series grid grid-x')."""
|
| 114 |
+
try:
|
| 115 |
+
import xml.etree.ElementTree as ET
|
| 116 |
+
ET.register_namespace("", "http://www.w3.org/2000/svg")
|
| 117 |
+
tree = ET.parse(svg_path)
|
| 118 |
+
root = tree.getroot()
|
| 119 |
+
for el in root.iter():
|
| 120 |
+
el_id = el.attrib.get("id", "")
|
| 121 |
+
if not el_id:
|
| 122 |
+
continue
|
| 123 |
+
cls = []
|
| 124 |
+
if el_id.startswith("figure--"):
|
| 125 |
+
cls.append("figure")
|
| 126 |
+
elif el_id.startswith("axes--"):
|
| 127 |
+
cls.append("axes")
|
| 128 |
+
elif el_id.startswith("grid-x--"):
|
| 129 |
+
cls += ["grid", "grid-x"]
|
| 130 |
+
elif el_id.startswith("grid-y--"):
|
| 131 |
+
cls += ["grid", "grid-y"]
|
| 132 |
+
elif el_id.startswith("legend"):
|
| 133 |
+
cls.append("legend")
|
| 134 |
+
elif el_id.startswith("label--x"):
|
| 135 |
+
cls.append("xlabel")
|
| 136 |
+
elif el_id.startswith("label--y"):
|
| 137 |
+
cls.append("ylabel")
|
| 138 |
+
elif el_id.startswith("title--"):
|
| 139 |
+
cls.append("title")
|
| 140 |
+
elif el_id.startswith("series--"):
|
| 141 |
+
cls.append("series")
|
| 142 |
+
if cls:
|
| 143 |
+
# Preserve any existing class (unlikely from Matplotlib)
|
| 144 |
+
existing = el.attrib.get("class", "")
|
| 145 |
+
el.set("class", (existing + " " + " ".join(cls)).strip())
|
| 146 |
+
tree.write(svg_path, encoding="utf-8", xml_declaration=True)
|
| 147 |
+
except Exception as e:
|
| 148 |
+
print(f"✗ SVG postprocess (classes) skipped: {e}")
|
| 149 |
+
|
| 150 |
+
# Monkey-patch savefig to force SVG & ensure tagging occurs even if kbt.viz saves internally.
|
| 151 |
+
_orig_savefig = plt.savefig
|
| 152 |
+
def _savefig_svg(fname, *args, **kwargs):
|
| 153 |
+
# Always save as SVG at a stable path for the artifact system
|
| 154 |
+
out = Path("latency.svg")
|
| 155 |
+
kwargs["format"] = "svg"
|
| 156 |
+
# Ensure everything we care about has ids before export
|
| 157 |
+
_tag_current_figure()
|
| 158 |
+
res = _orig_savefig(out, *args, **kwargs)
|
| 159 |
+
# Add helpful CSS classes on top of ids
|
| 160 |
+
_postprocess_svg_add_classes(out)
|
| 161 |
+
print(f"✓ Combined visualization saved as {out}")
|
| 162 |
+
return res
|
| 163 |
+
|
| 164 |
+
plt.savefig = _savefig_svg # apply patch
|
| 165 |
+
|
| 166 |
+
# Capture close calls in case kbt.viz() closes figures before we re-save
|
| 167 |
+
_orig_close = plt.close
|
| 168 |
+
_last_closed = {"fig": None}
|
| 169 |
+
def _capture_close(arg=None):
|
| 170 |
+
try:
|
| 171 |
+
if hasattr(arg, "savefig"): # looks like a Figure
|
| 172 |
+
_last_closed["fig"] = arg
|
| 173 |
+
else:
|
| 174 |
+
_last_closed["fig"] = plt.gcf()
|
| 175 |
+
finally:
|
| 176 |
+
return _orig_close(arg)
|
| 177 |
+
plt.close = _capture_close
|
| 178 |
+
|
| 179 |
+
# --- Locate benchmark artifacts --------------------------------------------------
|
| 180 |
+
cache_dirs = {
|
| 181 |
+
"Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
|
| 182 |
+
"MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
|
| 183 |
+
"Flash Attn 2": os.environ.get('UVNOTE_FILE_FLASH_ATTN2_BENCHMARK'),
|
| 184 |
+
"xFormers": os.environ.get('UVNOTE_FILE_XFORMERS_BENCHMARK'),
|
| 185 |
+
"SageAttention": os.environ.get('UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK'),
|
| 186 |
+
"Compiled (default)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT'),
|
| 187 |
+
"Compiled (max-autotune)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE'),
|
| 188 |
+
"HF Kernels Flash Attn": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK'),
|
| 189 |
+
"HF Kernels Flash Attn3": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK'),
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
print("LOADING BENCHMARK DATA")
|
| 193 |
+
for name, cache_dir in cache_dirs.items():
|
| 194 |
+
print(f"{name:30s}: {cache_dir}")
|
| 195 |
+
print()
|
| 196 |
+
|
| 197 |
+
file_mapping = {
|
| 198 |
+
"Flash (PyTorch SDPA)": "attn.jsonl",
|
| 199 |
+
"MemEff (PyTorch SDPA)": "attn.jsonl",
|
| 200 |
+
"Flash Attn 2": "attn.jsonl",
|
| 201 |
+
"xFormers": "attn.jsonl",
|
| 202 |
+
"SageAttention": "attn.jsonl",
|
| 203 |
+
"Compiled (default)": "attn_default.jsonl",
|
| 204 |
+
"Compiled (max-autotune)": "attn_max_autotune.jsonl",
|
| 205 |
+
"HF Kernels Flash Attn": "attn.jsonl",
|
| 206 |
+
"HF Kernels Flash Attn3": "attn.jsonl",
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
all_paths = []
|
| 210 |
+
for name, cache_dir in cache_dirs.items():
|
| 211 |
+
if cache_dir:
|
| 212 |
+
path = Path(cache_dir) / file_mapping[name]
|
| 213 |
+
if path.exists() and path.stat().st_size > 0:
|
| 214 |
+
all_paths.append(str(path))
|
| 215 |
+
print(f"✓ Found {name}: {path}")
|
| 216 |
+
else:
|
| 217 |
+
print(f"⊘ Empty/Missing {name}: {path}")
|
| 218 |
+
else:
|
| 219 |
+
print(f"✗ No cache dir for {name}")
|
| 220 |
+
print()
|
| 221 |
+
|
| 222 |
+
if not all_paths:
|
| 223 |
+
print("ERROR: No benchmark data files found!")
|
| 224 |
+
# restore patched functions before exiting
|
| 225 |
+
plt.savefig = _orig_savefig
|
| 226 |
+
plt.close = _orig_close
|
| 227 |
+
sys.exit(1)
|
| 228 |
+
|
| 229 |
+
# --- Summary + Visualization -----------------------------------------------------
|
| 230 |
+
print("COMBINED BENCHMARK SUMMARY\n")
|
| 231 |
+
kbt.summarize(all_paths)
|
| 232 |
+
print("\nGENERATING COMBINED VISUALIZATION\n")
|
| 233 |
+
|
| 234 |
+
try:
|
| 235 |
+
# If kbt.viz saves internally, our patched savefig ensures SVG gets written,
|
| 236 |
+
# and it will carry ids/classes for CSS styling.
|
| 237 |
+
kbt.viz(all_paths)
|
| 238 |
+
# Safety net: if kbt.viz didn't save, save now.
|
| 239 |
+
# if not Path("latency.svg").exists():
|
| 240 |
+
# _tag_current_figure()
|
| 241 |
+
# plt.savefig("latency.svg")
|
| 242 |
+
|
| 243 |
+
plt.savefig("latency.svg") # ensure saved with tagging
|
| 244 |
+
|
| 245 |
+
print("✓ SVG visualization ready: latency.svg!")
|
| 246 |
+
except ImportError as e:
|
| 247 |
+
print(f"✗ Visualization requires matplotlib: {e}")
|
| 248 |
+
except Exception as e:
|
| 249 |
+
print(f"✗ Visualization failed: {e}")
|
| 250 |
+
finally:
|
| 251 |
+
# Clean up patches to avoid side effects in later cells
|
| 252 |
+
plt.savefig = _orig_savefig
|
| 253 |
+
plt.close = _orig_close
|
| 254 |
+
|
| 255 |
+
print()
|
| 256 |
+
print("ANALYSIS COMPLETE")
|
| 257 |
+
print(f"Total implementations analyzed: {len(all_paths)}")
|
| 258 |
+
print(f"\nImplementations included:")
|
| 259 |
+
for name, cache_dir in cache_dirs.items():
|
| 260 |
+
if cache_dir:
|
| 261 |
+
path = Path(cache_dir) / file_mapping[name]
|
| 262 |
+
if path.exists() and path.stat().st_size > 0:
|
| 263 |
+
print(f" ✓ {name}")
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
# Collect all benchmark data and export to CSV
|
| 268 |
+
all_data = {}
|
| 269 |
+
for name, cache_dir in cache_dirs.items():
|
| 270 |
+
if cache_dir:
|
| 271 |
+
path = Path(cache_dir) / file_mapping[name]
|
| 272 |
+
if path.exists() and path.stat().st_size > 0:
|
| 273 |
+
with open(path, 'r') as f:
|
| 274 |
+
records = [json.loads(line) for line in f]
|
| 275 |
+
all_data[name] = records
|
| 276 |
+
|
| 277 |
+
# Export to CSV
|
| 278 |
+
csv_path = Path("latency.csv")
|
| 279 |
+
with open(csv_path, 'w', newline='') as csvfile:
|
| 280 |
+
writer = csv.writer(csvfile)
|
| 281 |
+
|
| 282 |
+
# Write header
|
| 283 |
+
header = ["Implementation", "Impl ID", "Workload", "Batch", "Seq Length", "Heads", "Head Dim", "Dtype",
|
| 284 |
+
"Mean (ms)", "P10 (ms)", "P50 (ms)", "P90 (ms)", "Reps",
|
| 285 |
+
# "Compile (ms)",
|
| 286 |
+
"Peak Mem (MB)", "Backend", "Family"]
|
| 287 |
+
writer.writerow(header)
|
| 288 |
+
|
| 289 |
+
# Write data rows
|
| 290 |
+
for impl_name, records in all_data.items():
|
| 291 |
+
for record in records:
|
| 292 |
+
wl = record.get('wl', {})
|
| 293 |
+
lat = record.get('lat_ms', {})
|
| 294 |
+
tags = record.get('tags', {})
|
| 295 |
+
|
| 296 |
+
row = [
|
| 297 |
+
impl_name,
|
| 298 |
+
record.get('impl', ''),
|
| 299 |
+
wl.get('name', ''),
|
| 300 |
+
wl.get('batch', ''),
|
| 301 |
+
wl.get('seq_len', ''),
|
| 302 |
+
wl.get('heads', ''),
|
| 303 |
+
wl.get('head_dim', ''),
|
| 304 |
+
wl.get('dtype', ''),
|
| 305 |
+
lat.get('mean', ''),
|
| 306 |
+
lat.get('p10', ''),
|
| 307 |
+
lat.get('p50', ''),
|
| 308 |
+
lat.get('p90', ''),
|
| 309 |
+
lat.get('reps', ''),
|
| 310 |
+
# record.get('compile_ms', ''),
|
| 311 |
+
round(record.get('peak_bytes', 0) / 1024 / 1024, 2) if record.get('peak_bytes') else '',
|
| 312 |
+
tags.get('backend', ''),
|
| 313 |
+
tags.get('family', ''),
|
| 314 |
+
]
|
| 315 |
+
writer.writerow(row)
|
| 316 |
+
|
| 317 |
+
print(f"✓ CSV export complete: {csv_path}")
|
| 318 |
+
print(f"Total implementations: {len(all_data)}")
|
| 319 |
+
print(f"Total records: {sum(len(records) for records in all_data.values())}")
|
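For reference, each JSONL record produced by the benchmark cells becomes one row of latency.csv. A minimal sketch of that mapping, assuming the record schema shown in the attn.jsonl artifacts above (values abbreviated for illustration; not part of the uploaded cells):

import json

# One abbreviated record in the attn.jsonl schema used by the benchmark cells.
line = (
    '{"impl": "xformers_meff", '
    '"tags": {"family": "xformers", "backend": "memory_efficient"}, '
    '"wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16"}, '
    '"lat_ms": {"p10": 0.339, "p50": 0.346, "p90": 0.346, "mean": 0.345, "reps": 5}, '
    '"peak_bytes": 87425024}'
)
record = json.loads(line)
wl, lat, tags = record["wl"], record["lat_ms"], record["tags"]

# Same column order as the CSV header written by the cell above.
row = [
    "xFormers",                                    # display name from file_mapping
    record["impl"], wl["name"], wl["batch"], wl["seq_len"], wl["heads"], wl["head_dim"], wl["dtype"],
    lat["mean"], lat["p10"], lat["p50"], lat["p90"], lat["reps"],
    round(record["peak_bytes"] / 1024 / 1024, 2),  # peak memory in MB
    tags["backend"], tags["family"],
]
print(row)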
flash_attn/results/cells/csv_export.py
ADDED
@@ -0,0 +1,76 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch",
#     "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
# ///
import os
import csv
from pathlib import Path
import json

# --- Locate benchmark artifacts --------------------------------------------------
cache_dirs = {
    "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
    "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
    "Flash Attn 2": os.environ.get('UVNOTE_FILE_FLASH_ATTN2_BENCHMARK'),
    "xFormers": os.environ.get('UVNOTE_FILE_XFORMERS_BENCHMARK'),
    "SageAttention": os.environ.get('UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK'),
    "Compiled (default)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT'),
    "Compiled (max-autotune)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE'),
    "HF Kernels Flash Attn": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK'),
    "HF Kernels Flash Attn3": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK'),
}

file_mapping = {
    "Flash (PyTorch SDPA)": "attn.jsonl",
    "MemEff (PyTorch SDPA)": "attn.jsonl",
    "Flash Attn 2": "attn.jsonl",
    "xFormers": "attn.jsonl",
    "SageAttention": "attn.jsonl",
    "Compiled (default)": "attn_default.jsonl",
    "Compiled (max-autotune)": "attn_max_autotune.jsonl",
    "HF Kernels Flash Attn": "attn.jsonl",
    "HF Kernels Flash Attn3": "attn.jsonl",
}

# Collect all benchmark data
all_data = {}
for name, cache_dir in cache_dirs.items():
    if cache_dir:
        path = Path(cache_dir) / file_mapping[name]
        if path.exists() and path.stat().st_size > 0:
            with open(path, 'r') as f:
                records = [json.loads(line) for line in f]
            all_data[name] = records

# Export to CSV
csv_path = Path("latency.csv")
with open(csv_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # Write header: the JSONL records store seq_len under "wl" and
    # mean/p10/p50/p90 latencies under "lat_ms" (no raw min/max is recorded).
    header = ["Implementation", "Sequence Length", "Mean (ms)", "P10 (ms)", "P50 (ms)", "P90 (ms)"]
    writer.writerow(header)

    # Write data rows
    for impl_name, records in all_data.items():
        for record in records:
            wl = record.get('wl', {})
            lat = record.get('lat_ms', {})
            row = [
                impl_name,
                wl.get('seq_len', ''),
                lat.get('mean', ''),
                lat.get('p10', ''),
                lat.get('p50', ''),
                lat.get('p90', ''),
            ]
            writer.writerow(row)

print(f"✓ CSV export complete: {csv_path}")
print(f"Total implementations: {len(all_data)}")
print(f"Total records: {sum(len(records) for records in all_data.values())}")
flash_attn/results/combined_results.html
ADDED
The diff for this file is too large to render.
See raw diff
flash_attn/results/index.html
ADDED
@@ -0,0 +1,88 @@
<!DOCTYPE html>
<html>
<head>
    <meta charset='UTF-8'>
    <meta name='viewport' content='width=device-width, initial-scale=1.0'>
    <title>Index of /flash_attn/results</title>
    <style>
        :root {
            --bg-primary: #0a0a0a;
            --bg-secondary: #121212;
            --bg-tertiary: #181818;
            --text-primary: #e0e0e0;
            --text-secondary: #888888;
            --text-link: #64b5f6;
            --border-primary: #2a2a2a;
        }
        body {
            font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
            background: var(--bg-primary);
            color: var(--text-primary);
            margin: 0;
            padding: 16px;
            max-width: 900px;
            margin: 0 auto;
        }
        .controls {
            display: flex;
            justify-content: flex-end;
            margin-bottom: 1rem;
        }
        .back-button {
            background: var(--bg-secondary);
            border: 1px solid var(--border-primary);
            padding: 8px 12px;
            border-radius: 4px;
            color: var(--text-secondary);
            cursor: pointer;
            font-size: 0.9rem;
            text-decoration: none;
            display: inline-block;
        }
        .back-button:hover {
            color: var(--text-primary);
            background: var(--bg-tertiary);
        }
        h1 {
            font-size: 1.5em;
            margin: 1rem 0;
            color: var(--text-primary);
            border-bottom: 1px solid var(--border-primary);
            padding-bottom: 0.5rem;
        }
        ul {
            list-style-type: none;
            padding: 0;
        }
        li {
            margin: 0;
            border-bottom: 1px solid var(--border-primary);
        }
        li:last-child {
            border-bottom: none;
        }
        a {
            display: block;
            padding: 0.75rem 0.5rem;
            text-decoration: none;
            color: var(--text-link);
            transition: background 0.2s ease;
        }
        a:hover {
            background: var(--bg-secondary);
        }
        .dir {
            font-weight: 500;
        }
    </style>
</head>
<body>
    <div class='controls'>
        <a href='../index.html' class='back-button'>← back</a>
    </div>
    <h1>Index of /flash_attn/results</h1>
    <ul>
        <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
    </ul>
</body>
</html>
index.html
ADDED
@@ -0,0 +1,85 @@
<!DOCTYPE html>
<html>
<head>
    <meta charset='UTF-8'>
    <meta name='viewport' content='width=device-width, initial-scale=1.0'>
    <title>Index of /</title>
    <style>
        :root {
            --bg-primary: #0a0a0a;
            --bg-secondary: #121212;
            --bg-tertiary: #181818;
            --text-primary: #e0e0e0;
            --text-secondary: #888888;
            --text-link: #64b5f6;
            --border-primary: #2a2a2a;
        }
        body {
            font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
            background: var(--bg-primary);
            color: var(--text-primary);
            margin: 0;
            padding: 16px;
            max-width: 900px;
            margin: 0 auto;
        }
        .controls {
            display: flex;
            justify-content: flex-end;
            margin-bottom: 1rem;
        }
        .back-button {
            background: var(--bg-secondary);
            border: 1px solid var(--border-primary);
            padding: 8px 12px;
            border-radius: 4px;
            color: var(--text-secondary);
            cursor: pointer;
            font-size: 0.9rem;
            text-decoration: none;
            display: inline-block;
        }
        .back-button:hover {
            color: var(--text-primary);
            background: var(--bg-tertiary);
        }
        h1 {
            font-size: 1.5em;
            margin: 1rem 0;
            color: var(--text-primary);
            border-bottom: 1px solid var(--border-primary);
            padding-bottom: 0.5rem;
        }
        ul {
            list-style-type: none;
            padding: 0;
        }
        li {
            margin: 0;
            border-bottom: 1px solid var(--border-primary);
        }
        li:last-child {
            border-bottom: none;
        }
        a {
            display: block;
            padding: 0.75rem 0.5rem;
            text-decoration: none;
            color: var(--text-link);
            transition: background 0.2s ease;
        }
        a:hover {
            background: var(--bg-secondary);
        }
        .dir {
            font-weight: 500;
        }
    </style>
</head>
<body>
    <h1>Index of /</h1>
    <ul>
        <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
    </ul>
</body>
</html>