drbh (HF Staff) committed (verified)
Commit 58b76f1 · Parent(s): b654441

Upload folder using huggingface_hub
flash_attn/impls/artifacts/benchmark/attn.jsonl ADDED
@@ -0,0 +1,6 @@
+ {"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.3389439880847931, "p50": 0.3461120128631592, "p90": 0.3461120128631592, "mean": 0.3452928066253662, "reps": 5, "warmup": 2}, "compile_ms": 0.9463679790496826, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000362396240234375, "mse": 2.9206275939941406e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.40959998965263367, "p50": 0.41280001401901245, "p90": 0.41286399960517883, "mean": 0.41234560012817384, "reps": 5, "warmup": 2}, "compile_ms": 0.34329599142074585, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4310399889945984, "p50": 0.4331519901752472, "p90": 0.4362240135669708, "mean": 0.4366208016872406, "reps": 5, "warmup": 2}, "compile_ms": 0.35942399501800537, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4359680116176605, "p50": 0.44361600279808044, "p90": 0.447488009929657, "mean": 0.4450624048709869, "reps": 5, "warmup": 2}, "compile_ms": 0.3678080141544342, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4711039960384369, "p50": 0.47513601183891296, "p90": 0.4763199985027313, "mean": 0.4750400006771088, "reps": 5, "warmup": 2}, "compile_ms": 0.40857601165771484, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:08:21Z", "run": "4862bb56aac04f66908d2a97924104e2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.49663999676704407, "p50": 0.4997119903564453, "p90": 0.5038080215454102, "mean": 0.5009407997131348, "reps": 5, "warmup": 2}, "compile_ms": 0.43724799156188965, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl ADDED
@@ -0,0 +1,6 @@
+ {"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.3563520014286041, "p50": 0.35942399501800537, "p90": 0.3624959886074066, "mean": 0.3856383919715881, "reps": 5, "warmup": 2}, "compile_ms": 2383.33544921875, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.4926080107688904, "p50": 0.49663999676704407, "p90": 0.5017600059509277, "mean": 0.4982912003993988, "reps": 5, "warmup": 2}, "compile_ms": 76.60860443115234, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5335040092468262, "p50": 0.5366079807281494, "p90": 0.5386239886283875, "mean": 0.5369919896125793, "reps": 5, "warmup": 2}, "compile_ms": 74.49088287353516, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:11:55Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5775359869003296, "p50": 0.5868800282478333, "p90": 0.5877760052680969, "mean": 0.5841408014297486, "reps": 5, "warmup": 2}, "compile_ms": 72.97433471679688, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:11:56Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6072319746017456, "p50": 0.6113280057907104, "p90": 0.6144000291824341, "mean": 0.6184704065322876, "reps": 5, "warmup": 2}, "compile_ms": 215.12498474121094, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:11:56Z", "run": "e9857dd2d39d4b40a6c91c0fdad82b00", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6399999856948853, "p50": 0.6430720090866089, "p90": 0.6430720090866089, "mean": 0.6428672075271606, "reps": 5, "warmup": 2}, "compile_ms": 71.8028793334961, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl ADDED
@@ -0,0 +1,6 @@
+ {"ts": "2025-10-02T16:11:08Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.3665919899940491, "p50": 0.3768320083618164, "p90": 0.41171199083328247, "mean": 0.40020479559898375, "reps": 5, "warmup": 2}, "compile_ms": 2910.97705078125, "peak_bytes": 85722112, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:11:08Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5160959959030151, "p50": 0.5489599704742432, "p90": 0.5631359815597534, "mean": 0.5535807967185974, "reps": 5, "warmup": 2}, "compile_ms": 85.84806060791016, "peak_bytes": 97387520, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:11:09Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.562175989151001, "p50": 0.6144000291824341, "p90": 0.6318079829216003, "mean": 0.6143999934196472, "reps": 5, "warmup": 2}, "compile_ms": 82.77401733398438, "peak_bytes": 99746816, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:11:09Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6512640118598938, "p50": 0.6584320068359375, "p90": 0.6799359917640686, "mean": 0.6754495978355408, "reps": 5, "warmup": 2}, "compile_ms": 81.94969940185547, "peak_bytes": 101843968, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:11:09Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.6973119974136353, "p50": 0.7014080286026001, "p90": 0.7229440212249756, "mean": 0.7210752129554748, "reps": 5, "warmup": 2}, "compile_ms": 81.1141128540039, "peak_bytes": 103810048, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+ {"ts": "2025-10-02T16:11:10Z", "run": "06f2face3c924e1b89a35a0fb568d4b1", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.7485439777374268, "p50": 0.7557439804077148, "p90": 0.7710719704627991, "mean": 0.7735359907150269, "reps": 5, "warmup": 2}, "compile_ms": 767.1397094726562, "peak_bytes": 106562560, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py ADDED
@@ -0,0 +1,72 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch",
+ #     "kernels-benchmark-tools",
+ #     "kernels",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+ # ///
+ import torch
+ import sys
+ import os
+ import kernels_benchmark_tools as kbt
+ from kernels import get_kernel
+
+ hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn", revision="v0.0.2")
+
+
+ def hf_flash_attention(query, key, value):
+     """HuggingFace Kernels Flash Attention"""
+     return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0]
+
+
+ kbt.add(
+     "hf_kernels_flash_attn",
+     hf_flash_attention,
+     tags={"family": "hf-kernels", "backend": "flash-attn", "compile": "none"},
+ )
+
+ if __name__ == "__main__":
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     if device == "cpu":
+         print("HF Kernels Flash Attention requires CUDA - skipping benchmark")
+         sys.exit(0)
+
+     dtype = "bfloat16"
+
+     # Flux-like workloads
+     base = 1024
+     flux_sizes = [128, 256, 320, 384, 448, 512]
+     heads = 24
+     head_dim = 128
+
+     wl = []
+     for L in flux_sizes:
+         wl.append(
+             {
+                 "name": f"flux_L{L}",
+                 "batch": 1,
+                 "seq_len": base + L,
+                 "heads": heads,
+                 "head_dim": head_dim,
+                 "dtype": dtype,
+                 "device": device,
+                 "seed": 0,
+             }
+         )
+
+     kbt.run(
+         wl,
+         jsonl="attn.jsonl",
+         reps=5,
+         warmup=2,
+         gen=kbt.attn.gen_qkv,
+         ref=kbt.attn.ref_math,
+         cmp=kbt.attn.cmp_allclose,
+     )
+     kbt.summarize(["attn.jsonl"])
flash_attn/impls/cells/benchmark_default.py ADDED
@@ -0,0 +1,70 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch",
+ #     "kernels-benchmark-tools",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+ # ///
+ import torch
+ import sys
+ import os
+ import kernels_benchmark_tools as kbt
+
+
+ def torch_flash_base(q, k, v):
+     qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+     with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+         o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+     return o.transpose(1, 2).contiguous()
+
+
+ # Compile with default mode
+ compiled_flash_default = torch.compile(torch_flash_base, mode="default", fullgraph=True, dynamic=False)
+
+ kbt.add(
+     "torch_flash_compiled_default",
+     compiled_flash_default,
+     tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "default"},
+ )
+
+ if __name__ == "__main__":
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     dtype = "float32" if device == "cpu" else "bfloat16"
+
+     # Flux-like workloads
+     base = 1024 if device == "cuda" else 512
+     flux_sizes = (
+         [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+     )
+     heads = 24 if device == "cuda" else 8
+     head_dim = 128 if device == "cuda" else 64
+
+     wl = []
+     for L in flux_sizes:
+         wl.append(
+             {
+                 "name": f"flux_L{L}",
+                 "batch": 1,
+                 "seq_len": base + L,
+                 "heads": heads,
+                 "head_dim": head_dim,
+                 "dtype": dtype,
+                 "device": device,
+                 "seed": 0,
+             }
+         )
+
+     kbt.run(
+         wl,
+         jsonl="attn_default.jsonl",
+         reps=5,
+         warmup=2,
+         gen=kbt.attn.gen_qkv,
+         ref=kbt.attn.ref_math,
+         cmp=kbt.attn.cmp_allclose,
+     )
+     kbt.summarize(["attn_default.jsonl"])
flash_attn/impls/cells/benchmark_max_autotune.py ADDED
@@ -0,0 +1,70 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch",
+ #     "kernels-benchmark-tools",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+ # ///
+ import torch
+ import sys
+ import os
+ import kernels_benchmark_tools as kbt
+
+
+ def torch_flash_base(q, k, v):
+     qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+     with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+         o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+     return o.transpose(1, 2).contiguous()
+
+
+ # Compile with max-autotune mode
+ compiled_flash_max_autotune = torch.compile(torch_flash_base, mode="max-autotune", fullgraph=True, dynamic=False)
+
+ kbt.add(
+     "torch_flash_compiled_max_autotune",
+     compiled_flash_max_autotune,
+     tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
+ )
+
+ if __name__ == "__main__":
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     dtype = "float32" if device == "cpu" else "bfloat16"
+
+     # Flux-like workloads
+     base = 1024 if device == "cuda" else 512
+     flux_sizes = (
+         [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+     )
+     heads = 24 if device == "cuda" else 8
+     head_dim = 128 if device == "cuda" else 64
+
+     wl = []
+     for L in flux_sizes:
+         wl.append(
+             {
+                 "name": f"flux_L{L}",
+                 "batch": 1,
+                 "seq_len": base + L,
+                 "heads": heads,
+                 "head_dim": head_dim,
+                 "dtype": dtype,
+                 "device": device,
+                 "seed": 0,
+             }
+         )
+
+     kbt.run(
+         wl,
+         jsonl="attn_max_autotune.jsonl",
+         reps=5,
+         warmup=2,
+         gen=kbt.attn.gen_qkv,
+         ref=kbt.attn.ref_math,
+         cmp=kbt.attn.cmp_allclose,
+     )
+     kbt.summarize(["attn_max_autotune.jsonl"])
flash_attn/impls/cells/nv.py ADDED
@@ -0,0 +1,3 @@
+ import subprocess
+
+ print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
flash_attn/impls/compiled_variants.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/flash_attention.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/hf_kernels_flash_attn.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/hf_kernels_flash_attn3.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/index.html ADDED
@@ -0,0 +1,94 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+ <title>Index of /flash_attn/impls</title>
+ <style>
+ :root {
+ --bg-primary: #0a0a0a;
+ --bg-secondary: #121212;
+ --bg-tertiary: #181818;
+ --text-primary: #e0e0e0;
+ --text-secondary: #888888;
+ --text-link: #64b5f6;
+ --border-primary: #2a2a2a;
+ }
+ body {
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+ background: var(--bg-primary);
+ color: var(--text-primary);
+ margin: 0;
+ padding: 16px;
+ max-width: 900px;
+ margin: 0 auto;
+ }
+ .controls {
+ display: flex;
+ justify-content: flex-end;
+ margin-bottom: 1rem;
+ }
+ .back-button {
+ background: var(--bg-secondary);
+ border: 1px solid var(--border-primary);
+ padding: 8px 12px;
+ border-radius: 4px;
+ color: var(--text-secondary);
+ cursor: pointer;
+ font-size: 0.9rem;
+ text-decoration: none;
+ display: inline-block;
+ }
+ .back-button:hover {
+ color: var(--text-primary);
+ background: var(--bg-tertiary);
+ }
+ h1 {
+ font-size: 1.5em;
+ margin: 1rem 0;
+ color: var(--text-primary);
+ border-bottom: 1px solid var(--border-primary);
+ padding-bottom: 0.5rem;
+ }
+ ul {
+ list-style-type: none;
+ padding: 0;
+ }
+ li {
+ margin: 0;
+ border-bottom: 1px solid var(--border-primary);
+ }
+ li:last-child {
+ border-bottom: none;
+ }
+ a {
+ display: block;
+ padding: 0.75rem 0.5rem;
+ text-decoration: none;
+ color: var(--text-link);
+ transition: background 0.2s ease;
+ }
+ a:hover {
+ background: var(--bg-secondary);
+ }
+ .dir {
+ font-weight: 500;
+ }
+ </style>
+ </head>
+ <body>
+ <div class='controls'>
+ <a href='../index.html' class='back-button'>← back</a>
+ </div>
+ <h1>Index of /flash_attn/impls</h1>
+ <ul>
+ <li><a href='compiled_variants.html' class='file'>compiled_variants.html</a></li>
+ <li><a href='flash_attention.html' class='file'>flash_attention.html</a></li>
+ <li><a href='hf_kernels_flash_attn.html' class='file'>hf_kernels_flash_attn.html</a></li>
+ <li><a href='hf_kernels_flash_attn3.html' class='file'>hf_kernels_flash_attn3.html</a></li>
+ <li><a href='mem_efficient_attention.html' class='file'>mem_efficient_attention.html</a></li>
+ <li><a href='sage_attention.html' class='file'>sage_attention.html</a></li>
+ <li><a href='xformers.html' class='file'>xformers.html</a></li>
+ </ul>
+ </body>
+ </html>
flash_attn/impls/mem_efficient_attention.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/sage_attention.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/xformers.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/index.html ADDED
@@ -0,0 +1,89 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+ <title>Index of /flash_attn</title>
+ <style>
+ :root {
+ --bg-primary: #0a0a0a;
+ --bg-secondary: #121212;
+ --bg-tertiary: #181818;
+ --text-primary: #e0e0e0;
+ --text-secondary: #888888;
+ --text-link: #64b5f6;
+ --border-primary: #2a2a2a;
+ }
+ body {
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+ background: var(--bg-primary);
+ color: var(--text-primary);
+ margin: 0;
+ padding: 16px;
+ max-width: 900px;
+ margin: 0 auto;
+ }
+ .controls {
+ display: flex;
+ justify-content: flex-end;
+ margin-bottom: 1rem;
+ }
+ .back-button {
+ background: var(--bg-secondary);
+ border: 1px solid var(--border-primary);
+ padding: 8px 12px;
+ border-radius: 4px;
+ color: var(--text-secondary);
+ cursor: pointer;
+ font-size: 0.9rem;
+ text-decoration: none;
+ display: inline-block;
+ }
+ .back-button:hover {
+ color: var(--text-primary);
+ background: var(--bg-tertiary);
+ }
+ h1 {
+ font-size: 1.5em;
+ margin: 1rem 0;
+ color: var(--text-primary);
+ border-bottom: 1px solid var(--border-primary);
+ padding-bottom: 0.5rem;
+ }
+ ul {
+ list-style-type: none;
+ padding: 0;
+ }
+ li {
+ margin: 0;
+ border-bottom: 1px solid var(--border-primary);
+ }
+ li:last-child {
+ border-bottom: none;
+ }
+ a {
+ display: block;
+ padding: 0.75rem 0.5rem;
+ text-decoration: none;
+ color: var(--text-link);
+ transition: background 0.2s ease;
+ }
+ a:hover {
+ background: var(--bg-secondary);
+ }
+ .dir {
+ font-weight: 500;
+ }
+ </style>
+ </head>
+ <body>
+ <div class='controls'>
+ <a href='../index.html' class='back-button'>← back</a>
+ </div>
+ <h1>Index of /flash_attn</h1>
+ <ul>
+ <li><a href='impls/index.html' class='dir'>impls/</a></li>
+ <li><a href='results/index.html' class='dir'>results/</a></li>
+ </ul>
+ </body>
+ </html>
flash_attn/results/artifacts/combine/latency.csv ADDED
@@ -0,0 +1,43 @@
+ Implementation,Impl ID,Workload,Batch,Seq Length,Heads,Head Dim,Dtype,Mean (ms),P10 (ms),P50 (ms),P90 (ms),Reps,Peak Mem (MB),Backend,Family
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L128,1,1152,24,128,bfloat16,0.407123202085495,0.40537598729133606,0.40755200386047363,0.407584011554718,5,83.38,FLASH,torch-sdpa
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L256,1,1280,24,128,bfloat16,0.5235007882118226,0.5212159752845764,0.5232639908790588,0.523360013961792,5,90.62,FLASH,torch-sdpa
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L320,1,1344,24,128,bfloat16,0.545849597454071,0.5418559908866882,0.5468159914016724,0.5469120144844055,5,95.06,FLASH,torch-sdpa
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L384,1,1408,24,128,bfloat16,0.5892416119575501,0.5867519974708557,0.5888000130653381,0.5888000130653381,5,99.88,FLASH,torch-sdpa
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L448,1,1472,24,128,bfloat16,0.6449280023574829,0.6430720090866089,0.6442239880561829,0.6450240015983582,5,103.81,FLASH,torch-sdpa
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L512,1,1536,24,128,bfloat16,0.6823423862457275,0.6777600049972534,0.6809599995613098,0.6818559765815735,5,109.12,FLASH,torch-sdpa
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L128,1,1152,24,128,bfloat16,0.48371200561523436,0.4821760058403015,0.4833280146121979,0.4853760004043579,5,83.38,EFFICIENT,torch-sdpa
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L256,1,1280,24,128,bfloat16,0.6268800020217895,0.6246399879455566,0.6266880035400391,0.6286720037460327,5,90.62,EFFICIENT,torch-sdpa
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L320,1,1344,24,128,bfloat16,0.699776005744934,0.6973440051078796,0.7004160284996033,0.7004479765892029,5,95.94,EFFICIENT,torch-sdpa
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L384,1,1408,24,128,bfloat16,0.8333312034606933,0.8284159898757935,0.8325120210647583,0.8376320004463196,5,100.0,EFFICIENT,torch-sdpa
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L448,1,1472,24,128,bfloat16,0.9533439993858337,0.9502720236778259,0.9512959718704224,0.9572479724884033,5,103.81,EFFICIENT,torch-sdpa
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L512,1,1536,24,128,bfloat16,1.0066367864608765,1.0024960041046143,1.0045440196990967,1.0097919702529907,5,109.12,EFFICIENT,torch-sdpa
+ xFormers,xformers_meff,flux_L128,1,1152,24,128,bfloat16,0.3452928066253662,0.3389439880847931,0.3461120128631592,0.3461120128631592,5,83.38,memory_efficient,xformers
+ xFormers,xformers_meff,flux_L256,1,1280,24,128,bfloat16,0.41234560012817384,0.40959998965263367,0.41280001401901245,0.41286399960517883,5,90.62,memory_efficient,xformers
+ xFormers,xformers_meff,flux_L320,1,1344,24,128,bfloat16,0.4366208016872406,0.4310399889945984,0.4331519901752472,0.4362240135669708,5,95.06,memory_efficient,xformers
+ xFormers,xformers_meff,flux_L384,1,1408,24,128,bfloat16,0.4450624048709869,0.4359680116176605,0.44361600279808044,0.447488009929657,5,99.88,memory_efficient,xformers
+ xFormers,xformers_meff,flux_L448,1,1472,24,128,bfloat16,0.4750400006771088,0.4711039960384369,0.47513601183891296,0.4763199985027313,5,103.81,memory_efficient,xformers
+ xFormers,xformers_meff,flux_L512,1,1536,24,128,bfloat16,0.5009407997131348,0.49663999676704407,0.4997119903564453,0.5038080215454102,5,109.12,memory_efficient,xformers
+ Compiled (default),torch_flash_compiled_default,flux_L128,1,1152,24,128,bfloat16,0.3856383919715881,0.3563520014286041,0.35942399501800537,0.3624959886074066,5,83.38,FLASH,torch-sdpa
+ Compiled (default),torch_flash_compiled_default,flux_L256,1,1280,24,128,bfloat16,0.4982912003993988,0.4926080107688904,0.49663999676704407,0.5017600059509277,5,90.62,FLASH,torch-sdpa
+ Compiled (default),torch_flash_compiled_default,flux_L320,1,1344,24,128,bfloat16,0.5369919896125793,0.5335040092468262,0.5366079807281494,0.5386239886283875,5,95.25,FLASH,torch-sdpa
+ Compiled (default),torch_flash_compiled_default,flux_L384,1,1408,24,128,bfloat16,0.5841408014297486,0.5775359869003296,0.5868800282478333,0.5877760052680969,5,99.88,FLASH,torch-sdpa
+ Compiled (default),torch_flash_compiled_default,flux_L448,1,1472,24,128,bfloat16,0.6184704065322876,0.6072319746017456,0.6113280057907104,0.6144000291824341,5,103.81,FLASH,torch-sdpa
+ Compiled (default),torch_flash_compiled_default,flux_L512,1,1536,24,128,bfloat16,0.6428672075271606,0.6399999856948853,0.6430720090866089,0.6430720090866089,5,109.12,FLASH,torch-sdpa
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L128,1,1152,24,128,bfloat16,0.40020479559898375,0.3665919899940491,0.3768320083618164,0.41171199083328247,5,81.75,FLASH,torch-sdpa
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L256,1,1280,24,128,bfloat16,0.5535807967185974,0.5160959959030151,0.5489599704742432,0.5631359815597534,5,92.88,FLASH,torch-sdpa
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L320,1,1344,24,128,bfloat16,0.6143999934196472,0.562175989151001,0.6144000291824341,0.6318079829216003,5,95.13,FLASH,torch-sdpa
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L384,1,1408,24,128,bfloat16,0.6754495978355408,0.6512640118598938,0.6584320068359375,0.6799359917640686,5,97.13,FLASH,torch-sdpa
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L448,1,1472,24,128,bfloat16,0.7210752129554748,0.6973119974136353,0.7014080286026001,0.7229440212249756,5,99.0,FLASH,torch-sdpa
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L512,1,1536,24,128,bfloat16,0.7735359907150269,0.7485439777374268,0.7557439804077148,0.7710719704627991,5,101.63,FLASH,torch-sdpa
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L128,1,1152,24,128,bfloat16,0.2456959992647171,0.24371199309825897,0.24566400051116943,0.2457599937915802,5,83.38,flash-attn,hf-kernels
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L256,1,1280,24,128,bfloat16,0.3215551972389221,0.3164159953594208,0.319487988948822,0.32051199674606323,5,90.62,flash-attn,hf-kernels
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L320,1,1344,24,128,bfloat16,0.3384703993797302,0.33670398592948914,0.33792001008987427,0.33983999490737915,5,95.06,flash-attn,hf-kernels
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L384,1,1408,24,128,bfloat16,0.3510208010673523,0.3481599986553192,0.3491840064525604,0.35225600004196167,5,99.88,flash-attn,hf-kernels
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L448,1,1472,24,128,bfloat16,0.3829823970794678,0.38095998764038086,0.3829759955406189,0.3840000033378601,5,103.81,flash-attn,hf-kernels
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L512,1,1536,24,128,bfloat16,0.4259391903877258,0.4227519929409027,0.4249599874019623,0.4259839951992035,5,109.12,flash-attn,hf-kernels
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L128,1,1152,24,128,bfloat16,0.2755008041858673,0.26736000180244446,0.27561599016189575,0.27955201268196106,5,83.38,flash-attn3,hf-kernels
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L256,1,1280,24,128,bfloat16,0.3397440016269684,0.3368000090122223,0.3399679958820343,0.34191998839378357,5,90.62,flash-attn3,hf-kernels
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L320,1,1344,24,128,bfloat16,0.36019839644432067,0.3563520014286041,0.3604480028152466,0.36137598752975464,5,95.06,flash-attn3,hf-kernels
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L384,1,1408,24,128,bfloat16,0.37342079877853396,0.3718400001525879,0.37379199266433716,0.3746879994869232,5,99.88,flash-attn3,hf-kernels
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L448,1,1472,24,128,bfloat16,0.4024448037147522,0.3993600010871887,0.4014720022678375,0.4034560024738312,5,103.81,flash-attn3,hf-kernels
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L512,1,1536,24,128,bfloat16,0.4305088043212891,0.4270080029964447,0.4291520118713379,0.4331519901752472,5,109.12,flash-attn3,hf-kernels
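Per this table the "HF Kernels Flash Attn" rows post the lowest p50 latency at every sequence length, with "HF Kernels Flash Attn3" close behind. A small sketch for recovering that ranking from the CSV, assuming latency.csv has been downloaded locally (column names as in the header above):

import csv
from collections import defaultdict

by_workload = defaultdict(dict)
with open("latency.csv", newline="") as f:
    for row in csv.DictReader(f):
        by_workload[row["Workload"]][row["Implementation"]] = float(row["P50 (ms)"])

baseline = "Flash (PyTorch SDPA)"
for wl, impls in sorted(by_workload.items()):
    best = min(impls, key=impls.get)
    print(f"{wl}: fastest={best} ({impls[best]:.3f} ms), "
          f"{impls[baseline] / impls[best]:.2f}x faster than {baseline}")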
flash_attn/results/artifacts/combine/latency.svg ADDED

Git LFS Details

  • SHA256: 7bb668989495a1f179dd65f7426ee2a611f8dc193219b9a7385c79f6701d161a
  • Pointer size: 130 Bytes
  • Size of remote file: 29.8 kB
flash_attn/results/cells/combine.py ADDED
@@ -0,0 +1,319 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch",
+ #     "kernels-benchmark-tools",
+ #     "matplotlib",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+ # ///
+ import os
+ import sys
+ from pathlib import Path
+ import json
+ import torch  # noqa: F401 # imported because upstream may expect torch to be importable
+ import kernels_benchmark_tools as kbt
+
+ # --- Matplotlib setup and helpers ------------------------------------------------
+ import matplotlib as mpl
+ import matplotlib.pyplot as plt
+ import csv
+
+
+ # Keep text as text (not paths) so CSS can style fonts, size, etc.
+ mpl.rcParams["svg.fonttype"] = "none"
+ # Make ids deterministic across builds
+ mpl.rcParams["svg.hashsalt"] = "latency-benchmark-combined"
+ # Avoid auto-closed figures interfering with our tagging
+ mpl.rcParams["figure.autolayout"] = True
+ # Make background transparent
+ mpl.rcParams["figure.facecolor"] = "none"
+ mpl.rcParams["axes.facecolor"] = "none"
+ mpl.rcParams["savefig.facecolor"] = "none"
+ mpl.rcParams["savefig.edgecolor"] = "none"
+
+ def _slugify(s: str) -> str:
+     s = (s or "").strip().lower()
+     keep = []
+     for ch in s:
+         if ch.isalnum():
+             keep.append(ch)
+         elif ch in (" ", "-", "_", "/", ".", ":"):
+             keep.append("-")
+         else:
+             keep.append("")
+     out = "".join(keep)
+     while "--" in out:
+         out = out.replace("--", "-")
+     return out.strip("-") or "unnamed"
+
+ def _tag_current_figure(default_series_prefix="series"):
+     """Attach SVG ids (gid) to key artists so they can be targeted from CSS."""
+     fig = plt.gcf()
+     if fig is None:
+         return
+
+     # Tag the figure itself
+     fig.set_gid("figure--latency")
+
+     for ax_idx, ax in enumerate(fig.get_axes(), start=1):
+         ax.set_gid(f"axes--{ax_idx}")
+
+         # Axis labels & title
+         if ax.get_title():
+             for t in ax.texts:
+                 if t.get_text() == ax.get_title():
+                     t.set_gid("title--main")
+         if ax.xaxis and ax.xaxis.get_label():
+             ax.xaxis.label.set_gid("label--x")
+         if ax.yaxis and ax.yaxis.get_label():
+             ax.yaxis.label.set_gid("label--y")
+
+         # Gridlines
+         for i, gl in enumerate(ax.get_xgridlines(), start=1):
+             gl.set_gid(f"grid-x--{i}")
+         for i, gl in enumerate(ax.get_ygridlines(), start=1):
+             gl.set_gid(f"grid-y--{i}")
+
+         # Legend block & entries
+         leg = ax.get_legend()
+         if leg is not None:
+             leg.set_gid("legend")
+             for i, txt in enumerate(leg.get_texts(), start=1):
+                 label_slug = _slugify(txt.get_text())
+                 txt.set_gid(f"legend-label--{label_slug or i}")
+
+         # Series (lines, patches)
+         # Lines
+         line_seen = {}
+         for ln in getattr(ax, "lines", []):
+             raw_label = ln.get_label() or ""
+             # Matplotlib uses labels beginning with "_" for non-legendable items
+             label = raw_label if not raw_label.startswith("_") else f"{default_series_prefix}"
+             slug = _slugify(label)
+             line_seen[slug] = line_seen.get(slug, 0) + 1
+             suffix = "" if line_seen[slug] == 1 else f"-{line_seen[slug]}"
+             ln.set_gid(f"series--{slug}{suffix}")
+
+         # Patches (bars, areas)
+         patch_seen = {}
+         for pt in getattr(ax, "patches", []):
+             label = getattr(pt, "get_label", lambda: "")() or f"{default_series_prefix}"
+             if isinstance(label, str) and label.startswith("_"):
+                 label = default_series_prefix
+             slug = _slugify(label)
+             patch_seen[slug] = patch_seen.get(slug, 0) + 1
+             suffix = "" if patch_seen[slug] == 1 else f"-{patch_seen[slug]}"
+             pt.set_gid(f"series--{slug}{suffix}")
+
+ def _postprocess_svg_add_classes(svg_path: Path):
+     """Add convenient CSS classes alongside ids (e.g., class='series grid grid-x')."""
+     try:
+         import xml.etree.ElementTree as ET
+         ET.register_namespace("", "http://www.w3.org/2000/svg")
+         tree = ET.parse(svg_path)
+         root = tree.getroot()
+         for el in root.iter():
+             el_id = el.attrib.get("id", "")
+             if not el_id:
+                 continue
+             cls = []
+             if el_id.startswith("figure--"):
+                 cls.append("figure")
+             elif el_id.startswith("axes--"):
+                 cls.append("axes")
+             elif el_id.startswith("grid-x--"):
+                 cls += ["grid", "grid-x"]
+             elif el_id.startswith("grid-y--"):
+                 cls += ["grid", "grid-y"]
+             elif el_id.startswith("legend"):
+                 cls.append("legend")
+             elif el_id.startswith("label--x"):
+                 cls.append("xlabel")
+             elif el_id.startswith("label--y"):
+                 cls.append("ylabel")
+             elif el_id.startswith("title--"):
+                 cls.append("title")
+             elif el_id.startswith("series--"):
+                 cls.append("series")
+             if cls:
+                 # Preserve any existing class (unlikely from Matplotlib)
+                 existing = el.attrib.get("class", "")
+                 el.set("class", (existing + " " + " ".join(cls)).strip())
+         tree.write(svg_path, encoding="utf-8", xml_declaration=True)
+     except Exception as e:
+         print(f"✗ SVG postprocess (classes) skipped: {e}")
+
+ # Monkey-patch savefig to force SVG & ensure tagging occurs even if kbt.viz saves internally.
+ _orig_savefig = plt.savefig
+ def _savefig_svg(fname, *args, **kwargs):
+     # Always save as SVG at a stable path for the artifact system
+     out = Path("latency.svg")
+     kwargs["format"] = "svg"
+     # Ensure everything we care about has ids before export
+     _tag_current_figure()
+     res = _orig_savefig(out, *args, **kwargs)
+     # Add helpful CSS classes on top of ids
+     _postprocess_svg_add_classes(out)
+     print(f"✓ Combined visualization saved as {out}")
+     return res
+
+ plt.savefig = _savefig_svg  # apply patch
+
+ # Capture close calls in case kbt.viz() closes figures before we re-save
+ _orig_close = plt.close
+ _last_closed = {"fig": None}
+ def _capture_close(arg=None):
+     try:
+         if hasattr(arg, "savefig"):  # looks like a Figure
+             _last_closed["fig"] = arg
+         else:
+             _last_closed["fig"] = plt.gcf()
+     finally:
+         return _orig_close(arg)
+ plt.close = _capture_close
+
+ # --- Locate benchmark artifacts --------------------------------------------------
+ cache_dirs = {
+     "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
+     "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
+     "Flash Attn 2": os.environ.get('UVNOTE_FILE_FLASH_ATTN2_BENCHMARK'),
+     "xFormers": os.environ.get('UVNOTE_FILE_XFORMERS_BENCHMARK'),
+     "SageAttention": os.environ.get('UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK'),
+     "Compiled (default)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT'),
+     "Compiled (max-autotune)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE'),
+     "HF Kernels Flash Attn": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK'),
+     "HF Kernels Flash Attn3": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK'),
+ }
+
+ print("LOADING BENCHMARK DATA")
+ for name, cache_dir in cache_dirs.items():
+     print(f"{name:30s}: {cache_dir}")
+ print()
+
+ file_mapping = {
+     "Flash (PyTorch SDPA)": "attn.jsonl",
+     "MemEff (PyTorch SDPA)": "attn.jsonl",
+     "Flash Attn 2": "attn.jsonl",
+     "xFormers": "attn.jsonl",
+     "SageAttention": "attn.jsonl",
+     "Compiled (default)": "attn_default.jsonl",
+     "Compiled (max-autotune)": "attn_max_autotune.jsonl",
+     "HF Kernels Flash Attn": "attn.jsonl",
+     "HF Kernels Flash Attn3": "attn.jsonl",
+ }
+
+ all_paths = []
+ for name, cache_dir in cache_dirs.items():
+     if cache_dir:
+         path = Path(cache_dir) / file_mapping[name]
+         if path.exists() and path.stat().st_size > 0:
+             all_paths.append(str(path))
+             print(f"✓ Found {name}: {path}")
+         else:
+             print(f"⊘ Empty/Missing {name}: {path}")
+     else:
+         print(f"✗ No cache dir for {name}")
+ print()
+
+ if not all_paths:
+     print("ERROR: No benchmark data files found!")
+     # restore patched functions before exiting
+     plt.savefig = _orig_savefig
+     plt.close = _orig_close
+     sys.exit(1)
+
+ # --- Summary + Visualization -----------------------------------------------------
+ print("COMBINED BENCHMARK SUMMARY\n")
+ kbt.summarize(all_paths)
+ print("\nGENERATING COMBINED VISUALIZATION\n")
+
+ try:
+     # If kbt.viz saves internally, our patched savefig ensures SVG gets written,
+     # and it will carry ids/classes for CSS styling.
+     kbt.viz(all_paths)
+     # Safety net: if kbt.viz didn't save, save now.
+     # if not Path("latency.svg").exists():
+     #     _tag_current_figure()
+     #     plt.savefig("latency.svg")
+
+     plt.savefig("latency.svg")  # ensure saved with tagging
+
+     print("✓ SVG visualization ready: latency.svg!")
+ except ImportError as e:
+     print(f"✗ Visualization requires matplotlib: {e}")
+ except Exception as e:
+     print(f"✗ Visualization failed: {e}")
+ finally:
+     # Clean up patches to avoid side effects in later cells
+     plt.savefig = _orig_savefig
+     plt.close = _orig_close
+
+ print()
+ print("ANALYSIS COMPLETE")
+ print(f"Total implementations analyzed: {len(all_paths)}")
+ print(f"\nImplementations included:")
+ for name, cache_dir in cache_dirs.items():
+     if cache_dir:
+         path = Path(cache_dir) / file_mapping[name]
+         if path.exists() and path.stat().st_size > 0:
+             print(f" ✓ {name}")
+
+
+ # Collect all benchmark data and export to CSV
+ all_data = {}
+ for name, cache_dir in cache_dirs.items():
+     if cache_dir:
+         path = Path(cache_dir) / file_mapping[name]
+         if path.exists() and path.stat().st_size > 0:
+             with open(path, 'r') as f:
+                 records = [json.loads(line) for line in f]
+                 all_data[name] = records
+
+ # Export to CSV
+ csv_path = Path("latency.csv")
+ with open(csv_path, 'w', newline='') as csvfile:
+     writer = csv.writer(csvfile)
+
+     # Write header
+     header = ["Implementation", "Impl ID", "Workload", "Batch", "Seq Length", "Heads", "Head Dim", "Dtype",
+               "Mean (ms)", "P10 (ms)", "P50 (ms)", "P90 (ms)", "Reps",
+               # "Compile (ms)",
+               "Peak Mem (MB)", "Backend", "Family"]
+     writer.writerow(header)
+
+     # Write data rows
+     for impl_name, records in all_data.items():
+         for record in records:
+             wl = record.get('wl', {})
+             lat = record.get('lat_ms', {})
+             tags = record.get('tags', {})
+
+             row = [
+                 impl_name,
+                 record.get('impl', ''),
+                 wl.get('name', ''),
+                 wl.get('batch', ''),
+                 wl.get('seq_len', ''),
+                 wl.get('heads', ''),
+                 wl.get('head_dim', ''),
+                 wl.get('dtype', ''),
+                 lat.get('mean', ''),
+                 lat.get('p10', ''),
+                 lat.get('p50', ''),
+                 lat.get('p90', ''),
+                 lat.get('reps', ''),
+                 # record.get('compile_ms', ''),
+                 round(record.get('peak_bytes', 0) / 1024 / 1024, 2) if record.get('peak_bytes') else '',
+                 tags.get('backend', ''),
+                 tags.get('family', ''),
+             ]
+             writer.writerow(row)
+
+ print(f"✓ CSV export complete: {csv_path}")
+ print(f"Total implementations: {len(all_data)}")
+ print(f"Total records: {sum(len(records) for records in all_data.values())}")
flash_attn/results/cells/csv_export.py ADDED
@@ -0,0 +1,76 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ #     "numpy",
+ #     "torch",
+ #     "kernels-benchmark-tools",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+ # ///
+ import os
+ import csv
+ from pathlib import Path
+ import json
+
+ # --- Locate benchmark artifacts --------------------------------------------------
+ cache_dirs = {
+     "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
+     "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
+     "Flash Attn 2": os.environ.get('UVNOTE_FILE_FLASH_ATTN2_BENCHMARK'),
+     "xFormers": os.environ.get('UVNOTE_FILE_XFORMERS_BENCHMARK'),
+     "SageAttention": os.environ.get('UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK'),
+     "Compiled (default)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT'),
+     "Compiled (max-autotune)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE'),
+     "HF Kernels Flash Attn": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK'),
+     "HF Kernels Flash Attn3": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK'),
+ }
+
+ file_mapping = {
+     "Flash (PyTorch SDPA)": "attn.jsonl",
+     "MemEff (PyTorch SDPA)": "attn.jsonl",
+     "Flash Attn 2": "attn.jsonl",
+     "xFormers": "attn.jsonl",
+     "SageAttention": "attn.jsonl",
+     "Compiled (default)": "attn_default.jsonl",
+     "Compiled (max-autotune)": "attn_max_autotune.jsonl",
+     "HF Kernels Flash Attn": "attn.jsonl",
+     "HF Kernels Flash Attn3": "attn.jsonl",
+ }
+
+ # Collect all benchmark data
+ all_data = {}
+ for name, cache_dir in cache_dirs.items():
+     if cache_dir:
+         path = Path(cache_dir) / file_mapping[name]
+         if path.exists() and path.stat().st_size > 0:
+             with open(path, 'r') as f:
+                 records = [json.loads(line) for line in f]
+                 all_data[name] = records
+
+ # Export to CSV
+ csv_path = Path("latency.csv")
+ with open(csv_path, 'w', newline='') as csvfile:
+     writer = csv.writer(csvfile)
+
+     # Write header (column names follow the wl / lat_ms fields in the benchmark JSONL records)
+     header = ["Implementation", "Workload", "Sequence Length", "Mean (ms)", "P10 (ms)", "P50 (ms)", "P90 (ms)"]
+     writer.writerow(header)
+
+     # Write data rows
+     for impl_name, records in all_data.items():
+         for record in records:
+             wl = record.get('wl', {})
+             lat = record.get('lat_ms', {})
+             row = [
+                 impl_name,
+                 wl.get('name', ''),
+                 wl.get('seq_len', ''),
+                 lat.get('mean', ''),
+                 lat.get('p10', ''),
+                 lat.get('p50', ''),
+                 lat.get('p90', ''),
+             ]
+             writer.writerow(row)
+
+ print(f"✓ CSV export complete: {csv_path}")
+ print(f"Total implementations: {len(all_data)}")
+ print(f"Total records: {sum(len(records) for records in all_data.values())}")
flash_attn/results/combined_results.html ADDED
The diff for this file is too large to render. See raw diff
 
flash_attn/results/index.html ADDED
@@ -0,0 +1,88 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+ <title>Index of /flash_attn/results</title>
+ <style>
+ :root {
+ --bg-primary: #0a0a0a;
+ --bg-secondary: #121212;
+ --bg-tertiary: #181818;
+ --text-primary: #e0e0e0;
+ --text-secondary: #888888;
+ --text-link: #64b5f6;
+ --border-primary: #2a2a2a;
+ }
+ body {
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+ background: var(--bg-primary);
+ color: var(--text-primary);
+ margin: 0;
+ padding: 16px;
+ max-width: 900px;
+ margin: 0 auto;
+ }
+ .controls {
+ display: flex;
+ justify-content: flex-end;
+ margin-bottom: 1rem;
+ }
+ .back-button {
+ background: var(--bg-secondary);
+ border: 1px solid var(--border-primary);
+ padding: 8px 12px;
+ border-radius: 4px;
+ color: var(--text-secondary);
+ cursor: pointer;
+ font-size: 0.9rem;
+ text-decoration: none;
+ display: inline-block;
+ }
+ .back-button:hover {
+ color: var(--text-primary);
+ background: var(--bg-tertiary);
+ }
+ h1 {
+ font-size: 1.5em;
+ margin: 1rem 0;
+ color: var(--text-primary);
+ border-bottom: 1px solid var(--border-primary);
+ padding-bottom: 0.5rem;
+ }
+ ul {
+ list-style-type: none;
+ padding: 0;
+ }
+ li {
+ margin: 0;
+ border-bottom: 1px solid var(--border-primary);
+ }
+ li:last-child {
+ border-bottom: none;
+ }
+ a {
+ display: block;
+ padding: 0.75rem 0.5rem;
+ text-decoration: none;
+ color: var(--text-link);
+ transition: background 0.2s ease;
+ }
+ a:hover {
+ background: var(--bg-secondary);
+ }
+ .dir {
+ font-weight: 500;
+ }
+ </style>
+ </head>
+ <body>
+ <div class='controls'>
+ <a href='../index.html' class='back-button'>← back</a>
+ </div>
+ <h1>Index of /flash_attn/results</h1>
+ <ul>
+ <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
+ </ul>
+ </body>
+ </html>
index.html ADDED
@@ -0,0 +1,85 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+ <title>Index of /</title>
+ <style>
+ :root {
+ --bg-primary: #0a0a0a;
+ --bg-secondary: #121212;
+ --bg-tertiary: #181818;
+ --text-primary: #e0e0e0;
+ --text-secondary: #888888;
+ --text-link: #64b5f6;
+ --border-primary: #2a2a2a;
+ }
+ body {
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+ background: var(--bg-primary);
+ color: var(--text-primary);
+ margin: 0;
+ padding: 16px;
+ max-width: 900px;
+ margin: 0 auto;
+ }
+ .controls {
+ display: flex;
+ justify-content: flex-end;
+ margin-bottom: 1rem;
+ }
+ .back-button {
+ background: var(--bg-secondary);
+ border: 1px solid var(--border-primary);
+ padding: 8px 12px;
+ border-radius: 4px;
+ color: var(--text-secondary);
+ cursor: pointer;
+ font-size: 0.9rem;
+ text-decoration: none;
+ display: inline-block;
+ }
+ .back-button:hover {
+ color: var(--text-primary);
+ background: var(--bg-tertiary);
+ }
+ h1 {
+ font-size: 1.5em;
+ margin: 1rem 0;
+ color: var(--text-primary);
+ border-bottom: 1px solid var(--border-primary);
+ padding-bottom: 0.5rem;
+ }
+ ul {
+ list-style-type: none;
+ padding: 0;
+ }
+ li {
+ margin: 0;
+ border-bottom: 1px solid var(--border-primary);
+ }
+ li:last-child {
+ border-bottom: none;
+ }
+ a {
+ display: block;
+ padding: 0.75rem 0.5rem;
+ text-decoration: none;
+ color: var(--text-link);
+ transition: background 0.2s ease;
+ }
+ a:hover {
+ background: var(--bg-secondary);
+ }
+ .dir {
+ font-weight: 500;
+ }
+ </style>
+ </head>
+ <body>
+ <h1>Index of /</h1>
+ <ul>
+ <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
+ </ul>
+ </body>
+ </html>