Upload folder using huggingface_hub
Browse files- flash_attn/impls/artifacts/benchmark/attn.jsonl +6 -6
- flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl +6 -6
- flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl +6 -6
- flash_attn/impls/cells/benchmark.py +19 -16
- flash_attn/impls/compiled_variants.html +61 -47
- flash_attn/impls/flash_attention.html +47 -33
- flash_attn/impls/hf_kernels_flash_attn.html +47 -33
- flash_attn/impls/hf_kernels_flash_attn3.html +46 -32
- flash_attn/impls/index.html +74 -10
- flash_attn/impls/mem_efficient_attention.html +37 -23
- flash_attn/impls/sage_attention.html +49 -36
- flash_attn/impls/xformers.html +41 -27
- flash_attn/index.html +74 -10
- flash_attn/results/artifacts/combine/latency.csv +42 -42
- flash_attn/results/artifacts/combine/latency.svg +2 -2
- flash_attn/results/combined_results.html +0 -0
- flash_attn/results/index.html +74 -10
- index.html +71 -9
flash_attn/impls/artifacts/benchmark/attn.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
| 5 |
-
{"ts": "2025-10-
|
| 6 |
-
{"ts": "2025-10-
|
|
|
|
| 1 |
+
{"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.3603839874267578, "p50": 0.361952006816864, "p90": 0.3624640107154846, "mean": 0.3619711995124817, "reps": 5, "warmup": 2}, "compile_ms": 1.5701119899749756, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.3892799913883209, "p50": 0.3909760117530823, "p90": 0.3922559916973114, "mean": 0.3912447988986969, "reps": 5, "warmup": 2}, "compile_ms": 0.35811200737953186, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5240640044212341, "p50": 0.5248960256576538, "p90": 0.5248960256576538, "mean": 0.5258048176765442, "reps": 5, "warmup": 2}, "compile_ms": 0.4891839921474457, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5265600085258484, "p50": 0.5277760028839111, "p90": 0.5282559990882874, "mean": 0.5276032090187073, "reps": 5, "warmup": 2}, "compile_ms": 0.4968000054359436, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5639039874076843, "p50": 0.5657920241355896, "p90": 0.5668479800224304, "mean": 0.5656383991241455, "reps": 5, "warmup": 2}, "compile_ms": 0.5312319993972778, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5689600110054016, "p50": 0.5698239803314209, "p90": 0.5713919997215271, "mean": 0.5789952039718628, "reps": 5, "warmup": 2}, "compile_ms": 0.5350080132484436, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
| 5 |
-
{"ts": "2025-10-
|
| 6 |
-
{"ts": "2025-10-
|
|
|
|
| 1 |
+
{"ts": "2025-10-02T19:58:18Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5141760110855103, "p50": 0.5175679922103882, "p90": 0.5197759866714478, "mean": 0.5181439876556396, "reps": 5, "warmup": 2}, "compile_ms": 3084.621826171875, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-02T19:58:19Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5549119710922241, "p50": 0.5582720041275024, "p90": 0.5598080158233643, "mean": 0.5579584002494812, "reps": 5, "warmup": 2}, "compile_ms": 270.21795654296875, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-02T19:58:19Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6853119730949402, "p50": 0.687391996383667, "p90": 0.6883519887924194, "mean": 0.6872959971427918, "reps": 5, "warmup": 2}, "compile_ms": 269.78741455078125, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-02T19:58:19Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7128639817237854, "p50": 0.7160959839820862, "p90": 0.7167680263519287, "mean": 0.716153597831726, "reps": 5, "warmup": 2}, "compile_ms": 269.8607177734375, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-02T19:58:19Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7386879920959473, "p50": 0.7400959730148315, "p90": 0.7415040135383606, "mean": 0.7418303966522217, "reps": 5, "warmup": 2}, "compile_ms": 269.20501708984375, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-02T19:58:20Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7708160281181335, "p50": 0.7740799784660339, "p90": 0.7753919959068298, "mean": 0.7745471954345703, "reps": 5, "warmup": 2}, "compile_ms": 270.93829345703125, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
| 5 |
-
{"ts": "2025-10-
|
| 6 |
-
{"ts": "2025-10-
|
|
|
|
| 1 |
+
{"ts": "2025-10-02T19:57:25Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6144000291824341, "p50": 0.6245759725570679, "p90": 0.6483200192451477, "mean": 0.6468096017837525, "reps": 5, "warmup": 2}, "compile_ms": 4407.3388671875, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-02T19:57:27Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6689280271530151, "p50": 0.6851199865341187, "p90": 0.7184960246086121, "mean": 0.7060160160064697, "reps": 5, "warmup": 2}, "compile_ms": 1686.2735595703125, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-02T19:57:29Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7953600287437439, "p50": 0.8155840039253235, "p90": 0.8403519988059998, "mean": 0.8332608103752136, "reps": 5, "warmup": 2}, "compile_ms": 1462.938232421875, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-02T19:57:31Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8470720052719116, "p50": 0.849727988243103, "p90": 0.8745279908180237, "mean": 0.8719295978546142, "reps": 5, "warmup": 2}, "compile_ms": 1689.3455810546875, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-02T19:57:33Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8677120208740234, "p50": 0.8835520148277283, "p90": 0.9034240245819092, "mean": 0.9034304022789001, "reps": 5, "warmup": 2}, "compile_ms": 1693.035888671875, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-02T19:57:34Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9154239892959595, "p50": 0.9213759899139404, "p90": 0.9359679818153381, "mean": 0.9387519836425782, "reps": 5, "warmup": 2}, "compile_ms": 1689.36279296875, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
flash_attn/impls/cells/benchmark.py
CHANGED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
-
# "
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
@@ -14,32 +14,35 @@ import torch
|
|
| 14 |
import sys
|
| 15 |
import os
|
| 16 |
import kernels_benchmark_tools as kbt
|
| 17 |
-
|
| 18 |
|
|
|
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
return xops.memory_efficient_attention(q, k, v)
|
| 24 |
|
| 25 |
|
| 26 |
kbt.add(
|
| 27 |
-
"
|
| 28 |
-
|
| 29 |
-
tags={"family": "
|
| 30 |
)
|
| 31 |
|
| 32 |
if __name__ == "__main__":
|
| 33 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
# Flux-like workloads
|
| 37 |
-
base = 1024
|
| 38 |
-
flux_sizes =
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
heads = 24 if device == "cuda" else 8
|
| 42 |
-
head_dim = 128 if device == "cuda" else 64
|
| 43 |
|
| 44 |
wl = []
|
| 45 |
for L in flux_sizes:
|
|
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
+
# "kernels",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
|
|
| 14 |
import sys
|
| 15 |
import os
|
| 16 |
import kernels_benchmark_tools as kbt
|
| 17 |
+
from kernels import get_kernel
|
| 18 |
|
| 19 |
+
hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
|
| 20 |
|
| 21 |
+
|
| 22 |
+
def hf_flash_attention3(query, key, value):
|
| 23 |
+
return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
kbt.add(
|
| 27 |
+
"hf_kernels_flash_attn3",
|
| 28 |
+
hf_flash_attention3,
|
| 29 |
+
tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
|
| 30 |
)
|
| 31 |
|
| 32 |
if __name__ == "__main__":
|
| 33 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 34 |
+
|
| 35 |
+
if device == "cpu":
|
| 36 |
+
print("HF Kernels Flash Attention 3 requires CUDA - skipping benchmark")
|
| 37 |
+
sys.exit(0)
|
| 38 |
+
|
| 39 |
+
dtype = "bfloat16"
|
| 40 |
|
| 41 |
# Flux-like workloads
|
| 42 |
+
base = 1024
|
| 43 |
+
flux_sizes = [128, 256, 320, 384, 448, 512]
|
| 44 |
+
heads = 24
|
| 45 |
+
head_dim = 128
|
|
|
|
|
|
|
| 46 |
|
| 47 |
wl = []
|
| 48 |
for L in flux_sizes:
|
flash_attn/impls/compiled_variants.html
CHANGED
|
@@ -101,10 +101,12 @@
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
-
:root[data-ui="monocolor"] .reset-toggle
|
|
|
|
| 105 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 106 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 107 |
-
:root[data-ui="monocolor"] .reset-toggle:hover
|
|
|
|
| 108 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 109 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
@@ -214,7 +216,8 @@
|
|
| 214 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 215 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 216 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 217 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle
|
|
|
|
| 218 |
background: #f6f6f6;
|
| 219 |
border: 1px solid #cccccc;
|
| 220 |
color: #222222;
|
|
@@ -244,7 +247,8 @@
|
|
| 244 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 245 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 246 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 247 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover
|
|
|
|
| 248 |
background: #ededed;
|
| 249 |
border-color: #bbbbbb;
|
| 250 |
color: #000000;
|
|
@@ -302,7 +306,8 @@
|
|
| 302 |
}
|
| 303 |
|
| 304 |
.theme-toggle,
|
| 305 |
-
.reset-toggle
|
|
|
|
| 306 |
background: var(--bg-secondary);
|
| 307 |
border: 1px solid var(--border-primary);
|
| 308 |
padding: 8px 12px;
|
|
@@ -313,9 +318,15 @@
|
|
| 313 |
font-size: 0.9rem;
|
| 314 |
user-select: none;
|
| 315 |
}
|
| 316 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
.theme-toggle:hover,
|
| 318 |
-
.reset-toggle:hover
|
|
|
|
| 319 |
color: var(--text-primary);
|
| 320 |
background: var(--bg-tertiary);
|
| 321 |
}
|
|
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3697 |
<body>
|
| 3698 |
<div class="controls">
|
| 3699 |
<div class="controls-buttons">
|
|
|
|
|
|
|
|
|
|
| 3700 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3701 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3702 |
<div class="menu-button" onclick="toggleMenu()">
|
|
@@ -3746,7 +3760,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3746 |
<span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
|
| 3747 |
<span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3748 |
</span> |
|
| 3749 |
-
Cell: benchmark_default |
|
| 3750 |
| <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
|
| 3751 |
<button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
|
| 3752 |
<a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3830,7 +3844,7 @@ Cell: benchmark_default | 46.78s
|
|
| 3830 |
</div>
|
| 3831 |
<div id="output-benchmark_default" class="cell-output">
|
| 3832 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3833 |
-
torch_flash_compiled_default flux_L128 0.
|
| 3834 |
torch_flash_compiled_default flux_L256 0.56 True
|
| 3835 |
torch_flash_compiled_default flux_L320 0.69 True
|
| 3836 |
torch_flash_compiled_default flux_L384 0.72 True
|
|
@@ -3841,41 +3855,41 @@ torch_flash_compiled_default flux_L512 0.77 True
|
|
| 3841 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3842 |
<div class="uv-logs-content" style="display: none;">
|
| 3843 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3844 |
-
Downloading
|
| 3845 |
-
Downloading
|
| 3846 |
Downloading networkx (1.9MiB)
|
| 3847 |
Downloading setuptools (1.1MiB)
|
| 3848 |
-
Downloading nvidia-
|
| 3849 |
-
Downloading
|
|
|
|
| 3850 |
Downloading sympy (6.0MiB)
|
| 3851 |
-
Downloading
|
| 3852 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3853 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3854 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3855 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3856 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3857 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
|
|
|
|
|
|
| 3858 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3859 |
-
Downloading
|
| 3860 |
-
Downloading nvidia-
|
| 3861 |
-
Downloading fonttools (4.7MiB)
|
| 3862 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3863 |
-
Downloading kiwisolver (1.4MiB)
|
| 3864 |
Downloading triton (148.3MiB)
|
| 3865 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
| 3866 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3867 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3868 |
Downloading nvidia-cufile-cu12
|
| 3869 |
Downloading kiwisolver
|
| 3870 |
Downloading setuptools
|
| 3871 |
-
Downloading networkx
|
| 3872 |
Downloading fonttools
|
|
|
|
| 3873 |
Downloading pillow
|
| 3874 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3875 |
Downloading nvidia-cuda-cupti-cu12
|
| 3876 |
Downloading matplotlib
|
| 3877 |
-
Downloading sympy
|
| 3878 |
Downloading numpy
|
|
|
|
| 3879 |
Downloading nvidia-nvjitlink-cu12
|
| 3880 |
Downloading nvidia-curand-cu12
|
| 3881 |
Downloading nvidia-cuda-nvrtc-cu12
|
|
@@ -3888,7 +3902,7 @@ Downloading numpy (16.2MiB)
|
|
| 3888 |
Downloading nvidia-cublas-cu12
|
| 3889 |
Downloading nvidia-cudnn-cu12
|
| 3890 |
Downloading torch
|
| 3891 |
-
Installed 37 packages in
|
| 3892 |
</div>
|
| 3893 |
</div>
|
| 3894 |
<div class="cell-artifacts">
|
|
@@ -3906,7 +3920,7 @@ Installed 37 packages in 557ms
|
|
| 3906 |
<span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
|
| 3907 |
<span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3908 |
</span> |
|
| 3909 |
-
Cell: benchmark_max_autotune |
|
| 3910 |
| <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
|
| 3911 |
<button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
|
| 3912 |
<a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3990,39 +4004,39 @@ Cell: benchmark_max_autotune | 53.65s
|
|
| 3990 |
</div>
|
| 3991 |
<div id="output-benchmark_max_autotune" class="cell-output">
|
| 3992 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3993 |
-
torch_flash_compiled_max_autotune flux_L128 0.
|
| 3994 |
-
torch_flash_compiled_max_autotune flux_L256 0.
|
| 3995 |
torch_flash_compiled_max_autotune flux_L320 0.82 True
|
| 3996 |
torch_flash_compiled_max_autotune flux_L384 0.85 True
|
| 3997 |
-
torch_flash_compiled_max_autotune flux_L448 0.
|
| 3998 |
torch_flash_compiled_max_autotune flux_L512 0.92 True
|
| 3999 |
</div>
|
| 4000 |
<div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
|
| 4001 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4002 |
<div class="uv-logs-content" style="display: none;">
|
| 4003 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 4004 |
-
Downloading nvidia-
|
| 4005 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4006 |
-
Downloading sympy (6.0MiB)
|
| 4007 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4008 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4009 |
-
Downloading
|
| 4010 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4011 |
Downloading matplotlib (8.3MiB)
|
|
|
|
|
|
|
|
|
|
| 4012 |
Downloading triton (148.3MiB)
|
| 4013 |
-
Downloading
|
| 4014 |
-
Downloading fonttools (4.7MiB)
|
| 4015 |
Downloading torch (846.9MiB)
|
| 4016 |
-
Downloading nvidia-
|
| 4017 |
-
Downloading nvidia-
|
| 4018 |
-
Downloading kiwisolver (1.4MiB)
|
| 4019 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4020 |
-
Downloading nvidia-
|
| 4021 |
Downloading numpy (16.2MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4022 |
Downloading pillow (6.3MiB)
|
| 4023 |
-
Downloading
|
| 4024 |
Downloading setuptools (1.1MiB)
|
| 4025 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4026 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 4027 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 4028 |
Downloading nvidia-cufile-cu12
|
|
@@ -4042,13 +4056,13 @@ Downloading nvidia-curand-cu12 (60.7MiB)
|
|
| 4042 |
Downloading triton
|
| 4043 |
Downloading nvidia-cufft-cu12
|
| 4044 |
Downloading nvidia-cusolver-cu12
|
| 4045 |
-
Downloading nvidia-cusparselt-cu12
|
| 4046 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 4047 |
Downloading nvidia-nccl-cu12
|
| 4048 |
Downloading nvidia-cublas-cu12
|
| 4049 |
Downloading nvidia-cudnn-cu12
|
| 4050 |
Downloading torch
|
| 4051 |
-
Installed 37 packages in
|
| 4052 |
</div>
|
| 4053 |
</div>
|
| 4054 |
<div class="cell-artifacts">
|
|
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
+
:root[data-ui="monocolor"] .reset-toggle,
|
| 105 |
+
:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 106 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 107 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 108 |
+
:root[data-ui="monocolor"] .reset-toggle:hover,
|
| 109 |
+
:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 111 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 112 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
|
|
| 216 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 217 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 218 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 219 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle,
|
| 220 |
+
:root[data-ui="none"][data-widgets="on"] .back-button {
|
| 221 |
background: #f6f6f6;
|
| 222 |
border: 1px solid #cccccc;
|
| 223 |
color: #222222;
|
|
|
|
| 247 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 248 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 249 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 250 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
|
| 251 |
+
:root[data-ui="none"][data-widgets="on"] .back-button:hover {
|
| 252 |
background: #ededed;
|
| 253 |
border-color: #bbbbbb;
|
| 254 |
color: #000000;
|
|
|
|
| 306 |
}
|
| 307 |
|
| 308 |
.theme-toggle,
|
| 309 |
+
.reset-toggle,
|
| 310 |
+
.back-button {
|
| 311 |
background: var(--bg-secondary);
|
| 312 |
border: 1px solid var(--border-primary);
|
| 313 |
padding: 8px 12px;
|
|
|
|
| 318 |
font-size: 0.9rem;
|
| 319 |
user-select: none;
|
| 320 |
}
|
| 321 |
+
|
| 322 |
+
.back-button {
|
| 323 |
+
text-decoration: none;
|
| 324 |
+
display: inline-block;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
.theme-toggle:hover,
|
| 328 |
+
.reset-toggle:hover,
|
| 329 |
+
.back-button:hover {
|
| 330 |
color: var(--text-primary);
|
| 331 |
background: var(--bg-tertiary);
|
| 332 |
}
|
|
|
|
| 3708 |
<body>
|
| 3709 |
<div class="controls">
|
| 3710 |
<div class="controls-buttons">
|
| 3711 |
+
|
| 3712 |
+
<a href="index.html" class="back-button">← back</a>
|
| 3713 |
+
|
| 3714 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3715 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3716 |
<div class="menu-button" onclick="toggleMenu()">
|
|
|
|
| 3760 |
<span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
|
| 3761 |
<span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3762 |
</span> |
|
| 3763 |
+
Cell: benchmark_default | 45.23s
|
| 3764 |
| <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
|
| 3765 |
<button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
|
| 3766 |
<a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3844 |
</div>
|
| 3845 |
<div id="output-benchmark_default" class="cell-output">
|
| 3846 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3847 |
+
torch_flash_compiled_default flux_L128 0.52 True
|
| 3848 |
torch_flash_compiled_default flux_L256 0.56 True
|
| 3849 |
torch_flash_compiled_default flux_L320 0.69 True
|
| 3850 |
torch_flash_compiled_default flux_L384 0.72 True
|
|
|
|
| 3855 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3856 |
<div class="uv-logs-content" style="display: none;">
|
| 3857 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3858 |
+
Downloading pillow (6.3MiB)
|
| 3859 |
+
Downloading numpy (16.2MiB)
|
| 3860 |
Downloading networkx (1.9MiB)
|
| 3861 |
Downloading setuptools (1.1MiB)
|
| 3862 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3863 |
+
Downloading fonttools (4.7MiB)
|
| 3864 |
+
Downloading kiwisolver (1.4MiB)
|
| 3865 |
Downloading sympy (6.0MiB)
|
| 3866 |
+
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
| 3867 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
| 3868 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3869 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3870 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3871 |
+
Downloading matplotlib (8.3MiB)
|
| 3872 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3873 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3874 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
|
|
|
|
|
|
| 3875 |
Downloading triton (148.3MiB)
|
| 3876 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3877 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3878 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3879 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3880 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3881 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3882 |
Downloading nvidia-cufile-cu12
|
| 3883 |
Downloading kiwisolver
|
| 3884 |
Downloading setuptools
|
|
|
|
| 3885 |
Downloading fonttools
|
| 3886 |
+
Downloading networkx
|
| 3887 |
Downloading pillow
|
| 3888 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3889 |
Downloading nvidia-cuda-cupti-cu12
|
| 3890 |
Downloading matplotlib
|
|
|
|
| 3891 |
Downloading numpy
|
| 3892 |
+
Downloading sympy
|
| 3893 |
Downloading nvidia-nvjitlink-cu12
|
| 3894 |
Downloading nvidia-curand-cu12
|
| 3895 |
Downloading nvidia-cuda-nvrtc-cu12
|
|
|
|
| 3902 |
Downloading nvidia-cublas-cu12
|
| 3903 |
Downloading nvidia-cudnn-cu12
|
| 3904 |
Downloading torch
|
| 3905 |
+
Installed 37 packages in 551ms
|
| 3906 |
</div>
|
| 3907 |
</div>
|
| 3908 |
<div class="cell-artifacts">
|
|
|
|
| 3920 |
<span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
|
| 3921 |
<span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3922 |
</span> |
|
| 3923 |
+
Cell: benchmark_max_autotune | 54.06s
|
| 3924 |
| <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
|
| 3925 |
<button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
|
| 3926 |
<a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4004 |
</div>
|
| 4005 |
<div id="output-benchmark_max_autotune" class="cell-output">
|
| 4006 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 4007 |
+
torch_flash_compiled_max_autotune flux_L128 0.62 True
|
| 4008 |
+
torch_flash_compiled_max_autotune flux_L256 0.69 True
|
| 4009 |
torch_flash_compiled_max_autotune flux_L320 0.82 True
|
| 4010 |
torch_flash_compiled_max_autotune flux_L384 0.85 True
|
| 4011 |
+
torch_flash_compiled_max_autotune flux_L448 0.88 True
|
| 4012 |
torch_flash_compiled_max_autotune flux_L512 0.92 True
|
| 4013 |
</div>
|
| 4014 |
<div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
|
| 4015 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4016 |
<div class="uv-logs-content" style="display: none;">
|
| 4017 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 4018 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
|
|
|
| 4019 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4020 |
+
Downloading networkx (1.9MiB)
|
|
|
|
| 4021 |
Downloading matplotlib (8.3MiB)
|
| 4022 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4023 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4024 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4025 |
Downloading triton (148.3MiB)
|
| 4026 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 4027 |
Downloading torch (846.9MiB)
|
| 4028 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4029 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
| 4030 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4031 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4032 |
Downloading numpy (16.2MiB)
|
| 4033 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4034 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4035 |
+
Downloading sympy (6.0MiB)
|
| 4036 |
+
Downloading kiwisolver (1.4MiB)
|
| 4037 |
Downloading pillow (6.3MiB)
|
| 4038 |
+
Downloading fonttools (4.7MiB)
|
| 4039 |
Downloading setuptools (1.1MiB)
|
|
|
|
| 4040 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 4041 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 4042 |
Downloading nvidia-cufile-cu12
|
|
|
|
| 4056 |
Downloading triton
|
| 4057 |
Downloading nvidia-cufft-cu12
|
| 4058 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4059 |
Downloading nvidia-cusparse-cu12
|
| 4060 |
+
Downloading nvidia-cusparselt-cu12
|
| 4061 |
Downloading nvidia-nccl-cu12
|
| 4062 |
Downloading nvidia-cublas-cu12
|
| 4063 |
Downloading nvidia-cudnn-cu12
|
| 4064 |
Downloading torch
|
| 4065 |
+
Installed 37 packages in 513ms
|
| 4066 |
</div>
|
| 4067 |
</div>
|
| 4068 |
<div class="cell-artifacts">
|
flash_attn/impls/flash_attention.html
CHANGED
|
@@ -101,10 +101,12 @@
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
-
:root[data-ui="monocolor"] .reset-toggle
|
|
|
|
| 105 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 106 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 107 |
-
:root[data-ui="monocolor"] .reset-toggle:hover
|
|
|
|
| 108 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 109 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
@@ -214,7 +216,8 @@
|
|
| 214 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 215 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 216 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 217 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle
|
|
|
|
| 218 |
background: #f6f6f6;
|
| 219 |
border: 1px solid #cccccc;
|
| 220 |
color: #222222;
|
|
@@ -244,7 +247,8 @@
|
|
| 244 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 245 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 246 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 247 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover
|
|
|
|
| 248 |
background: #ededed;
|
| 249 |
border-color: #bbbbbb;
|
| 250 |
color: #000000;
|
|
@@ -302,7 +306,8 @@
|
|
| 302 |
}
|
| 303 |
|
| 304 |
.theme-toggle,
|
| 305 |
-
.reset-toggle
|
|
|
|
| 306 |
background: var(--bg-secondary);
|
| 307 |
border: 1px solid var(--border-primary);
|
| 308 |
padding: 8px 12px;
|
|
@@ -313,9 +318,15 @@
|
|
| 313 |
font-size: 0.9rem;
|
| 314 |
user-select: none;
|
| 315 |
}
|
| 316 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
.theme-toggle:hover,
|
| 318 |
-
.reset-toggle:hover
|
|
|
|
| 319 |
color: var(--text-primary);
|
| 320 |
background: var(--bg-tertiary);
|
| 321 |
}
|
|
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3697 |
<body>
|
| 3698 |
<div class="controls">
|
| 3699 |
<div class="controls-buttons">
|
|
|
|
|
|
|
|
|
|
| 3700 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3701 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3702 |
<div class="menu-button" onclick="toggleMenu()">
|
|
@@ -3745,7 +3759,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3745 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3746 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3747 |
</span> |
|
| 3748 |
-
Cell: nv | 0.
|
| 3749 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3750 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3751 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3761,7 +3775,7 @@ Cell: nv | 0.70s
|
|
| 3761 |
</div>
|
| 3762 |
</div>
|
| 3763 |
<div id="output-nv" class="cell-output">
|
| 3764 |
-
<div class="cell-stdout">Thu Oct 2
|
| 3765 |
+-----------------------------------------------------------------------------------------+
|
| 3766 |
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3767 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3770,19 +3784,19 @@ Cell: nv | 0.70s
|
|
| 3770 |
| | | MIG M. |
|
| 3771 |
|=========================================+========================+======================|
|
| 3772 |
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3773 |
-
| 0%
|
| 3774 |
| | | N/A |
|
| 3775 |
+-----------------------------------------+------------------------+----------------------+
|
| 3776 |
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3777 |
-
| 0%
|
| 3778 |
| | | N/A |
|
| 3779 |
+-----------------------------------------+------------------------+----------------------+
|
| 3780 |
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3781 |
-
| 0%
|
| 3782 |
| | | N/A |
|
| 3783 |
+-----------------------------------------+------------------------+----------------------+
|
| 3784 |
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3785 |
-
| 0%
|
| 3786 |
| | | N/A |
|
| 3787 |
+-----------------------------------------+------------------------+----------------------+
|
| 3788 |
|
|
@@ -3806,7 +3820,7 @@ Cell: nv | 0.70s
|
|
| 3806 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3807 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3808 |
</span> |
|
| 3809 |
-
Cell: benchmark |
|
| 3810 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3811 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3812 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3886,7 +3900,7 @@ Cell: benchmark | 36.63s
|
|
| 3886 |
</div>
|
| 3887 |
<div id="output-benchmark" class="cell-output">
|
| 3888 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3889 |
-
torch_flash_ma flux_L128 0.
|
| 3890 |
torch_flash_ma flux_L256 0.52 True
|
| 3891 |
torch_flash_ma flux_L320 0.65 True
|
| 3892 |
torch_flash_ma flux_L384 0.68 True
|
|
@@ -3897,35 +3911,35 @@ torch_flash_ma flux_L512 0.74 True
|
|
| 3897 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3898 |
<div class="uv-logs-content" style="display: none;">
|
| 3899 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3900 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3901 |
Downloading networkx (1.9MiB)
|
| 3902 |
-
Downloading kiwisolver (1.4MiB)
|
| 3903 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3904 |
-
Downloading
|
| 3905 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3906 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3907 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3908 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3909 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
| 3910 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3911 |
-
Downloading pillow (6.3MiB)
|
| 3912 |
-
Downloading numpy (16.2MiB)
|
| 3913 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3914 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3915 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3916 |
Downloading setuptools (1.1MiB)
|
| 3917 |
-
Downloading
|
| 3918 |
-
Downloading
|
| 3919 |
-
Downloading
|
| 3920 |
-
Downloading torch (846.9MiB)
|
| 3921 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
|
|
|
| 3922 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3923 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3924 |
Downloading nvidia-cufile-cu12
|
| 3925 |
Downloading kiwisolver
|
| 3926 |
Downloading setuptools
|
| 3927 |
-
Downloading networkx
|
| 3928 |
Downloading fonttools
|
|
|
|
| 3929 |
Downloading pillow
|
| 3930 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3931 |
Downloading nvidia-cuda-cupti-cu12
|
|
@@ -3941,10 +3955,10 @@ Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
| 3941 |
Downloading nvidia-cusparselt-cu12
|
| 3942 |
Downloading nvidia-cusparse-cu12
|
| 3943 |
Downloading nvidia-nccl-cu12
|
| 3944 |
-
Downloading nvidia-cudnn-cu12
|
| 3945 |
Downloading nvidia-cublas-cu12
|
|
|
|
| 3946 |
Downloading torch
|
| 3947 |
-
Installed 37 packages in
|
| 3948 |
</div>
|
| 3949 |
</div>
|
| 3950 |
<div class="cell-artifacts">
|
|
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
+
:root[data-ui="monocolor"] .reset-toggle,
|
| 105 |
+
:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 106 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 107 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 108 |
+
:root[data-ui="monocolor"] .reset-toggle:hover,
|
| 109 |
+
:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 111 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 112 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
|
|
| 216 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 217 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 218 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 219 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle,
|
| 220 |
+
:root[data-ui="none"][data-widgets="on"] .back-button {
|
| 221 |
background: #f6f6f6;
|
| 222 |
border: 1px solid #cccccc;
|
| 223 |
color: #222222;
|
|
|
|
| 247 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 248 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 249 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 250 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
|
| 251 |
+
:root[data-ui="none"][data-widgets="on"] .back-button:hover {
|
| 252 |
background: #ededed;
|
| 253 |
border-color: #bbbbbb;
|
| 254 |
color: #000000;
|
|
|
|
| 306 |
}
|
| 307 |
|
| 308 |
.theme-toggle,
|
| 309 |
+
.reset-toggle,
|
| 310 |
+
.back-button {
|
| 311 |
background: var(--bg-secondary);
|
| 312 |
border: 1px solid var(--border-primary);
|
| 313 |
padding: 8px 12px;
|
|
|
|
| 318 |
font-size: 0.9rem;
|
| 319 |
user-select: none;
|
| 320 |
}
|
| 321 |
+
|
| 322 |
+
.back-button {
|
| 323 |
+
text-decoration: none;
|
| 324 |
+
display: inline-block;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
.theme-toggle:hover,
|
| 328 |
+
.reset-toggle:hover,
|
| 329 |
+
.back-button:hover {
|
| 330 |
color: var(--text-primary);
|
| 331 |
background: var(--bg-tertiary);
|
| 332 |
}
|
|
|
|
| 3708 |
<body>
|
| 3709 |
<div class="controls">
|
| 3710 |
<div class="controls-buttons">
|
| 3711 |
+
|
| 3712 |
+
<a href="index.html" class="back-button">← back</a>
|
| 3713 |
+
|
| 3714 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3715 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3716 |
<div class="menu-button" onclick="toggleMenu()">
|
|
|
|
| 3759 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3760 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3761 |
</span> |
|
| 3762 |
+
Cell: nv | 0.67s
|
| 3763 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3764 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3765 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3775 |
</div>
|
| 3776 |
</div>
|
| 3777 |
<div id="output-nv" class="cell-output">
|
| 3778 |
+
<div class="cell-stdout">Thu Oct 2 19:58:23 2025
|
| 3779 |
+-----------------------------------------------------------------------------------------+
|
| 3780 |
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3781 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3784 |
| | | MIG M. |
|
| 3785 |
|=========================================+========================+======================|
|
| 3786 |
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3787 |
+
| 0% 37C P0 92W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3788 |
| | | N/A |
|
| 3789 |
+-----------------------------------------+------------------------+----------------------+
|
| 3790 |
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3791 |
+
| 0% 29C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3792 |
| | | N/A |
|
| 3793 |
+-----------------------------------------+------------------------+----------------------+
|
| 3794 |
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3795 |
+
| 0% 29C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3796 |
| | | N/A |
|
| 3797 |
+-----------------------------------------+------------------------+----------------------+
|
| 3798 |
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3799 |
+
| 0% 30C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3800 |
| | | N/A |
|
| 3801 |
+-----------------------------------------+------------------------+----------------------+
|
| 3802 |
|
|
|
|
| 3820 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3821 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3822 |
</span> |
|
| 3823 |
+
Cell: benchmark | 35.41s
|
| 3824 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3825 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3826 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3900 |
</div>
|
| 3901 |
<div id="output-benchmark" class="cell-output">
|
| 3902 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3903 |
+
torch_flash_ma flux_L128 0.49 True
|
| 3904 |
torch_flash_ma flux_L256 0.52 True
|
| 3905 |
torch_flash_ma flux_L320 0.65 True
|
| 3906 |
torch_flash_ma flux_L384 0.68 True
|
|
|
|
| 3911 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3912 |
<div class="uv-logs-content" style="display: none;">
|
| 3913 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3914 |
+
Downloading triton (148.3MiB)
|
| 3915 |
+
Downloading numpy (16.2MiB)
|
| 3916 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3917 |
+
Downloading matplotlib (8.3MiB)
|
| 3918 |
+
Downloading sympy (6.0MiB)
|
| 3919 |
+
Downloading fonttools (4.7MiB)
|
| 3920 |
Downloading networkx (1.9MiB)
|
|
|
|
| 3921 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3922 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3923 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3924 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3925 |
+
Downloading torch (846.9MiB)
|
| 3926 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
|
|
|
| 3927 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
| 3928 |
Downloading setuptools (1.1MiB)
|
| 3929 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3930 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3931 |
+
Downloading kiwisolver (1.4MiB)
|
|
|
|
| 3932 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3933 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3934 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3935 |
+
Downloading pillow (6.3MiB)
|
| 3936 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3937 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3938 |
Downloading nvidia-cufile-cu12
|
| 3939 |
Downloading kiwisolver
|
| 3940 |
Downloading setuptools
|
|
|
|
| 3941 |
Downloading fonttools
|
| 3942 |
+
Downloading networkx
|
| 3943 |
Downloading pillow
|
| 3944 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3945 |
Downloading nvidia-cuda-cupti-cu12
|
|
|
|
| 3955 |
Downloading nvidia-cusparselt-cu12
|
| 3956 |
Downloading nvidia-cusparse-cu12
|
| 3957 |
Downloading nvidia-nccl-cu12
|
|
|
|
| 3958 |
Downloading nvidia-cublas-cu12
|
| 3959 |
+
Downloading nvidia-cudnn-cu12
|
| 3960 |
Downloading torch
|
| 3961 |
+
Installed 37 packages in 491ms
|
| 3962 |
</div>
|
| 3963 |
</div>
|
| 3964 |
<div class="cell-artifacts">
|
flash_attn/impls/hf_kernels_flash_attn.html
CHANGED
|
@@ -101,10 +101,12 @@
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
-
:root[data-ui="monocolor"] .reset-toggle
|
|
|
|
| 105 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 106 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 107 |
-
:root[data-ui="monocolor"] .reset-toggle:hover
|
|
|
|
| 108 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 109 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
@@ -214,7 +216,8 @@
|
|
| 214 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 215 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 216 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 217 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle
|
|
|
|
| 218 |
background: #f6f6f6;
|
| 219 |
border: 1px solid #cccccc;
|
| 220 |
color: #222222;
|
|
@@ -244,7 +247,8 @@
|
|
| 244 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 245 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 246 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 247 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover
|
|
|
|
| 248 |
background: #ededed;
|
| 249 |
border-color: #bbbbbb;
|
| 250 |
color: #000000;
|
|
@@ -302,7 +306,8 @@
|
|
| 302 |
}
|
| 303 |
|
| 304 |
.theme-toggle,
|
| 305 |
-
.reset-toggle
|
|
|
|
| 306 |
background: var(--bg-secondary);
|
| 307 |
border: 1px solid var(--border-primary);
|
| 308 |
padding: 8px 12px;
|
|
@@ -313,9 +318,15 @@
|
|
| 313 |
font-size: 0.9rem;
|
| 314 |
user-select: none;
|
| 315 |
}
|
| 316 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
.theme-toggle:hover,
|
| 318 |
-
.reset-toggle:hover
|
|
|
|
| 319 |
color: var(--text-primary);
|
| 320 |
background: var(--bg-tertiary);
|
| 321 |
}
|
|
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3697 |
<body>
|
| 3698 |
<div class="controls">
|
| 3699 |
<div class="controls-buttons">
|
|
|
|
|
|
|
|
|
|
| 3700 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3701 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3702 |
<div class="menu-button" onclick="toggleMenu()">
|
|
@@ -3745,7 +3759,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3745 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3746 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3747 |
</span> |
|
| 3748 |
-
Cell: benchmark |
|
| 3749 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3750 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3751 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3831,40 +3845,40 @@ Cell: benchmark | 39.43s
|
|
| 3831 |
</div>
|
| 3832 |
<div id="output-benchmark" class="cell-output">
|
| 3833 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3834 |
-
hf_kernels_flash_attn flux_L128 0.
|
| 3835 |
hf_kernels_flash_attn flux_L256 0.38 True
|
| 3836 |
hf_kernels_flash_attn flux_L320 0.49 True
|
| 3837 |
-
hf_kernels_flash_attn flux_L384 0.
|
| 3838 |
hf_kernels_flash_attn flux_L448 0.54 True
|
| 3839 |
-
hf_kernels_flash_attn flux_L512 0.
|
| 3840 |
</div>
|
| 3841 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3842 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3843 |
<div class="uv-logs-content" style="display: none;">
|
| 3844 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
|
|
|
| 3845 |
Downloading sympy (6.0MiB)
|
| 3846 |
-
Downloading
|
|
|
|
| 3847 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3848 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3849 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3850 |
Downloading networkx (1.9MiB)
|
| 3851 |
-
Downloading
|
| 3852 |
-
Downloading
|
|
|
|
|
|
|
| 3853 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3854 |
-
Downloading
|
| 3855 |
Downloading triton (148.3MiB)
|
| 3856 |
-
Downloading nvidia-
|
|
|
|
| 3857 |
Downloading numpy (16.2MiB)
|
| 3858 |
-
Downloading nvidia-
|
| 3859 |
-
Downloading
|
| 3860 |
-
Downloading hf-xet (3.0MiB)
|
| 3861 |
-
Downloading pillow (6.3MiB)
|
| 3862 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3863 |
-
Downloading
|
| 3864 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3865 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3866 |
-
Downloading fonttools (4.7MiB)
|
| 3867 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
| 3868 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3869 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3870 |
Downloading nvidia-cufile-cu12
|
|
@@ -3875,29 +3889,29 @@ Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
| 3875 |
Downloading fonttools
|
| 3876 |
Downloading pillow
|
| 3877 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3878 |
-
Downloading matplotlib
|
| 3879 |
Downloading nvidia-cuda-cupti-cu12
|
| 3880 |
-
Downloading
|
| 3881 |
Downloading sympy
|
|
|
|
| 3882 |
Downloading nvidia-nvjitlink-cu12
|
| 3883 |
Downloading nvidia-curand-cu12
|
| 3884 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 3885 |
Downloading triton
|
| 3886 |
Downloading nvidia-cufft-cu12
|
| 3887 |
Downloading nvidia-cusolver-cu12
|
| 3888 |
-
Downloading nvidia-cusparse-cu12
|
| 3889 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 3890 |
Downloading nvidia-nccl-cu12
|
| 3891 |
Downloading nvidia-cublas-cu12
|
| 3892 |
Downloading nvidia-cudnn-cu12
|
| 3893 |
Downloading torch
|
| 3894 |
-
Installed 47 packages in
|
| 3895 |
</div>
|
| 3896 |
</div>
|
| 3897 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 3898 |
-
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:03, 5.
|
| 3899 |
-
Fetching 20 files: 10%|█ | 2/20 [00:01<00:
|
| 3900 |
-
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00,
|
| 3901 |
<div class="cell-artifacts">
|
| 3902 |
<h4>Artifacts:</h4>
|
| 3903 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
|
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
+
:root[data-ui="monocolor"] .reset-toggle,
|
| 105 |
+
:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 106 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 107 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 108 |
+
:root[data-ui="monocolor"] .reset-toggle:hover,
|
| 109 |
+
:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 111 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 112 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
|
|
| 216 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 217 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 218 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 219 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle,
|
| 220 |
+
:root[data-ui="none"][data-widgets="on"] .back-button {
|
| 221 |
background: #f6f6f6;
|
| 222 |
border: 1px solid #cccccc;
|
| 223 |
color: #222222;
|
|
|
|
| 247 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 248 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 249 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 250 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
|
| 251 |
+
:root[data-ui="none"][data-widgets="on"] .back-button:hover {
|
| 252 |
background: #ededed;
|
| 253 |
border-color: #bbbbbb;
|
| 254 |
color: #000000;
|
|
|
|
| 306 |
}
|
| 307 |
|
| 308 |
.theme-toggle,
|
| 309 |
+
.reset-toggle,
|
| 310 |
+
.back-button {
|
| 311 |
background: var(--bg-secondary);
|
| 312 |
border: 1px solid var(--border-primary);
|
| 313 |
padding: 8px 12px;
|
|
|
|
| 318 |
font-size: 0.9rem;
|
| 319 |
user-select: none;
|
| 320 |
}
|
| 321 |
+
|
| 322 |
+
.back-button {
|
| 323 |
+
text-decoration: none;
|
| 324 |
+
display: inline-block;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
.theme-toggle:hover,
|
| 328 |
+
.reset-toggle:hover,
|
| 329 |
+
.back-button:hover {
|
| 330 |
color: var(--text-primary);
|
| 331 |
background: var(--bg-tertiary);
|
| 332 |
}
|
|
|
|
| 3708 |
<body>
|
| 3709 |
<div class="controls">
|
| 3710 |
<div class="controls-buttons">
|
| 3711 |
+
|
| 3712 |
+
<a href="index.html" class="back-button">← back</a>
|
| 3713 |
+
|
| 3714 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3715 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3716 |
<div class="menu-button" onclick="toggleMenu()">
|
|
|
|
| 3759 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3760 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3761 |
</span> |
|
| 3762 |
+
Cell: benchmark | 38.65s
|
| 3763 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3764 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3765 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3845 |
</div>
|
| 3846 |
<div id="output-benchmark" class="cell-output">
|
| 3847 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3848 |
+
hf_kernels_flash_attn flux_L128 0.35 True
|
| 3849 |
hf_kernels_flash_attn flux_L256 0.38 True
|
| 3850 |
hf_kernels_flash_attn flux_L320 0.49 True
|
| 3851 |
+
hf_kernels_flash_attn flux_L384 0.52 True
|
| 3852 |
hf_kernels_flash_attn flux_L448 0.54 True
|
| 3853 |
+
hf_kernels_flash_attn flux_L512 0.56 True
|
| 3854 |
</div>
|
| 3855 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3856 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3857 |
<div class="uv-logs-content" style="display: none;">
|
| 3858 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3859 |
+
Downloading kiwisolver (1.4MiB)
|
| 3860 |
Downloading sympy (6.0MiB)
|
| 3861 |
+
Downloading hf-xet (3.0MiB)
|
| 3862 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3863 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
| 3864 |
Downloading networkx (1.9MiB)
|
| 3865 |
+
Downloading pillow (6.3MiB)
|
| 3866 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3867 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3868 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3869 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3870 |
+
Downloading matplotlib (8.3MiB)
|
| 3871 |
Downloading triton (148.3MiB)
|
| 3872 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3873 |
+
Downloading fonttools (4.7MiB)
|
| 3874 |
Downloading numpy (16.2MiB)
|
| 3875 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3876 |
+
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
| 3877 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3878 |
+
Downloading setuptools (1.1MiB)
|
|
|
|
|
|
|
|
|
|
| 3879 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3880 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3881 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3882 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3883 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3884 |
Downloading nvidia-cufile-cu12
|
|
|
|
| 3889 |
Downloading fonttools
|
| 3890 |
Downloading pillow
|
| 3891 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
|
|
|
| 3892 |
Downloading nvidia-cuda-cupti-cu12
|
| 3893 |
+
Downloading matplotlib
|
| 3894 |
Downloading sympy
|
| 3895 |
+
Downloading numpy
|
| 3896 |
Downloading nvidia-nvjitlink-cu12
|
| 3897 |
Downloading nvidia-curand-cu12
|
| 3898 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 3899 |
Downloading triton
|
| 3900 |
Downloading nvidia-cufft-cu12
|
| 3901 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 3902 |
Downloading nvidia-cusparselt-cu12
|
| 3903 |
+
Downloading nvidia-cusparse-cu12
|
| 3904 |
Downloading nvidia-nccl-cu12
|
| 3905 |
Downloading nvidia-cublas-cu12
|
| 3906 |
Downloading nvidia-cudnn-cu12
|
| 3907 |
Downloading torch
|
| 3908 |
+
Installed 47 packages in 527ms
|
| 3909 |
</div>
|
| 3910 |
</div>
|
| 3911 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 3912 |
+
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:03, 5.70it/s]
|
| 3913 |
+
Fetching 20 files: 10%|█ | 2/20 [00:01<00:13, 1.36it/s]
|
| 3914 |
+
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 15.31it/s]</div>
|
| 3915 |
<div class="cell-artifacts">
|
| 3916 |
<h4>Artifacts:</h4>
|
| 3917 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
flash_attn/impls/hf_kernels_flash_attn3.html
CHANGED
|
@@ -101,10 +101,12 @@
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
-
:root[data-ui="monocolor"] .reset-toggle
|
|
|
|
| 105 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 106 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 107 |
-
:root[data-ui="monocolor"] .reset-toggle:hover
|
|
|
|
| 108 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 109 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
@@ -214,7 +216,8 @@
|
|
| 214 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 215 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 216 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 217 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle
|
|
|
|
| 218 |
background: #f6f6f6;
|
| 219 |
border: 1px solid #cccccc;
|
| 220 |
color: #222222;
|
|
@@ -244,7 +247,8 @@
|
|
| 244 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 245 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 246 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 247 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover
|
|
|
|
| 248 |
background: #ededed;
|
| 249 |
border-color: #bbbbbb;
|
| 250 |
color: #000000;
|
|
@@ -302,7 +306,8 @@
|
|
| 302 |
}
|
| 303 |
|
| 304 |
.theme-toggle,
|
| 305 |
-
.reset-toggle
|
|
|
|
| 306 |
background: var(--bg-secondary);
|
| 307 |
border: 1px solid var(--border-primary);
|
| 308 |
padding: 8px 12px;
|
|
@@ -313,9 +318,15 @@
|
|
| 313 |
font-size: 0.9rem;
|
| 314 |
user-select: none;
|
| 315 |
}
|
| 316 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
.theme-toggle:hover,
|
| 318 |
-
.reset-toggle:hover
|
|
|
|
| 319 |
color: var(--text-primary);
|
| 320 |
background: var(--bg-tertiary);
|
| 321 |
}
|
|
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3697 |
<body>
|
| 3698 |
<div class="controls">
|
| 3699 |
<div class="controls-buttons">
|
|
|
|
|
|
|
|
|
|
| 3700 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3701 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3702 |
<div class="menu-button" onclick="toggleMenu()">
|
|
@@ -3745,7 +3759,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3745 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3746 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3747 |
</span> |
|
| 3748 |
-
Cell: benchmark |
|
| 3749 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3750 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3751 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3833,7 +3847,7 @@ Cell: benchmark | 39.41s
|
|
| 3833 |
hf_kernels_flash_attn3 flux_L128 0.36 True
|
| 3834 |
hf_kernels_flash_attn3 flux_L256 0.39 True
|
| 3835 |
hf_kernels_flash_attn3 flux_L320 0.52 True
|
| 3836 |
-
hf_kernels_flash_attn3 flux_L384 0.
|
| 3837 |
hf_kernels_flash_attn3 flux_L448 0.57 True
|
| 3838 |
hf_kernels_flash_attn3 flux_L512 0.57 True
|
| 3839 |
</div>
|
|
@@ -3841,62 +3855,62 @@ hf_kernels_flash_attn3 flux_L512 0.57 True
|
|
| 3841 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3842 |
<div class="uv-logs-content" style="display: none;">
|
| 3843 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3844 |
-
Downloading
|
| 3845 |
-
Downloading
|
| 3846 |
-
Downloading
|
| 3847 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3848 |
-
Downloading
|
| 3849 |
-
Downloading
|
|
|
|
|
|
|
| 3850 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3851 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3852 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3853 |
-
Downloading networkx (1.9MiB)
|
| 3854 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3855 |
-
Downloading sympy (6.0MiB)
|
| 3856 |
Downloading hf-xet (3.0MiB)
|
| 3857 |
-
Downloading nvidia-
|
| 3858 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3859 |
Downloading torch (846.9MiB)
|
| 3860 |
-
Downloading
|
| 3861 |
-
Downloading
|
| 3862 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3863 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3864 |
Downloading fonttools (4.7MiB)
|
| 3865 |
-
Downloading
|
| 3866 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3867 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3868 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3869 |
Downloading nvidia-cufile-cu12
|
| 3870 |
Downloading kiwisolver
|
| 3871 |
Downloading hf-xet
|
| 3872 |
Downloading setuptools
|
| 3873 |
-
Downloading networkx
|
| 3874 |
Downloading fonttools
|
|
|
|
| 3875 |
Downloading pillow
|
| 3876 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3877 |
Downloading nvidia-cuda-cupti-cu12
|
| 3878 |
Downloading matplotlib
|
| 3879 |
-
Downloading sympy
|
| 3880 |
Downloading numpy
|
|
|
|
| 3881 |
Downloading nvidia-nvjitlink-cu12
|
| 3882 |
Downloading nvidia-curand-cu12
|
| 3883 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 3884 |
Downloading triton
|
| 3885 |
Downloading nvidia-cufft-cu12
|
| 3886 |
Downloading nvidia-cusolver-cu12
|
| 3887 |
-
Downloading nvidia-cusparse-cu12
|
| 3888 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 3889 |
Downloading nvidia-nccl-cu12
|
| 3890 |
Downloading nvidia-cublas-cu12
|
| 3891 |
Downloading nvidia-cudnn-cu12
|
| 3892 |
Downloading torch
|
| 3893 |
-
Installed 47 packages in
|
| 3894 |
</div>
|
| 3895 |
</div>
|
| 3896 |
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 3897 |
-
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00,
|
| 3898 |
-
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.
|
| 3899 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.
|
| 3900 |
<div class="cell-artifacts">
|
| 3901 |
<h4>Artifacts:</h4>
|
| 3902 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
|
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
+
:root[data-ui="monocolor"] .reset-toggle,
|
| 105 |
+
:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 106 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 107 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 108 |
+
:root[data-ui="monocolor"] .reset-toggle:hover,
|
| 109 |
+
:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 111 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 112 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
|
|
| 216 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 217 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 218 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 219 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle,
|
| 220 |
+
:root[data-ui="none"][data-widgets="on"] .back-button {
|
| 221 |
background: #f6f6f6;
|
| 222 |
border: 1px solid #cccccc;
|
| 223 |
color: #222222;
|
|
|
|
| 247 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 248 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 249 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 250 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
|
| 251 |
+
:root[data-ui="none"][data-widgets="on"] .back-button:hover {
|
| 252 |
background: #ededed;
|
| 253 |
border-color: #bbbbbb;
|
| 254 |
color: #000000;
|
|
|
|
| 306 |
}
|
| 307 |
|
| 308 |
.theme-toggle,
|
| 309 |
+
.reset-toggle,
|
| 310 |
+
.back-button {
|
| 311 |
background: var(--bg-secondary);
|
| 312 |
border: 1px solid var(--border-primary);
|
| 313 |
padding: 8px 12px;
|
|
|
|
| 318 |
font-size: 0.9rem;
|
| 319 |
user-select: none;
|
| 320 |
}
|
| 321 |
+
|
| 322 |
+
.back-button {
|
| 323 |
+
text-decoration: none;
|
| 324 |
+
display: inline-block;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
.theme-toggle:hover,
|
| 328 |
+
.reset-toggle:hover,
|
| 329 |
+
.back-button:hover {
|
| 330 |
color: var(--text-primary);
|
| 331 |
background: var(--bg-tertiary);
|
| 332 |
}
|
|
|
|
| 3708 |
<body>
|
| 3709 |
<div class="controls">
|
| 3710 |
<div class="controls-buttons">
|
| 3711 |
+
|
| 3712 |
+
<a href="index.html" class="back-button">← back</a>
|
| 3713 |
+
|
| 3714 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3715 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3716 |
<div class="menu-button" onclick="toggleMenu()">
|
|
|
|
| 3759 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3760 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3761 |
</span> |
|
| 3762 |
+
Cell: benchmark | 38.16s
|
| 3763 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3764 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3765 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3847 |
hf_kernels_flash_attn3 flux_L128 0.36 True
|
| 3848 |
hf_kernels_flash_attn3 flux_L256 0.39 True
|
| 3849 |
hf_kernels_flash_attn3 flux_L320 0.52 True
|
| 3850 |
+
hf_kernels_flash_attn3 flux_L384 0.53 True
|
| 3851 |
hf_kernels_flash_attn3 flux_L448 0.57 True
|
| 3852 |
hf_kernels_flash_attn3 flux_L512 0.57 True
|
| 3853 |
</div>
|
|
|
|
| 3855 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3856 |
<div class="uv-logs-content" style="display: none;">
|
| 3857 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3858 |
+
Downloading networkx (1.9MiB)
|
| 3859 |
+
Downloading kiwisolver (1.4MiB)
|
| 3860 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3861 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3862 |
+
Downloading matplotlib (8.3MiB)
|
| 3863 |
+
Downloading setuptools (1.1MiB)
|
| 3864 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3865 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3866 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3867 |
Downloading hf-xet (3.0MiB)
|
| 3868 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3869 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3870 |
Downloading torch (846.9MiB)
|
| 3871 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3872 |
+
Downloading pillow (6.3MiB)
|
| 3873 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
| 3874 |
Downloading fonttools (4.7MiB)
|
| 3875 |
+
Downloading numpy (16.2MiB)
|
| 3876 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3877 |
+
Downloading sympy (6.0MiB)
|
| 3878 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3879 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3880 |
+
Downloading triton (148.3MiB)
|
| 3881 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3882 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3883 |
Downloading nvidia-cufile-cu12
|
| 3884 |
Downloading kiwisolver
|
| 3885 |
Downloading hf-xet
|
| 3886 |
Downloading setuptools
|
|
|
|
| 3887 |
Downloading fonttools
|
| 3888 |
+
Downloading networkx
|
| 3889 |
Downloading pillow
|
| 3890 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3891 |
Downloading nvidia-cuda-cupti-cu12
|
| 3892 |
Downloading matplotlib
|
|
|
|
| 3893 |
Downloading numpy
|
| 3894 |
+
Downloading sympy
|
| 3895 |
Downloading nvidia-nvjitlink-cu12
|
| 3896 |
Downloading nvidia-curand-cu12
|
| 3897 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 3898 |
Downloading triton
|
| 3899 |
Downloading nvidia-cufft-cu12
|
| 3900 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 3901 |
Downloading nvidia-cusparselt-cu12
|
| 3902 |
+
Downloading nvidia-cusparse-cu12
|
| 3903 |
Downloading nvidia-nccl-cu12
|
| 3904 |
Downloading nvidia-cublas-cu12
|
| 3905 |
Downloading nvidia-cudnn-cu12
|
| 3906 |
Downloading torch
|
| 3907 |
+
Installed 47 packages in 565ms
|
| 3908 |
</div>
|
| 3909 |
</div>
|
| 3910 |
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 3911 |
+
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 5.17it/s]
|
| 3912 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.22it/s]
|
| 3913 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.76it/s]</div>
|
| 3914 |
<div class="cell-artifacts">
|
| 3915 |
<h4>Artifacts:</h4>
|
| 3916 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
flash_attn/impls/index.html
CHANGED
|
@@ -2,22 +2,86 @@
|
|
| 2 |
<html>
|
| 3 |
<head>
|
| 4 |
<meta charset='UTF-8'>
|
| 5 |
-
<
|
|
|
|
| 6 |
<style>
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
</style>
|
| 16 |
</head>
|
| 17 |
<body>
|
|
|
|
|
|
|
|
|
|
| 18 |
<h1>Index of /flash_attn/impls</h1>
|
| 19 |
<ul>
|
| 20 |
-
<li><a href='../index.html' class='dir'>../</a></li>
|
| 21 |
<li><a href='compiled_variants.html' class='file'>compiled_variants.html</a></li>
|
| 22 |
<li><a href='flash_attention.html' class='file'>flash_attention.html</a></li>
|
| 23 |
<li><a href='hf_kernels_flash_attn.html' class='file'>hf_kernels_flash_attn.html</a></li>
|
|
|
|
| 2 |
<html>
|
| 3 |
<head>
|
| 4 |
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /flash_attn/impls</title>
|
| 7 |
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
</style>
|
| 78 |
</head>
|
| 79 |
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
<h1>Index of /flash_attn/impls</h1>
|
| 84 |
<ul>
|
|
|
|
| 85 |
<li><a href='compiled_variants.html' class='file'>compiled_variants.html</a></li>
|
| 86 |
<li><a href='flash_attention.html' class='file'>flash_attention.html</a></li>
|
| 87 |
<li><a href='hf_kernels_flash_attn.html' class='file'>hf_kernels_flash_attn.html</a></li>
|
flash_attn/impls/mem_efficient_attention.html
CHANGED
|
@@ -101,10 +101,12 @@
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
-
:root[data-ui="monocolor"] .reset-toggle
|
|
|
|
| 105 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 106 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 107 |
-
:root[data-ui="monocolor"] .reset-toggle:hover
|
|
|
|
| 108 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 109 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
@@ -214,7 +216,8 @@
|
|
| 214 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 215 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 216 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 217 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle
|
|
|
|
| 218 |
background: #f6f6f6;
|
| 219 |
border: 1px solid #cccccc;
|
| 220 |
color: #222222;
|
|
@@ -244,7 +247,8 @@
|
|
| 244 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 245 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 246 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 247 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover
|
|
|
|
| 248 |
background: #ededed;
|
| 249 |
border-color: #bbbbbb;
|
| 250 |
color: #000000;
|
|
@@ -302,7 +306,8 @@
|
|
| 302 |
}
|
| 303 |
|
| 304 |
.theme-toggle,
|
| 305 |
-
.reset-toggle
|
|
|
|
| 306 |
background: var(--bg-secondary);
|
| 307 |
border: 1px solid var(--border-primary);
|
| 308 |
padding: 8px 12px;
|
|
@@ -313,9 +318,15 @@
|
|
| 313 |
font-size: 0.9rem;
|
| 314 |
user-select: none;
|
| 315 |
}
|
| 316 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
.theme-toggle:hover,
|
| 318 |
-
.reset-toggle:hover
|
|
|
|
| 319 |
color: var(--text-primary);
|
| 320 |
background: var(--bg-tertiary);
|
| 321 |
}
|
|
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3697 |
<body>
|
| 3698 |
<div class="controls">
|
| 3699 |
<div class="controls-buttons">
|
|
|
|
|
|
|
|
|
|
| 3700 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3701 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3702 |
<div class="menu-button" onclick="toggleMenu()">
|
|
@@ -3745,7 +3759,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3745 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3746 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3747 |
</span> |
|
| 3748 |
-
Cell: benchmark | 36.
|
| 3749 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3750 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3751 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3838,28 +3852,28 @@ torch_mem_eff flux_L512 0.95 True
|
|
| 3838 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3839 |
<div class="uv-logs-content" style="display: none;">
|
| 3840 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
|
|
|
|
|
|
| 3841 |
Downloading sympy (6.0MiB)
|
| 3842 |
-
Downloading
|
|
|
|
| 3843 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
| 3844 |
Downloading kiwisolver (1.4MiB)
|
| 3845 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3846 |
-
Downloading torch (846.9MiB)
|
| 3847 |
-
Downloading matplotlib (8.3MiB)
|
| 3848 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3849 |
-
Downloading
|
| 3850 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3851 |
-
Downloading networkx (1.9MiB)
|
| 3852 |
-
Downloading numpy (16.2MiB)
|
| 3853 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3854 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3855 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3856 |
-
Downloading
|
|
|
|
| 3857 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3858 |
-
Downloading nvidia-
|
|
|
|
| 3859 |
Downloading fonttools (4.7MiB)
|
| 3860 |
-
Downloading nvidia-
|
| 3861 |
-
Downloading
|
| 3862 |
-
Downloading
|
|
|
|
| 3863 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3864 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3865 |
Downloading nvidia-cufile-cu12
|
|
@@ -3885,7 +3899,7 @@ Downloading triton (148.3MiB)
|
|
| 3885 |
Downloading nvidia-cublas-cu12
|
| 3886 |
Downloading nvidia-cudnn-cu12
|
| 3887 |
Downloading torch
|
| 3888 |
-
Installed 37 packages in
|
| 3889 |
</div>
|
| 3890 |
</div>
|
| 3891 |
<div class="cell-artifacts">
|
|
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
+
:root[data-ui="monocolor"] .reset-toggle,
|
| 105 |
+
:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 106 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 107 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 108 |
+
:root[data-ui="monocolor"] .reset-toggle:hover,
|
| 109 |
+
:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 111 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 112 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
|
|
| 216 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 217 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 218 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 219 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle,
|
| 220 |
+
:root[data-ui="none"][data-widgets="on"] .back-button {
|
| 221 |
background: #f6f6f6;
|
| 222 |
border: 1px solid #cccccc;
|
| 223 |
color: #222222;
|
|
|
|
| 247 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 248 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 249 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 250 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
|
| 251 |
+
:root[data-ui="none"][data-widgets="on"] .back-button:hover {
|
| 252 |
background: #ededed;
|
| 253 |
border-color: #bbbbbb;
|
| 254 |
color: #000000;
|
|
|
|
| 306 |
}
|
| 307 |
|
| 308 |
.theme-toggle,
|
| 309 |
+
.reset-toggle,
|
| 310 |
+
.back-button {
|
| 311 |
background: var(--bg-secondary);
|
| 312 |
border: 1px solid var(--border-primary);
|
| 313 |
padding: 8px 12px;
|
|
|
|
| 318 |
font-size: 0.9rem;
|
| 319 |
user-select: none;
|
| 320 |
}
|
| 321 |
+
|
| 322 |
+
.back-button {
|
| 323 |
+
text-decoration: none;
|
| 324 |
+
display: inline-block;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
.theme-toggle:hover,
|
| 328 |
+
.reset-toggle:hover,
|
| 329 |
+
.back-button:hover {
|
| 330 |
color: var(--text-primary);
|
| 331 |
background: var(--bg-tertiary);
|
| 332 |
}
|
|
|
|
| 3708 |
<body>
|
| 3709 |
<div class="controls">
|
| 3710 |
<div class="controls-buttons">
|
| 3711 |
+
|
| 3712 |
+
<a href="index.html" class="back-button">← back</a>
|
| 3713 |
+
|
| 3714 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3715 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3716 |
<div class="menu-button" onclick="toggleMenu()">
|
|
|
|
| 3759 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3760 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3761 |
</span> |
|
| 3762 |
+
Cell: benchmark | 36.80s
|
| 3763 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3764 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3765 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3852 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3853 |
<div class="uv-logs-content" style="display: none;">
|
| 3854 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3855 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3856 |
+
Downloading numpy (16.2MiB)
|
| 3857 |
Downloading sympy (6.0MiB)
|
| 3858 |
+
Downloading networkx (1.9MiB)
|
| 3859 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3860 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3861 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3862 |
Downloading kiwisolver (1.4MiB)
|
|
|
|
|
|
|
|
|
|
| 3863 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3864 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3865 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3866 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3867 |
+
Downloading triton (148.3MiB)
|
| 3868 |
+
Downloading setuptools (1.1MiB)
|
| 3869 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3870 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3871 |
+
Downloading pillow (6.3MiB)
|
| 3872 |
Downloading fonttools (4.7MiB)
|
| 3873 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3874 |
+
Downloading torch (846.9MiB)
|
| 3875 |
+
Downloading matplotlib (8.3MiB)
|
| 3876 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3877 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3878 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3879 |
Downloading nvidia-cufile-cu12
|
|
|
|
| 3899 |
Downloading nvidia-cublas-cu12
|
| 3900 |
Downloading nvidia-cudnn-cu12
|
| 3901 |
Downloading torch
|
| 3902 |
+
Installed 37 packages in 448ms
|
| 3903 |
</div>
|
| 3904 |
</div>
|
| 3905 |
<div class="cell-artifacts">
|
flash_attn/impls/sage_attention.html
CHANGED
|
@@ -101,10 +101,12 @@
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
-
:root[data-ui="monocolor"] .reset-toggle
|
|
|
|
| 105 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 106 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 107 |
-
:root[data-ui="monocolor"] .reset-toggle:hover
|
|
|
|
| 108 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 109 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
@@ -214,7 +216,8 @@
|
|
| 214 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 215 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 216 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 217 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle
|
|
|
|
| 218 |
background: #f6f6f6;
|
| 219 |
border: 1px solid #cccccc;
|
| 220 |
color: #222222;
|
|
@@ -244,7 +247,8 @@
|
|
| 244 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 245 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 246 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 247 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover
|
|
|
|
| 248 |
background: #ededed;
|
| 249 |
border-color: #bbbbbb;
|
| 250 |
color: #000000;
|
|
@@ -302,7 +306,8 @@
|
|
| 302 |
}
|
| 303 |
|
| 304 |
.theme-toggle,
|
| 305 |
-
.reset-toggle
|
|
|
|
| 306 |
background: var(--bg-secondary);
|
| 307 |
border: 1px solid var(--border-primary);
|
| 308 |
padding: 8px 12px;
|
|
@@ -313,9 +318,15 @@
|
|
| 313 |
font-size: 0.9rem;
|
| 314 |
user-select: none;
|
| 315 |
}
|
| 316 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
.theme-toggle:hover,
|
| 318 |
-
.reset-toggle:hover
|
|
|
|
| 319 |
color: var(--text-primary);
|
| 320 |
background: var(--bg-tertiary);
|
| 321 |
}
|
|
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3697 |
<body>
|
| 3698 |
<div class="controls">
|
| 3699 |
<div class="controls-buttons">
|
|
|
|
|
|
|
|
|
|
| 3700 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3701 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3702 |
<div class="menu-button" onclick="toggleMenu()">
|
|
@@ -3745,7 +3759,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3745 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3746 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3747 |
</span> |
|
| 3748 |
-
Cell: benchmark | 40.
|
| 3749 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3750 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3751 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3839,45 +3853,45 @@ Cell: benchmark | 40.08s
|
|
| 3839 |
<div id="output-benchmark" class="cell-output">
|
| 3840 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3841 |
sage_int8_fp16 flux_L128 FAIL False
|
| 3842 |
-
Error: module '
|
| 3843 |
sage_int8_fp16 flux_L256 FAIL False
|
| 3844 |
-
Error: module '
|
| 3845 |
sage_int8_fp16 flux_L320 FAIL False
|
| 3846 |
-
Error: module '
|
| 3847 |
sage_int8_fp16 flux_L384 FAIL False
|
| 3848 |
-
Error: module '
|
| 3849 |
sage_int8_fp16 flux_L448 FAIL False
|
| 3850 |
-
Error: module '
|
| 3851 |
sage_int8_fp16 flux_L512 FAIL False
|
| 3852 |
-
Error: module '
|
| 3853 |
</div>
|
| 3854 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3855 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3856 |
<div class="uv-logs-content" style="display: none;">
|
| 3857 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3858 |
-
Downloading sympy (6.0MiB)
|
| 3859 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3860 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3861 |
-
Downloading numpy (16.2MiB)
|
| 3862 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3863 |
-
Downloading networkx (1.9MiB)
|
| 3864 |
-
Downloading hf-xet (3.0MiB)
|
| 3865 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3866 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3867 |
-
Downloading pillow (6.3MiB)
|
| 3868 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3869 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3870 |
Downloading setuptools (1.1MiB)
|
| 3871 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
| 3872 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3873 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3874 |
-
Downloading
|
|
|
|
|
|
|
| 3875 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3876 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3877 |
-
Downloading kiwisolver (1.4MiB)
|
| 3878 |
-
Downloading fonttools (4.7MiB)
|
| 3879 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3880 |
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
| 3881 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3882 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3883 |
Downloading nvidia-cufile-cu12
|
|
@@ -3888,8 +3902,8 @@ Downloading torch (846.9MiB)
|
|
| 3888 |
Downloading networkx
|
| 3889 |
Downloading pillow
|
| 3890 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3891 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 3892 |
Downloading matplotlib
|
|
|
|
| 3893 |
Downloading numpy
|
| 3894 |
Downloading sympy
|
| 3895 |
Downloading nvidia-nvjitlink-cu12
|
|
@@ -3898,20 +3912,19 @@ Downloading torch (846.9MiB)
|
|
| 3898 |
Downloading triton
|
| 3899 |
Downloading nvidia-cufft-cu12
|
| 3900 |
Downloading nvidia-cusolver-cu12
|
| 3901 |
-
Downloading nvidia-cusparselt-cu12
|
| 3902 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 3903 |
Downloading nvidia-nccl-cu12
|
| 3904 |
Downloading nvidia-cublas-cu12
|
| 3905 |
Downloading nvidia-cudnn-cu12
|
| 3906 |
Downloading torch
|
| 3907 |
-
Installed 48 packages in
|
| 3908 |
</div>
|
| 3909 |
</div>
|
| 3910 |
<div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
|
| 3911 |
-
Fetching 11 files: 9%|▉ | 1/11 [00:00<00:
|
| 3912 |
-
Fetching 11 files:
|
| 3913 |
-
Fetching 11 files:
|
| 3914 |
-
Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 14.37it/s]</div>
|
| 3915 |
<div class="cell-artifacts">
|
| 3916 |
<h4>Artifacts:</h4>
|
| 3917 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
|
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
+
:root[data-ui="monocolor"] .reset-toggle,
|
| 105 |
+
:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 106 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 107 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 108 |
+
:root[data-ui="monocolor"] .reset-toggle:hover,
|
| 109 |
+
:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 111 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 112 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
|
|
| 216 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 217 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 218 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 219 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle,
|
| 220 |
+
:root[data-ui="none"][data-widgets="on"] .back-button {
|
| 221 |
background: #f6f6f6;
|
| 222 |
border: 1px solid #cccccc;
|
| 223 |
color: #222222;
|
|
|
|
| 247 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 248 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 249 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 250 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
|
| 251 |
+
:root[data-ui="none"][data-widgets="on"] .back-button:hover {
|
| 252 |
background: #ededed;
|
| 253 |
border-color: #bbbbbb;
|
| 254 |
color: #000000;
|
|
|
|
| 306 |
}
|
| 307 |
|
| 308 |
.theme-toggle,
|
| 309 |
+
.reset-toggle,
|
| 310 |
+
.back-button {
|
| 311 |
background: var(--bg-secondary);
|
| 312 |
border: 1px solid var(--border-primary);
|
| 313 |
padding: 8px 12px;
|
|
|
|
| 318 |
font-size: 0.9rem;
|
| 319 |
user-select: none;
|
| 320 |
}
|
| 321 |
+
|
| 322 |
+
.back-button {
|
| 323 |
+
text-decoration: none;
|
| 324 |
+
display: inline-block;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
.theme-toggle:hover,
|
| 328 |
+
.reset-toggle:hover,
|
| 329 |
+
.back-button:hover {
|
| 330 |
color: var(--text-primary);
|
| 331 |
background: var(--bg-tertiary);
|
| 332 |
}
|
|
|
|
| 3708 |
<body>
|
| 3709 |
<div class="controls">
|
| 3710 |
<div class="controls-buttons">
|
| 3711 |
+
|
| 3712 |
+
<a href="index.html" class="back-button">← back</a>
|
| 3713 |
+
|
| 3714 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3715 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3716 |
<div class="menu-button" onclick="toggleMenu()">
|
|
|
|
| 3759 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3760 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3761 |
</span> |
|
| 3762 |
+
Cell: benchmark | 40.58s
|
| 3763 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3764 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3765 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3853 |
<div id="output-benchmark" class="cell-output">
|
| 3854 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3855 |
sage_int8_fp16 flux_L128 FAIL False
|
| 3856 |
+
Error: module 'sage_attention_1863f4c92418f0f6' has no attribute 'fwd'
|
| 3857 |
sage_int8_fp16 flux_L256 FAIL False
|
| 3858 |
+
Error: module 'sage_attention_1863f4c92418f0f6' has no attribute 'fwd'
|
| 3859 |
sage_int8_fp16 flux_L320 FAIL False
|
| 3860 |
+
Error: module 'sage_attention_1863f4c92418f0f6' has no attribute 'fwd'
|
| 3861 |
sage_int8_fp16 flux_L384 FAIL False
|
| 3862 |
+
Error: module 'sage_attention_1863f4c92418f0f6' has no attribute 'fwd'
|
| 3863 |
sage_int8_fp16 flux_L448 FAIL False
|
| 3864 |
+
Error: module 'sage_attention_1863f4c92418f0f6' has no attribute 'fwd'
|
| 3865 |
sage_int8_fp16 flux_L512 FAIL False
|
| 3866 |
+
Error: module 'sage_attention_1863f4c92418f0f6' has no attribute 'fwd'
|
| 3867 |
</div>
|
| 3868 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3869 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3870 |
<div class="uv-logs-content" style="display: none;">
|
| 3871 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3872 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
|
|
|
| 3873 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 3874 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3875 |
+
Downloading pillow (6.3MiB)
|
| 3876 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3877 |
+
Downloading hf-xet (3.0MiB)
|
| 3878 |
+
Downloading networkx (1.9MiB)
|
| 3879 |
+
Downloading numpy (16.2MiB)
|
| 3880 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3881 |
Downloading setuptools (1.1MiB)
|
| 3882 |
+
Downloading kiwisolver (1.4MiB)
|
| 3883 |
+
Downloading matplotlib (8.3MiB)
|
| 3884 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3885 |
+
Downloading fonttools (4.7MiB)
|
| 3886 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3887 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3888 |
+
Downloading triton (148.3MiB)
|
| 3889 |
+
Downloading sympy (6.0MiB)
|
| 3890 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3891 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3892 |
Downloading torch (846.9MiB)
|
| 3893 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3894 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3895 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3896 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3897 |
Downloading nvidia-cufile-cu12
|
|
|
|
| 3902 |
Downloading networkx
|
| 3903 |
Downloading pillow
|
| 3904 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
|
|
|
| 3905 |
Downloading matplotlib
|
| 3906 |
+
Downloading nvidia-cuda-cupti-cu12
|
| 3907 |
Downloading numpy
|
| 3908 |
Downloading sympy
|
| 3909 |
Downloading nvidia-nvjitlink-cu12
|
|
|
|
| 3912 |
Downloading triton
|
| 3913 |
Downloading nvidia-cufft-cu12
|
| 3914 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 3915 |
Downloading nvidia-cusparse-cu12
|
| 3916 |
+
Downloading nvidia-cusparselt-cu12
|
| 3917 |
Downloading nvidia-nccl-cu12
|
| 3918 |
Downloading nvidia-cublas-cu12
|
| 3919 |
Downloading nvidia-cudnn-cu12
|
| 3920 |
Downloading torch
|
| 3921 |
+
Installed 48 packages in 591ms
|
| 3922 |
</div>
|
| 3923 |
</div>
|
| 3924 |
<div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
|
| 3925 |
+
Fetching 11 files: 9%|▉ | 1/11 [00:00<00:01, 5.59it/s]
|
| 3926 |
+
Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 12.79it/s]
|
| 3927 |
+
Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 16.77it/s]</div>
|
|
|
|
| 3928 |
<div class="cell-artifacts">
|
| 3929 |
<h4>Artifacts:</h4>
|
| 3930 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
flash_attn/impls/xformers.html
CHANGED
|
@@ -101,10 +101,12 @@
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
-
:root[data-ui="monocolor"] .reset-toggle
|
|
|
|
| 105 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 106 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 107 |
-
:root[data-ui="monocolor"] .reset-toggle:hover
|
|
|
|
| 108 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 109 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
@@ -214,7 +216,8 @@
|
|
| 214 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 215 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 216 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 217 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle
|
|
|
|
| 218 |
background: #f6f6f6;
|
| 219 |
border: 1px solid #cccccc;
|
| 220 |
color: #222222;
|
|
@@ -244,7 +247,8 @@
|
|
| 244 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 245 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 246 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 247 |
-
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover
|
|
|
|
| 248 |
background: #ededed;
|
| 249 |
border-color: #bbbbbb;
|
| 250 |
color: #000000;
|
|
@@ -302,7 +306,8 @@
|
|
| 302 |
}
|
| 303 |
|
| 304 |
.theme-toggle,
|
| 305 |
-
.reset-toggle
|
|
|
|
| 306 |
background: var(--bg-secondary);
|
| 307 |
border: 1px solid var(--border-primary);
|
| 308 |
padding: 8px 12px;
|
|
@@ -313,9 +318,15 @@
|
|
| 313 |
font-size: 0.9rem;
|
| 314 |
user-select: none;
|
| 315 |
}
|
| 316 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
.theme-toggle:hover,
|
| 318 |
-
.reset-toggle:hover
|
|
|
|
| 319 |
color: var(--text-primary);
|
| 320 |
background: var(--bg-tertiary);
|
| 321 |
}
|
|
@@ -3697,6 +3708,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3697 |
<body>
|
| 3698 |
<div class="controls">
|
| 3699 |
<div class="controls-buttons">
|
|
|
|
|
|
|
|
|
|
| 3700 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3701 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3702 |
<div class="menu-button" onclick="toggleMenu()">
|
|
@@ -3745,7 +3759,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3745 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3746 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3747 |
</span> |
|
| 3748 |
-
Cell: benchmark |
|
| 3749 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3750 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3751 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3838,38 +3852,38 @@ xformers_meff flux_L512 0.65 True
|
|
| 3838 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3839 |
<div class="uv-logs-content" style="display: none;">
|
| 3840 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3841 |
-
Downloading
|
| 3842 |
-
Downloading
|
| 3843 |
-
Downloading
|
| 3844 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3845 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3846 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
| 3847 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3848 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3849 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3850 |
-
Downloading
|
| 3851 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3852 |
Downloading setuptools (1.1MiB)
|
| 3853 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3854 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3855 |
-
Downloading networkx (1.9MiB)
|
| 3856 |
-
Downloading kiwisolver (1.4MiB)
|
| 3857 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3858 |
Downloading torch (846.9MiB)
|
| 3859 |
Downloading matplotlib (8.3MiB)
|
| 3860 |
-
Downloading triton (148.3MiB)
|
| 3861 |
-
Downloading sympy (6.0MiB)
|
| 3862 |
Downloading fonttools (4.7MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3863 |
Downloading xformers (111.8MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3864 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3865 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3866 |
Downloading nvidia-cufile-cu12
|
| 3867 |
Downloading kiwisolver
|
| 3868 |
Downloading setuptools
|
| 3869 |
-
Downloading fonttools
|
| 3870 |
Downloading networkx
|
| 3871 |
-
Downloading
|
| 3872 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
|
|
|
| 3873 |
Downloading nvidia-cuda-cupti-cu12
|
| 3874 |
Downloading matplotlib
|
| 3875 |
Downloading sympy
|
|
@@ -3884,10 +3898,10 @@ Downloading xformers (111.8MiB)
|
|
| 3884 |
Downloading nvidia-cusparselt-cu12
|
| 3885 |
Downloading nvidia-cusparse-cu12
|
| 3886 |
Downloading nvidia-nccl-cu12
|
| 3887 |
-
Downloading nvidia-cublas-cu12
|
| 3888 |
Downloading nvidia-cudnn-cu12
|
|
|
|
| 3889 |
Downloading torch
|
| 3890 |
-
Installed 38 packages in
|
| 3891 |
</div>
|
| 3892 |
</div>
|
| 3893 |
<div class="cell-artifacts">
|
|
|
|
| 101 |
:root[data-ui="monocolor"] a { color: var(--mono-color); }
|
| 102 |
:root[data-ui="monocolor"] .menu-button,
|
| 103 |
:root[data-ui="monocolor"] .theme-toggle,
|
| 104 |
+
:root[data-ui="monocolor"] .reset-toggle,
|
| 105 |
+
:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 106 |
:root[data-ui="monocolor"] .menu-button:hover,
|
| 107 |
:root[data-ui="monocolor"] .theme-toggle:hover,
|
| 108 |
+
:root[data-ui="monocolor"] .reset-toggle:hover,
|
| 109 |
+
:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); }
|
| 110 |
:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; }
|
| 111 |
:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); }
|
| 112 |
:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); }
|
|
|
|
| 216 |
/* Keep default control styling when widgets are enabled, even in minimal UI */
|
| 217 |
:root[data-ui="none"][data-widgets="on"] .menu-button,
|
| 218 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle,
|
| 219 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle,
|
| 220 |
+
:root[data-ui="none"][data-widgets="on"] .back-button {
|
| 221 |
background: #f6f6f6;
|
| 222 |
border: 1px solid #cccccc;
|
| 223 |
color: #222222;
|
|
|
|
| 247 |
:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; }
|
| 248 |
:root[data-ui="none"][data-widgets="on"] .menu-button:hover,
|
| 249 |
:root[data-ui="none"][data-widgets="on"] .theme-toggle:hover,
|
| 250 |
+
:root[data-ui="none"][data-widgets="on"] .reset-toggle:hover,
|
| 251 |
+
:root[data-ui="none"][data-widgets="on"] .back-button:hover {
|
| 252 |
background: #ededed;
|
| 253 |
border-color: #bbbbbb;
|
| 254 |
color: #000000;
|
|
|
|
| 306 |
}
|
| 307 |
|
| 308 |
.theme-toggle,
|
| 309 |
+
.reset-toggle,
|
| 310 |
+
.back-button {
|
| 311 |
background: var(--bg-secondary);
|
| 312 |
border: 1px solid var(--border-primary);
|
| 313 |
padding: 8px 12px;
|
|
|
|
| 318 |
font-size: 0.9rem;
|
| 319 |
user-select: none;
|
| 320 |
}
|
| 321 |
+
|
| 322 |
+
.back-button {
|
| 323 |
+
text-decoration: none;
|
| 324 |
+
display: inline-block;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
.theme-toggle:hover,
|
| 328 |
+
.reset-toggle:hover,
|
| 329 |
+
.back-button:hover {
|
| 330 |
color: var(--text-primary);
|
| 331 |
background: var(--bg-tertiary);
|
| 332 |
}
|
|
|
|
| 3708 |
<body>
|
| 3709 |
<div class="controls">
|
| 3710 |
<div class="controls-buttons">
|
| 3711 |
+
|
| 3712 |
+
<a href="index.html" class="back-button">← back</a>
|
| 3713 |
+
|
| 3714 |
<div class="theme-toggle" onclick="toggleTheme()">light</div>
|
| 3715 |
<div class="reset-toggle" onclick="resetLayout()">reset</div>
|
| 3716 |
<div class="menu-button" onclick="toggleMenu()">
|
|
|
|
| 3759 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3760 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3761 |
</span> |
|
| 3762 |
+
Cell: benchmark | 42.08s
|
| 3763 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3764 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3765 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3852 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3853 |
<div class="uv-logs-content" style="display: none;">
|
| 3854 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3855 |
+
Downloading numpy (16.2MiB)
|
| 3856 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3857 |
+
Downloading networkx (1.9MiB)
|
|
|
|
|
|
|
| 3858 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3859 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3860 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
| 3861 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3862 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
| 3863 |
Downloading setuptools (1.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3864 |
Downloading torch (846.9MiB)
|
| 3865 |
Downloading matplotlib (8.3MiB)
|
|
|
|
|
|
|
| 3866 |
Downloading fonttools (4.7MiB)
|
| 3867 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3868 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3869 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3870 |
+
Downloading kiwisolver (1.4MiB)
|
| 3871 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3872 |
Downloading xformers (111.8MiB)
|
| 3873 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3874 |
+
Downloading pillow (6.3MiB)
|
| 3875 |
+
Downloading sympy (6.0MiB)
|
| 3876 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3877 |
+
Downloading triton (148.3MiB)
|
| 3878 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3879 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3880 |
Downloading nvidia-cufile-cu12
|
| 3881 |
Downloading kiwisolver
|
| 3882 |
Downloading setuptools
|
|
|
|
| 3883 |
Downloading networkx
|
| 3884 |
+
Downloading fonttools
|
| 3885 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3886 |
+
Downloading pillow
|
| 3887 |
Downloading nvidia-cuda-cupti-cu12
|
| 3888 |
Downloading matplotlib
|
| 3889 |
Downloading sympy
|
|
|
|
| 3898 |
Downloading nvidia-cusparselt-cu12
|
| 3899 |
Downloading nvidia-cusparse-cu12
|
| 3900 |
Downloading nvidia-nccl-cu12
|
|
|
|
| 3901 |
Downloading nvidia-cudnn-cu12
|
| 3902 |
+
Downloading nvidia-cublas-cu12
|
| 3903 |
Downloading torch
|
| 3904 |
+
Installed 38 packages in 541ms
|
| 3905 |
</div>
|
| 3906 |
</div>
|
| 3907 |
<div class="cell-artifacts">
|
flash_attn/index.html
CHANGED
|
@@ -2,22 +2,86 @@
|
|
| 2 |
<html>
|
| 3 |
<head>
|
| 4 |
<meta charset='UTF-8'>
|
| 5 |
-
<
|
|
|
|
| 6 |
<style>
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
</style>
|
| 16 |
</head>
|
| 17 |
<body>
|
|
|
|
|
|
|
|
|
|
| 18 |
<h1>Index of /flash_attn</h1>
|
| 19 |
<ul>
|
| 20 |
-
<li><a href='../index.html' class='dir'>../</a></li>
|
| 21 |
<li><a href='impls/index.html' class='dir'>impls/</a></li>
|
| 22 |
<li><a href='results/index.html' class='dir'>results/</a></li>
|
| 23 |
</ul>
|
|
|
|
| 2 |
<html>
|
| 3 |
<head>
|
| 4 |
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /flash_attn</title>
|
| 7 |
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
</style>
|
| 78 |
</head>
|
| 79 |
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
<h1>Index of /flash_attn</h1>
|
| 84 |
<ul>
|
|
|
|
| 85 |
<li><a href='impls/index.html' class='dir'>impls/</a></li>
|
| 86 |
<li><a href='results/index.html' class='dir'>results/</a></li>
|
| 87 |
</ul>
|
flash_attn/results/artifacts/combine/latency.csv
CHANGED
|
@@ -1,43 +1,43 @@
|
|
| 1 |
Implementation,Impl ID,Workload,Batch,Seq Length,Heads,Head Dim,Dtype,Mean (ms),P10 (ms),P50 (ms),P90 (ms),Reps,Peak Mem (MB),Backend,Family
|
| 2 |
-
Flash (PyTorch SDPA),torch_flash_ma,flux_L128,1,1152,24,128,bfloat16,0.
|
| 3 |
-
Flash (PyTorch SDPA),torch_flash_ma,flux_L256,1,1280,24,128,bfloat16,0.
|
| 4 |
-
Flash (PyTorch SDPA),torch_flash_ma,flux_L320,1,1344,24,128,bfloat16,0.
|
| 5 |
-
Flash (PyTorch SDPA),torch_flash_ma,flux_L384,1,1408,24,128,bfloat16,0.
|
| 6 |
-
Flash (PyTorch SDPA),torch_flash_ma,flux_L448,1,1472,24,128,bfloat16,0.
|
| 7 |
-
Flash (PyTorch SDPA),torch_flash_ma,flux_L512,1,1536,24,128,bfloat16,0.
|
| 8 |
-
MemEff (PyTorch SDPA),torch_mem_eff,flux_L128,1,1152,24,128,bfloat16,0.
|
| 9 |
-
MemEff (PyTorch SDPA),torch_mem_eff,flux_L256,1,1280,24,128,bfloat16,0.
|
| 10 |
-
MemEff (PyTorch SDPA),torch_mem_eff,flux_L320,1,1344,24,128,bfloat16,0.
|
| 11 |
-
MemEff (PyTorch SDPA),torch_mem_eff,flux_L384,1,1408,24,128,bfloat16,0.
|
| 12 |
-
MemEff (PyTorch SDPA),torch_mem_eff,flux_L448,1,1472,24,128,bfloat16,0.
|
| 13 |
-
MemEff (PyTorch SDPA),torch_mem_eff,flux_L512,1,1536,24,128,bfloat16,0.
|
| 14 |
-
xFormers,xformers_meff,flux_L128,1,1152,24,128,bfloat16,0.
|
| 15 |
-
xFormers,xformers_meff,flux_L256,1,1280,24,128,bfloat16,0.
|
| 16 |
-
xFormers,xformers_meff,flux_L320,1,1344,24,128,bfloat16,0.
|
| 17 |
-
xFormers,xformers_meff,flux_L384,1,1408,24,128,bfloat16,0.
|
| 18 |
-
xFormers,xformers_meff,flux_L448,1,1472,24,128,bfloat16,0.
|
| 19 |
-
xFormers,xformers_meff,flux_L512,1,1536,24,128,bfloat16,0.
|
| 20 |
-
Compiled (default),torch_flash_compiled_default,flux_L128,1,1152,24,128,bfloat16,0.
|
| 21 |
-
Compiled (default),torch_flash_compiled_default,flux_L256,1,1280,24,128,bfloat16,0.
|
| 22 |
-
Compiled (default),torch_flash_compiled_default,flux_L320,1,1344,24,128,bfloat16,0.
|
| 23 |
-
Compiled (default),torch_flash_compiled_default,flux_L384,1,1408,24,128,bfloat16,0.
|
| 24 |
-
Compiled (default),torch_flash_compiled_default,flux_L448,1,1472,24,128,bfloat16,0.
|
| 25 |
-
Compiled (default),torch_flash_compiled_default,flux_L512,1,1536,24,128,bfloat16,0.
|
| 26 |
-
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L128,1,1152,24,128,bfloat16,0.
|
| 27 |
-
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L256,1,1280,24,128,bfloat16,0.
|
| 28 |
-
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L320,1,1344,24,128,bfloat16,0.
|
| 29 |
-
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L384,1,1408,24,128,bfloat16,0.
|
| 30 |
-
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L448,1,1472,24,128,bfloat16,0.
|
| 31 |
-
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L512,1,1536,24,128,bfloat16,0.
|
| 32 |
-
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L128,1,1152,24,128,bfloat16,0.
|
| 33 |
-
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L256,1,1280,24,128,bfloat16,0.
|
| 34 |
-
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L320,1,1344,24,128,bfloat16,0.
|
| 35 |
-
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L384,1,1408,24,128,bfloat16,0.
|
| 36 |
-
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L448,1,1472,24,128,bfloat16,0.
|
| 37 |
-
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L512,1,1536,24,128,bfloat16,0.
|
| 38 |
-
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L128,1,1152,24,128,bfloat16,0.
|
| 39 |
-
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L256,1,1280,24,128,bfloat16,0.
|
| 40 |
-
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L320,1,1344,24,128,bfloat16,0.
|
| 41 |
-
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L384,1,1408,24,128,bfloat16,0.
|
| 42 |
-
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L448,1,1472,24,128,bfloat16,0.
|
| 43 |
-
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L512,1,1536,24,128,bfloat16,0.
|
|
|
|
| 1 |
Implementation,Impl ID,Workload,Batch,Seq Length,Heads,Head Dim,Dtype,Mean (ms),P10 (ms),P50 (ms),P90 (ms),Reps,Peak Mem (MB),Backend,Family
|
| 2 |
+
Flash (PyTorch SDPA),torch_flash_ma,flux_L128,1,1152,24,128,bfloat16,0.49411200881004336,0.48844799399375916,0.4936000108718872,0.4944640100002289,5,83.38,FLASH,torch-sdpa
|
| 3 |
+
Flash (PyTorch SDPA),torch_flash_ma,flux_L256,1,1280,24,128,bfloat16,0.5234112024307251,0.5224320292472839,0.5235199928283691,0.5235840082168579,5,90.62,FLASH,torch-sdpa
|
| 4 |
+
Flash (PyTorch SDPA),torch_flash_ma,flux_L320,1,1344,24,128,bfloat16,0.6527232170104981,0.6503040194511414,0.6524800062179565,0.6545600295066833,5,95.06,FLASH,torch-sdpa
|
| 5 |
+
Flash (PyTorch SDPA),torch_flash_ma,flux_L384,1,1408,24,128,bfloat16,0.682803213596344,0.6805760264396667,0.6828799843788147,0.6832640171051025,5,99.88,FLASH,torch-sdpa
|
| 6 |
+
Flash (PyTorch SDPA),torch_flash_ma,flux_L448,1,1472,24,128,bfloat16,0.7075456142425537,0.7057600021362305,0.7063360214233398,0.7070720195770264,5,103.81,FLASH,torch-sdpa
|
| 7 |
+
Flash (PyTorch SDPA),torch_flash_ma,flux_L512,1,1536,24,128,bfloat16,0.7379711985588073,0.7368639707565308,0.7372480034828186,0.7391039729118347,5,109.12,FLASH,torch-sdpa
|
| 8 |
+
MemEff (PyTorch SDPA),torch_mem_eff,flux_L128,1,1152,24,128,bfloat16,0.5874239921569824,0.5861759781837463,0.5873280167579651,0.5877439975738525,5,83.38,EFFICIENT,torch-sdpa
|
| 9 |
+
MemEff (PyTorch SDPA),torch_mem_eff,flux_L256,1,1280,24,128,bfloat16,0.6502719998359681,0.6490240097045898,0.649183988571167,0.6517760157585144,5,90.62,EFFICIENT,torch-sdpa
|
| 10 |
+
MemEff (PyTorch SDPA),torch_mem_eff,flux_L320,1,1344,24,128,bfloat16,0.7812095880508423,0.7761600017547607,0.7803199887275696,0.7852799892425537,5,95.94,EFFICIENT,torch-sdpa
|
| 11 |
+
MemEff (PyTorch SDPA),torch_mem_eff,flux_L384,1,1408,24,128,bfloat16,0.7948480010032654,0.7911999821662903,0.7935360074043274,0.7948480248451233,5,100.0,EFFICIENT,torch-sdpa
|
| 12 |
+
MemEff (PyTorch SDPA),torch_mem_eff,flux_L448,1,1472,24,128,bfloat16,0.8463295936584473,0.8449919819831848,0.8459839820861816,0.8461120128631592,5,103.81,EFFICIENT,torch-sdpa
|
| 13 |
+
MemEff (PyTorch SDPA),torch_mem_eff,flux_L512,1,1536,24,128,bfloat16,0.9538687944412232,0.9492800235748291,0.9518399834632874,0.9581760168075562,5,109.12,EFFICIENT,torch-sdpa
|
| 14 |
+
xFormers,xformers_meff,flux_L128,1,1152,24,128,bfloat16,0.4515071928501129,0.44364801049232483,0.4524799883365631,0.4557119905948639,5,83.38,memory_efficient,xformers
|
| 15 |
+
xFormers,xformers_meff,flux_L256,1,1280,24,128,bfloat16,0.46787199974060056,0.46489599347114563,0.4684160053730011,0.46908798813819885,5,90.62,memory_efficient,xformers
|
| 16 |
+
xFormers,xformers_meff,flux_L320,1,1344,24,128,bfloat16,0.6001471996307373,0.596992015838623,0.5984640121459961,0.6016640067100525,5,95.06,memory_efficient,xformers
|
| 17 |
+
xFormers,xformers_meff,flux_L384,1,1408,24,128,bfloat16,0.6023231983184815,0.5997440218925476,0.6031039953231812,0.6032639741897583,5,99.88,memory_efficient,xformers
|
| 18 |
+
xFormers,xformers_meff,flux_L448,1,1472,24,128,bfloat16,0.6411136031150818,0.6381760239601135,0.6414719820022583,0.6421440243721008,5,103.81,memory_efficient,xformers
|
| 19 |
+
xFormers,xformers_meff,flux_L512,1,1536,24,128,bfloat16,0.6594688057899475,0.6441280245780945,0.6496639847755432,0.6527680158615112,5,109.12,memory_efficient,xformers
|
| 20 |
+
Compiled (default),torch_flash_compiled_default,flux_L128,1,1152,24,128,bfloat16,0.5181439876556396,0.5141760110855103,0.5175679922103882,0.5197759866714478,5,83.38,FLASH,torch-sdpa
|
| 21 |
+
Compiled (default),torch_flash_compiled_default,flux_L256,1,1280,24,128,bfloat16,0.5579584002494812,0.5549119710922241,0.5582720041275024,0.5598080158233643,5,90.62,FLASH,torch-sdpa
|
| 22 |
+
Compiled (default),torch_flash_compiled_default,flux_L320,1,1344,24,128,bfloat16,0.6872959971427918,0.6853119730949402,0.687391996383667,0.6883519887924194,5,95.25,FLASH,torch-sdpa
|
| 23 |
+
Compiled (default),torch_flash_compiled_default,flux_L384,1,1408,24,128,bfloat16,0.716153597831726,0.7128639817237854,0.7160959839820862,0.7167680263519287,5,99.88,FLASH,torch-sdpa
|
| 24 |
+
Compiled (default),torch_flash_compiled_default,flux_L448,1,1472,24,128,bfloat16,0.7418303966522217,0.7386879920959473,0.7400959730148315,0.7415040135383606,5,103.81,FLASH,torch-sdpa
|
| 25 |
+
Compiled (default),torch_flash_compiled_default,flux_L512,1,1536,24,128,bfloat16,0.7745471954345703,0.7708160281181335,0.7740799784660339,0.7753919959068298,5,109.12,FLASH,torch-sdpa
|
| 26 |
+
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L128,1,1152,24,128,bfloat16,0.6468096017837525,0.6144000291824341,0.6245759725570679,0.6483200192451477,5,67.5,FLASH,torch-sdpa
|
| 27 |
+
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L256,1,1280,24,128,bfloat16,0.7060160160064697,0.6689280271530151,0.6851199865341187,0.7184960246086121,5,75.0,FLASH,torch-sdpa
|
| 28 |
+
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L320,1,1344,24,128,bfloat16,0.8332608103752136,0.7953600287437439,0.8155840039253235,0.8403519988059998,5,80.38,FLASH,torch-sdpa
|
| 29 |
+
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L384,1,1408,24,128,bfloat16,0.8719295978546142,0.8470720052719116,0.849727988243103,0.8745279908180237,5,82.5,FLASH,torch-sdpa
|
| 30 |
+
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L448,1,1472,24,128,bfloat16,0.9034304022789001,0.8677120208740234,0.8835520148277283,0.9034240245819092,5,86.25,FLASH,torch-sdpa
|
| 31 |
+
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L512,1,1536,24,128,bfloat16,0.9387519836425782,0.9154239892959595,0.9213759899139404,0.9359679818153381,5,90.0,FLASH,torch-sdpa
|
| 32 |
+
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L128,1,1152,24,128,bfloat16,0.3455295979976654,0.34355199337005615,0.34563198685646057,0.34643200039863586,5,83.38,flash-attn,hf-kernels
|
| 33 |
+
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L256,1,1280,24,128,bfloat16,0.3756160080432892,0.37411201000213623,0.3752000033855438,0.3770880103111267,5,90.62,flash-attn,hf-kernels
|
| 34 |
+
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L320,1,1344,24,128,bfloat16,0.4953216016292572,0.49324798583984375,0.49433600902557373,0.49663999676704407,5,95.06,flash-attn,hf-kernels
|
| 35 |
+
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L384,1,1408,24,128,bfloat16,0.5157055854797363,0.5142719745635986,0.516319990158081,0.516543984413147,5,99.88,flash-attn,hf-kernels
|
| 36 |
+
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L448,1,1472,24,128,bfloat16,0.5356672048568726,0.5346879959106445,0.5358080267906189,0.5361599922180176,5,103.81,flash-attn,hf-kernels
|
| 37 |
+
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L512,1,1536,24,128,bfloat16,0.5587136030197144,0.5557760000228882,0.5574079751968384,0.5581120252609253,5,109.12,flash-attn,hf-kernels
|
| 38 |
+
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L128,1,1152,24,128,bfloat16,0.3619711995124817,0.3603839874267578,0.361952006816864,0.3624640107154846,5,83.38,flash-attn3,hf-kernels
|
| 39 |
+
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L256,1,1280,24,128,bfloat16,0.3912447988986969,0.3892799913883209,0.3909760117530823,0.3922559916973114,5,90.62,flash-attn3,hf-kernels
|
| 40 |
+
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L320,1,1344,24,128,bfloat16,0.5258048176765442,0.5240640044212341,0.5248960256576538,0.5248960256576538,5,95.06,flash-attn3,hf-kernels
|
| 41 |
+
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L384,1,1408,24,128,bfloat16,0.5276032090187073,0.5265600085258484,0.5277760028839111,0.5282559990882874,5,99.88,flash-attn3,hf-kernels
|
| 42 |
+
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L448,1,1472,24,128,bfloat16,0.5656383991241455,0.5639039874076843,0.5657920241355896,0.5668479800224304,5,103.81,flash-attn3,hf-kernels
|
| 43 |
+
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L512,1,1536,24,128,bfloat16,0.5789952039718628,0.5689600110054016,0.5698239803314209,0.5713919997215271,5,109.12,flash-attn3,hf-kernels
|
flash_attn/results/artifacts/combine/latency.svg
CHANGED
|
|
Git LFS Details
|
|
|
Git LFS Details
|
flash_attn/results/combined_results.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
flash_attn/results/index.html
CHANGED
|
@@ -2,22 +2,86 @@
|
|
| 2 |
<html>
|
| 3 |
<head>
|
| 4 |
<meta charset='UTF-8'>
|
| 5 |
-
<
|
|
|
|
| 6 |
<style>
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
</style>
|
| 16 |
</head>
|
| 17 |
<body>
|
|
|
|
|
|
|
|
|
|
| 18 |
<h1>Index of /flash_attn/results</h1>
|
| 19 |
<ul>
|
| 20 |
-
<li><a href='../index.html' class='dir'>../</a></li>
|
| 21 |
<li><a href='combined_results.html' class='file'>combined_results.html</a></li>
|
| 22 |
</ul>
|
| 23 |
</body>
|
|
|
|
| 2 |
<html>
|
| 3 |
<head>
|
| 4 |
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /flash_attn/results</title>
|
| 7 |
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
</style>
|
| 78 |
</head>
|
| 79 |
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
<h1>Index of /flash_attn/results</h1>
|
| 84 |
<ul>
|
|
|
|
| 85 |
<li><a href='combined_results.html' class='file'>combined_results.html</a></li>
|
| 86 |
</ul>
|
| 87 |
</body>
|
index.html
CHANGED
|
@@ -2,16 +2,78 @@
|
|
| 2 |
<html>
|
| 3 |
<head>
|
| 4 |
<meta charset='UTF-8'>
|
| 5 |
-
<
|
|
|
|
| 6 |
<style>
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
</style>
|
| 16 |
</head>
|
| 17 |
<body>
|
|
|
|
| 2 |
<html>
|
| 3 |
<head>
|
| 4 |
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /</title>
|
| 7 |
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
</style>
|
| 78 |
</head>
|
| 79 |
<body>
|