Upload folder using huggingface_hub
Browse files- flash_attn/impls/artifacts/benchmark/attn.jsonl +6 -6
- flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl +6 -6
- flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl +6 -6
- flash_attn/impls/compiled_variants.html +80 -45
- flash_attn/impls/flash_attention.html +62 -27
- flash_attn/impls/hf_kernels_flash_attn.html +61 -26
- flash_attn/impls/hf_kernels_flash_attn3.html +57 -22
- flash_attn/impls/mem_efficient_attention.html +53 -18
- flash_attn/impls/sage_attention.html +63 -27
- flash_attn/impls/xformers.html +53 -18
- flash_attn/results/artifacts/combine/latency.csv +43 -0
- flash_attn/results/artifacts/combine/latency.svg +3 -0
- flash_attn/results/cells/combine.py +244 -21
- flash_attn/results/combined_results.html +0 -0
flash_attn/impls/artifacts/benchmark/attn.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
| 5 |
-
{"ts": "2025-10-
|
| 6 |
-
{"ts": "2025-10-
|
|
|
|
| 1 |
+
{"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4474239945411682, "p50": 0.44921600818634033, "p90": 0.45241600275039673, "mean": 0.45066879987716674, "reps": 5, "warmup": 2}, "compile_ms": 1.7530560493469238, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4652479887008667, "p50": 0.4705919921398163, "p90": 0.4716799855232239, "mean": 0.47004159688949587, "reps": 5, "warmup": 2}, "compile_ms": 0.36032000184059143, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5987840294837952, "p50": 0.6021760106086731, "p90": 0.6045759916305542, "mean": 0.6022783994674683, "reps": 5, "warmup": 2}, "compile_ms": 0.4950079917907715, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6000319719314575, "p50": 0.600383996963501, "p90": 0.6016640067100525, "mean": 0.6013055920600892, "reps": 5, "warmup": 2}, "compile_ms": 0.49647998809814453, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.639136016368866, "p50": 0.6404479742050171, "p90": 0.6416320204734802, "mean": 0.6408192038536071, "reps": 5, "warmup": 2}, "compile_ms": 0.530239999294281, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6447359919548035, "p50": 0.6462399959564209, "p90": 0.6483839750289917, "mean": 0.6466111898422241, "reps": 5, "warmup": 2}, "compile_ms": 0.5342720150947571, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
| 5 |
-
{"ts": "2025-10-
|
| 6 |
-
{"ts": "2025-10-
|
|
|
|
| 1 |
+
{"ts": "2025-10-02T18:08:46Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5194560289382935, "p50": 0.5272960066795349, "p90": 0.5312960147857666, "mean": 0.527347207069397, "reps": 5, "warmup": 2}, "compile_ms": 3354.235107421875, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-02T18:08:47Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5560640096664429, "p50": 0.5571519732475281, "p90": 0.5611839890480042, "mean": 0.5586367964744567, "reps": 5, "warmup": 2}, "compile_ms": 471.23529052734375, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-02T18:08:47Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6841920018196106, "p50": 0.6860160231590271, "p90": 0.6869760155677795, "mean": 0.6860736012458801, "reps": 5, "warmup": 2}, "compile_ms": 468.1533508300781, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-02T18:08:48Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7152000069618225, "p50": 0.7161920070648193, "p90": 0.7164160013198853, "mean": 0.7167360067367554, "reps": 5, "warmup": 2}, "compile_ms": 465.7891540527344, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-02T18:08:48Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7400959730148315, "p50": 0.742143988609314, "p90": 0.7431039810180664, "mean": 0.7423295855522156, "reps": 5, "warmup": 2}, "compile_ms": 468.6272888183594, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-02T18:08:49Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7718080282211304, "p50": 0.7745919823646545, "p90": 0.7748159766197205, "mean": 0.7743871927261352, "reps": 5, "warmup": 2}, "compile_ms": 475.9334716796875, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
| 5 |
-
{"ts": "2025-10-
|
| 6 |
-
{"ts": "2025-10-
|
|
|
|
| 1 |
+
{"ts": "2025-10-02T18:09:34Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6148160099983215, "p50": 0.6296960115432739, "p90": 0.6522240042686462, "mean": 0.6489088058471679, "reps": 5, "warmup": 2}, "compile_ms": 4649.109375, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-02T18:09:35Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6615359783172607, "p50": 0.6821119785308838, "p90": 0.7128959894180298, "mean": 0.700761592388153, "reps": 5, "warmup": 2}, "compile_ms": 1487.6849365234375, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-02T18:09:37Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7967039942741394, "p50": 0.8164799809455872, "p90": 0.8463680148124695, "mean": 0.834444797039032, "reps": 5, "warmup": 2}, "compile_ms": 1492.66748046875, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-02T18:09:39Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8432319760322571, "p50": 0.8498560190200806, "p90": 0.8750079870223999, "mean": 0.8709375977516174, "reps": 5, "warmup": 2}, "compile_ms": 1477.6558837890625, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-02T18:09:41Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8775359988212585, "p50": 0.9030719995498657, "p90": 0.903872013092041, "mean": 0.9069631934165955, "reps": 5, "warmup": 2}, "compile_ms": 1919.1016845703125, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-02T18:09:43Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9145920276641846, "p50": 0.9164159893989563, "p90": 0.9357439875602722, "mean": 0.9371584057807922, "reps": 5, "warmup": 2}, "compile_ms": 1487.1219482421875, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
flash_attn/impls/compiled_variants.html
CHANGED
|
@@ -719,6 +719,41 @@
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
.cell-failed {
|
| 723 |
border-color: var(--border-cell-failed);
|
| 724 |
}
|
|
@@ -3711,7 +3746,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3711 |
<span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
|
| 3712 |
<span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3713 |
</span> |
|
| 3714 |
-
Cell: benchmark_default |
|
| 3715 |
| <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
|
| 3716 |
<button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
|
| 3717 |
<a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3795,7 +3830,7 @@ Cell: benchmark_default | 44.25s
|
|
| 3795 |
</div>
|
| 3796 |
<div id="output-benchmark_default" class="cell-output">
|
| 3797 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3798 |
-
torch_flash_compiled_default flux_L128 0.
|
| 3799 |
torch_flash_compiled_default flux_L256 0.56 True
|
| 3800 |
torch_flash_compiled_default flux_L320 0.69 True
|
| 3801 |
torch_flash_compiled_default flux_L384 0.72 True
|
|
@@ -3806,28 +3841,28 @@ torch_flash_compiled_default flux_L512 0.77 True
|
|
| 3806 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3807 |
<div class="uv-logs-content" style="display: none;">
|
| 3808 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3809 |
-
Downloading
|
| 3810 |
-
Downloading torch (846.9MiB)
|
| 3811 |
-
Downloading kiwisolver (1.4MiB)
|
| 3812 |
-
Downloading fonttools (4.7MiB)
|
| 3813 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3814 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3815 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3816 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3817 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3818 |
-
Downloading setuptools (1.1MiB)
|
| 3819 |
Downloading matplotlib (8.3MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3820 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3821 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3822 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3823 |
Downloading numpy (16.2MiB)
|
| 3824 |
-
Downloading sympy (6.0MiB)
|
| 3825 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3826 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3827 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3828 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3829 |
-
Downloading networkx (1.9MiB)
|
| 3830 |
-
Downloading pillow (6.3MiB)
|
| 3831 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3832 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3833 |
Downloading nvidia-cufile-cu12
|
|
@@ -3839,21 +3874,21 @@ Downloading pillow (6.3MiB)
|
|
| 3839 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3840 |
Downloading nvidia-cuda-cupti-cu12
|
| 3841 |
Downloading matplotlib
|
| 3842 |
-
Downloading numpy
|
| 3843 |
Downloading sympy
|
|
|
|
| 3844 |
Downloading nvidia-nvjitlink-cu12
|
| 3845 |
Downloading nvidia-curand-cu12
|
| 3846 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 3847 |
Downloading triton
|
| 3848 |
Downloading nvidia-cufft-cu12
|
| 3849 |
Downloading nvidia-cusolver-cu12
|
| 3850 |
-
Downloading nvidia-cusparselt-cu12
|
| 3851 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 3852 |
Downloading nvidia-nccl-cu12
|
| 3853 |
Downloading nvidia-cublas-cu12
|
| 3854 |
Downloading nvidia-cudnn-cu12
|
| 3855 |
Downloading torch
|
| 3856 |
-
Installed 37 packages in
|
| 3857 |
</div>
|
| 3858 |
</div>
|
| 3859 |
<div class="cell-artifacts">
|
|
@@ -3871,7 +3906,7 @@ Installed 37 packages in 516ms
|
|
| 3871 |
<span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: benchmark_max_autotune |
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
|
| 3877 |
<a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3955,65 +3990,65 @@ Cell: benchmark_max_autotune | 56.94s
|
|
| 3955 |
</div>
|
| 3956 |
<div id="output-benchmark_max_autotune" class="cell-output">
|
| 3957 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3958 |
-
torch_flash_compiled_max_autotune flux_L128 0.
|
| 3959 |
torch_flash_compiled_max_autotune flux_L256 0.68 True
|
| 3960 |
torch_flash_compiled_max_autotune flux_L320 0.82 True
|
| 3961 |
torch_flash_compiled_max_autotune flux_L384 0.85 True
|
| 3962 |
-
torch_flash_compiled_max_autotune flux_L448 0.
|
| 3963 |
torch_flash_compiled_max_autotune flux_L512 0.92 True
|
| 3964 |
</div>
|
| 3965 |
<div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
|
| 3966 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3967 |
<div class="uv-logs-content" style="display: none;">
|
| 3968 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3969 |
-
Downloading
|
| 3970 |
-
Downloading setuptools (1.1MiB)
|
| 3971 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3972 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
| 3973 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
|
|
|
|
|
|
| 3974 |
Downloading fonttools (4.7MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3975 |
Downloading numpy (16.2MiB)
|
| 3976 |
Downloading pillow (6.3MiB)
|
| 3977 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3978 |
-
Downloading
|
| 3979 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3980 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3981 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3982 |
-
Downloading networkx (1.9MiB)
|
| 3983 |
-
Downloading torch (846.9MiB)
|
| 3984 |
-
Downloading triton (148.3MiB)
|
| 3985 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3986 |
-
Downloading kiwisolver (1.4MiB)
|
| 3987 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3988 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3989 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3990 |
-
Downloading sympy (6.0MiB)
|
| 3991 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3992 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3993 |
Downloading nvidia-cufile-cu12
|
| 3994 |
Downloading kiwisolver
|
| 3995 |
Downloading setuptools
|
| 3996 |
-
Downloading fonttools
|
| 3997 |
Downloading networkx
|
|
|
|
| 3998 |
Downloading pillow
|
| 3999 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 4000 |
-
Downloading matplotlib
|
| 4001 |
Downloading nvidia-cuda-cupti-cu12
|
| 4002 |
-
Downloading
|
| 4003 |
Downloading numpy
|
|
|
|
| 4004 |
Downloading nvidia-nvjitlink-cu12
|
| 4005 |
Downloading nvidia-curand-cu12
|
| 4006 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 4007 |
Downloading triton
|
| 4008 |
Downloading nvidia-cufft-cu12
|
| 4009 |
Downloading nvidia-cusolver-cu12
|
| 4010 |
-
Downloading nvidia-cusparse-cu12
|
| 4011 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 4012 |
Downloading nvidia-nccl-cu12
|
| 4013 |
Downloading nvidia-cublas-cu12
|
| 4014 |
Downloading nvidia-cudnn-cu12
|
| 4015 |
Downloading torch
|
| 4016 |
-
Installed 37 packages in
|
| 4017 |
</div>
|
| 4018 |
</div>
|
| 4019 |
<div class="cell-artifacts">
|
|
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
| 722 |
+
/* CSV table styling */
|
| 723 |
+
.artifact-csv {
|
| 724 |
+
margin-top: 1rem;
|
| 725 |
+
overflow-x: auto;
|
| 726 |
+
}
|
| 727 |
+
.csv-table {
|
| 728 |
+
width: 100%;
|
| 729 |
+
border-collapse: collapse;
|
| 730 |
+
font-size: 0.9rem;
|
| 731 |
+
background: var(--bg-secondary);
|
| 732 |
+
border: 1px solid var(--border-primary);
|
| 733 |
+
border-radius: 1px;
|
| 734 |
+
}
|
| 735 |
+
.csv-table th,
|
| 736 |
+
.csv-table td {
|
| 737 |
+
padding: 0.5rem 0.75rem;
|
| 738 |
+
text-align: left;
|
| 739 |
+
border: 1px solid var(--border-primary);
|
| 740 |
+
}
|
| 741 |
+
.csv-table th {
|
| 742 |
+
background: var(--bg-tertiary);
|
| 743 |
+
font-weight: 600;
|
| 744 |
+
color: var(--text-primary);
|
| 745 |
+
}
|
| 746 |
+
.csv-table tbody tr:hover {
|
| 747 |
+
background: var(--bg-artifact-hover);
|
| 748 |
+
}
|
| 749 |
+
.artifact-csv-error {
|
| 750 |
+
margin-top: 1rem;
|
| 751 |
+
padding: 1rem;
|
| 752 |
+
background: var(--bg-error);
|
| 753 |
+
color: var(--text-error);
|
| 754 |
+
border: 1px solid var(--border-error);
|
| 755 |
+
border-radius: 1px;
|
| 756 |
+
}
|
| 757 |
.cell-failed {
|
| 758 |
border-color: var(--border-cell-failed);
|
| 759 |
}
|
|
|
|
| 3746 |
<span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
|
| 3747 |
<span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3748 |
</span> |
|
| 3749 |
+
Cell: benchmark_default | 46.78s
|
| 3750 |
| <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
|
| 3751 |
<button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
|
| 3752 |
<a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3830 |
</div>
|
| 3831 |
<div id="output-benchmark_default" class="cell-output">
|
| 3832 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3833 |
+
torch_flash_compiled_default flux_L128 0.53 True
|
| 3834 |
torch_flash_compiled_default flux_L256 0.56 True
|
| 3835 |
torch_flash_compiled_default flux_L320 0.69 True
|
| 3836 |
torch_flash_compiled_default flux_L384 0.72 True
|
|
|
|
| 3841 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3842 |
<div class="uv-logs-content" style="display: none;">
|
| 3843 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3844 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3845 |
Downloading matplotlib (8.3MiB)
|
| 3846 |
+
Downloading networkx (1.9MiB)
|
| 3847 |
+
Downloading setuptools (1.1MiB)
|
| 3848 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3849 |
+
Downloading pillow (6.3MiB)
|
| 3850 |
+
Downloading sympy (6.0MiB)
|
| 3851 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3852 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3853 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3854 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3855 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3856 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3857 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3858 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3859 |
+
Downloading torch (846.9MiB)
|
| 3860 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3861 |
+
Downloading fonttools (4.7MiB)
|
| 3862 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3863 |
+
Downloading kiwisolver (1.4MiB)
|
| 3864 |
+
Downloading triton (148.3MiB)
|
| 3865 |
Downloading numpy (16.2MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3866 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3867 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3868 |
Downloading nvidia-cufile-cu12
|
|
|
|
| 3874 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3875 |
Downloading nvidia-cuda-cupti-cu12
|
| 3876 |
Downloading matplotlib
|
|
|
|
| 3877 |
Downloading sympy
|
| 3878 |
+
Downloading numpy
|
| 3879 |
Downloading nvidia-nvjitlink-cu12
|
| 3880 |
Downloading nvidia-curand-cu12
|
| 3881 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 3882 |
Downloading triton
|
| 3883 |
Downloading nvidia-cufft-cu12
|
| 3884 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 3885 |
Downloading nvidia-cusparse-cu12
|
| 3886 |
+
Downloading nvidia-cusparselt-cu12
|
| 3887 |
Downloading nvidia-nccl-cu12
|
| 3888 |
Downloading nvidia-cublas-cu12
|
| 3889 |
Downloading nvidia-cudnn-cu12
|
| 3890 |
Downloading torch
|
| 3891 |
+
Installed 37 packages in 557ms
|
| 3892 |
</div>
|
| 3893 |
</div>
|
| 3894 |
<div class="cell-artifacts">
|
|
|
|
| 3906 |
<span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
|
| 3907 |
<span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3908 |
</span> |
|
| 3909 |
+
Cell: benchmark_max_autotune | 53.65s
|
| 3910 |
| <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
|
| 3911 |
<button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
|
| 3912 |
<a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3990 |
</div>
|
| 3991 |
<div id="output-benchmark_max_autotune" class="cell-output">
|
| 3992 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3993 |
+
torch_flash_compiled_max_autotune flux_L128 0.63 True
|
| 3994 |
torch_flash_compiled_max_autotune flux_L256 0.68 True
|
| 3995 |
torch_flash_compiled_max_autotune flux_L320 0.82 True
|
| 3996 |
torch_flash_compiled_max_autotune flux_L384 0.85 True
|
| 3997 |
+
torch_flash_compiled_max_autotune flux_L448 0.90 True
|
| 3998 |
torch_flash_compiled_max_autotune flux_L512 0.92 True
|
| 3999 |
</div>
|
| 4000 |
<div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
|
| 4001 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4002 |
<div class="uv-logs-content" style="display: none;">
|
| 4003 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 4004 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
| 4005 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4006 |
+
Downloading sympy (6.0MiB)
|
| 4007 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4008 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4009 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4010 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4011 |
+
Downloading matplotlib (8.3MiB)
|
| 4012 |
+
Downloading triton (148.3MiB)
|
| 4013 |
+
Downloading networkx (1.9MiB)
|
| 4014 |
Downloading fonttools (4.7MiB)
|
| 4015 |
+
Downloading torch (846.9MiB)
|
| 4016 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4017 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4018 |
+
Downloading kiwisolver (1.4MiB)
|
| 4019 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4020 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4021 |
Downloading numpy (16.2MiB)
|
| 4022 |
Downloading pillow (6.3MiB)
|
| 4023 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4024 |
+
Downloading setuptools (1.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4025 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
| 4026 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 4027 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 4028 |
Downloading nvidia-cufile-cu12
|
| 4029 |
Downloading kiwisolver
|
| 4030 |
Downloading setuptools
|
|
|
|
| 4031 |
Downloading networkx
|
| 4032 |
+
Downloading fonttools
|
| 4033 |
Downloading pillow
|
| 4034 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
|
|
|
| 4035 |
Downloading nvidia-cuda-cupti-cu12
|
| 4036 |
+
Downloading matplotlib
|
| 4037 |
Downloading numpy
|
| 4038 |
+
Downloading sympy
|
| 4039 |
Downloading nvidia-nvjitlink-cu12
|
| 4040 |
Downloading nvidia-curand-cu12
|
| 4041 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 4042 |
Downloading triton
|
| 4043 |
Downloading nvidia-cufft-cu12
|
| 4044 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4045 |
Downloading nvidia-cusparselt-cu12
|
| 4046 |
+
Downloading nvidia-cusparse-cu12
|
| 4047 |
Downloading nvidia-nccl-cu12
|
| 4048 |
Downloading nvidia-cublas-cu12
|
| 4049 |
Downloading nvidia-cudnn-cu12
|
| 4050 |
Downloading torch
|
| 4051 |
+
Installed 37 packages in 525ms
|
| 4052 |
</div>
|
| 4053 |
</div>
|
| 4054 |
<div class="cell-artifacts">
|
flash_attn/impls/flash_attention.html
CHANGED
|
@@ -719,6 +719,41 @@
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
.cell-failed {
|
| 723 |
border-color: var(--border-cell-failed);
|
| 724 |
}
|
|
@@ -3710,7 +3745,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3710 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
-
Cell: nv | 0.
|
| 3714 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3716 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3726,7 +3761,7 @@ Cell: nv | 0.66s
|
|
| 3726 |
</div>
|
| 3727 |
</div>
|
| 3728 |
<div id="output-nv" class="cell-output">
|
| 3729 |
-
<div class="cell-stdout">Thu Oct 2
|
| 3730 |
+-----------------------------------------------------------------------------------------+
|
| 3731 |
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3732 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3735,19 +3770,19 @@ Cell: nv | 0.66s
|
|
| 3735 |
| | | MIG M. |
|
| 3736 |
|=========================================+========================+======================|
|
| 3737 |
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3738 |
-
| 0%
|
| 3739 |
| | | N/A |
|
| 3740 |
+-----------------------------------------+------------------------+----------------------+
|
| 3741 |
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3742 |
-
| 0%
|
| 3743 |
| | | N/A |
|
| 3744 |
+-----------------------------------------+------------------------+----------------------+
|
| 3745 |
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3746 |
-
| 0%
|
| 3747 |
| | | N/A |
|
| 3748 |
+-----------------------------------------+------------------------+----------------------+
|
| 3749 |
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3750 |
-
| 0%
|
| 3751 |
| | | N/A |
|
| 3752 |
+-----------------------------------------+------------------------+----------------------+
|
| 3753 |
|
|
@@ -3771,7 +3806,7 @@ Cell: nv | 0.66s
|
|
| 3771 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3772 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3773 |
</span> |
|
| 3774 |
-
Cell: benchmark |
|
| 3775 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3776 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3777 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3852,7 +3887,7 @@ Cell: benchmark | 37.94s
|
|
| 3852 |
<div id="output-benchmark" class="cell-output">
|
| 3853 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3854 |
torch_flash_ma flux_L128 0.48 True
|
| 3855 |
-
torch_flash_ma flux_L256 0.
|
| 3856 |
torch_flash_ma flux_L320 0.65 True
|
| 3857 |
torch_flash_ma flux_L384 0.68 True
|
| 3858 |
torch_flash_ma flux_L448 0.71 True
|
|
@@ -3862,35 +3897,35 @@ torch_flash_ma flux_L512 0.74 True
|
|
| 3862 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3863 |
<div class="uv-logs-content" style="display: none;">
|
| 3864 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3865 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3866 |
-
Downloading sympy (6.0MiB)
|
| 3867 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3868 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3869 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3870 |
Downloading networkx (1.9MiB)
|
| 3871 |
-
Downloading
|
| 3872 |
-
Downloading
|
| 3873 |
-
Downloading
|
| 3874 |
-
Downloading
|
| 3875 |
-
Downloading pillow (6.3MiB)
|
| 3876 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3877 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3878 |
-
Downloading nvidia-
|
| 3879 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3880 |
-
Downloading numpy (16.2MiB)
|
| 3881 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3882 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3883 |
-
Downloading nvidia-
|
| 3884 |
-
Downloading
|
| 3885 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3886 |
Downloading triton (148.3MiB)
|
|
|
|
|
|
|
|
|
|
| 3887 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3888 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3889 |
Downloading nvidia-cufile-cu12
|
| 3890 |
Downloading kiwisolver
|
| 3891 |
Downloading setuptools
|
| 3892 |
-
Downloading fonttools
|
| 3893 |
Downloading networkx
|
|
|
|
| 3894 |
Downloading pillow
|
| 3895 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3896 |
Downloading nvidia-cuda-cupti-cu12
|
|
@@ -3906,10 +3941,10 @@ Downloading triton (148.3MiB)
|
|
| 3906 |
Downloading nvidia-cusparselt-cu12
|
| 3907 |
Downloading nvidia-cusparse-cu12
|
| 3908 |
Downloading nvidia-nccl-cu12
|
| 3909 |
-
Downloading nvidia-cublas-cu12
|
| 3910 |
Downloading nvidia-cudnn-cu12
|
|
|
|
| 3911 |
Downloading torch
|
| 3912 |
-
Installed 37 packages in
|
| 3913 |
</div>
|
| 3914 |
</div>
|
| 3915 |
<div class="cell-artifacts">
|
|
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
| 722 |
+
/* CSV table styling */
|
| 723 |
+
.artifact-csv {
|
| 724 |
+
margin-top: 1rem;
|
| 725 |
+
overflow-x: auto;
|
| 726 |
+
}
|
| 727 |
+
.csv-table {
|
| 728 |
+
width: 100%;
|
| 729 |
+
border-collapse: collapse;
|
| 730 |
+
font-size: 0.9rem;
|
| 731 |
+
background: var(--bg-secondary);
|
| 732 |
+
border: 1px solid var(--border-primary);
|
| 733 |
+
border-radius: 1px;
|
| 734 |
+
}
|
| 735 |
+
.csv-table th,
|
| 736 |
+
.csv-table td {
|
| 737 |
+
padding: 0.5rem 0.75rem;
|
| 738 |
+
text-align: left;
|
| 739 |
+
border: 1px solid var(--border-primary);
|
| 740 |
+
}
|
| 741 |
+
.csv-table th {
|
| 742 |
+
background: var(--bg-tertiary);
|
| 743 |
+
font-weight: 600;
|
| 744 |
+
color: var(--text-primary);
|
| 745 |
+
}
|
| 746 |
+
.csv-table tbody tr:hover {
|
| 747 |
+
background: var(--bg-artifact-hover);
|
| 748 |
+
}
|
| 749 |
+
.artifact-csv-error {
|
| 750 |
+
margin-top: 1rem;
|
| 751 |
+
padding: 1rem;
|
| 752 |
+
background: var(--bg-error);
|
| 753 |
+
color: var(--text-error);
|
| 754 |
+
border: 1px solid var(--border-error);
|
| 755 |
+
border-radius: 1px;
|
| 756 |
+
}
|
| 757 |
.cell-failed {
|
| 758 |
border-color: var(--border-cell-failed);
|
| 759 |
}
|
|
|
|
| 3745 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3746 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3747 |
</span> |
|
| 3748 |
+
Cell: nv | 0.70s
|
| 3749 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3750 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3751 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3761 |
</div>
|
| 3762 |
</div>
|
| 3763 |
<div id="output-nv" class="cell-output">
|
| 3764 |
+
<div class="cell-stdout">Thu Oct 2 18:06:49 2025
|
| 3765 |
+-----------------------------------------------------------------------------------------+
|
| 3766 |
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3767 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3770 |
| | | MIG M. |
|
| 3771 |
|=========================================+========================+======================|
|
| 3772 |
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3773 |
+
| 0% 26C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3774 |
| | | N/A |
|
| 3775 |
+-----------------------------------------+------------------------+----------------------+
|
| 3776 |
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3777 |
+
| 0% 26C P8 26W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3778 |
| | | N/A |
|
| 3779 |
+-----------------------------------------+------------------------+----------------------+
|
| 3780 |
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3781 |
+
| 0% 26C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3782 |
| | | N/A |
|
| 3783 |
+-----------------------------------------+------------------------+----------------------+
|
| 3784 |
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3785 |
+
| 0% 27C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3786 |
| | | N/A |
|
| 3787 |
+-----------------------------------------+------------------------+----------------------+
|
| 3788 |
|
|
|
|
| 3806 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3807 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3808 |
</span> |
|
| 3809 |
+
Cell: benchmark | 36.63s
|
| 3810 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3811 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3812 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3887 |
<div id="output-benchmark" class="cell-output">
|
| 3888 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3889 |
torch_flash_ma flux_L128 0.48 True
|
| 3890 |
+
torch_flash_ma flux_L256 0.52 True
|
| 3891 |
torch_flash_ma flux_L320 0.65 True
|
| 3892 |
torch_flash_ma flux_L384 0.68 True
|
| 3893 |
torch_flash_ma flux_L448 0.71 True
|
|
|
|
| 3897 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3898 |
<div class="uv-logs-content" style="display: none;">
|
| 3899 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
|
|
|
|
|
|
|
|
|
| 3900 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
| 3901 |
Downloading networkx (1.9MiB)
|
| 3902 |
+
Downloading kiwisolver (1.4MiB)
|
| 3903 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3904 |
+
Downloading sympy (6.0MiB)
|
| 3905 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
| 3906 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3907 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3908 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
|
|
|
| 3909 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3910 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3911 |
+
Downloading pillow (6.3MiB)
|
| 3912 |
+
Downloading numpy (16.2MiB)
|
| 3913 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3914 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3915 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3916 |
+
Downloading setuptools (1.1MiB)
|
| 3917 |
+
Downloading matplotlib (8.3MiB)
|
| 3918 |
Downloading triton (148.3MiB)
|
| 3919 |
+
Downloading fonttools (4.7MiB)
|
| 3920 |
+
Downloading torch (846.9MiB)
|
| 3921 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3922 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3923 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3924 |
Downloading nvidia-cufile-cu12
|
| 3925 |
Downloading kiwisolver
|
| 3926 |
Downloading setuptools
|
|
|
|
| 3927 |
Downloading networkx
|
| 3928 |
+
Downloading fonttools
|
| 3929 |
Downloading pillow
|
| 3930 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3931 |
Downloading nvidia-cuda-cupti-cu12
|
|
|
|
| 3941 |
Downloading nvidia-cusparselt-cu12
|
| 3942 |
Downloading nvidia-cusparse-cu12
|
| 3943 |
Downloading nvidia-nccl-cu12
|
|
|
|
| 3944 |
Downloading nvidia-cudnn-cu12
|
| 3945 |
+
Downloading nvidia-cublas-cu12
|
| 3946 |
Downloading torch
|
| 3947 |
+
Installed 37 packages in 548ms
|
| 3948 |
</div>
|
| 3949 |
</div>
|
| 3950 |
<div class="cell-artifacts">
|
flash_attn/impls/hf_kernels_flash_attn.html
CHANGED
|
@@ -719,6 +719,41 @@
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
.cell-failed {
|
| 723 |
border-color: var(--border-cell-failed);
|
| 724 |
}
|
|
@@ -3710,7 +3745,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
-
Cell: benchmark |
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3735,7 +3770,7 @@ Cell: benchmark | 38.08s
|
|
| 3735 |
<span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
|
| 3736 |
<span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>
|
| 3737 |
|
| 3738 |
-
<span class="n">hf_kernels_flash_attn</span> <span class="o">=</span> <span class="n">get_kernel</span><span class="p">(</span><span class="s2">"kernels-community/flash-attn"</span><span class="p">)</span>
|
| 3739 |
|
| 3740 |
|
| 3741 |
<span class="k">def</span><span class="w"> </span><span class="nf">hf_flash_attention</span><span class="p">(</span><span class="n">query</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
|
|
@@ -3797,39 +3832,39 @@ Cell: benchmark | 38.08s
|
|
| 3797 |
<div id="output-benchmark" class="cell-output">
|
| 3798 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3799 |
hf_kernels_flash_attn flux_L128 0.34 True
|
| 3800 |
-
hf_kernels_flash_attn flux_L256 0.
|
| 3801 |
hf_kernels_flash_attn flux_L320 0.49 True
|
| 3802 |
hf_kernels_flash_attn flux_L384 0.51 True
|
| 3803 |
-
hf_kernels_flash_attn flux_L448 0.
|
| 3804 |
-
hf_kernels_flash_attn flux_L512 0.
|
| 3805 |
</div>
|
| 3806 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3807 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3808 |
<div class="uv-logs-content" style="display: none;">
|
| 3809 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3810 |
-
Downloading
|
| 3811 |
-
Downloading
|
| 3812 |
-
Downloading
|
| 3813 |
-
Downloading
|
| 3814 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3815 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3816 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3817 |
Downloading networkx (1.9MiB)
|
| 3818 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3819 |
Downloading torch (846.9MiB)
|
| 3820 |
-
Downloading
|
| 3821 |
-
Downloading triton (148.3MiB)
|
| 3822 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3823 |
-
Downloading sympy (6.0MiB)
|
| 3824 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3825 |
-
Downloading nvidia-
|
|
|
|
| 3826 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3827 |
-
Downloading
|
|
|
|
| 3828 |
Downloading kiwisolver (1.4MiB)
|
| 3829 |
-
Downloading
|
| 3830 |
Downloading pillow (6.3MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3831 |
Downloading fonttools (4.7MiB)
|
| 3832 |
-
Downloading
|
| 3833 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3834 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3835 |
Downloading nvidia-cufile-cu12
|
|
@@ -3840,8 +3875,8 @@ Downloading matplotlib (8.3MiB)
|
|
| 3840 |
Downloading fonttools
|
| 3841 |
Downloading pillow
|
| 3842 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3843 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 3844 |
Downloading matplotlib
|
|
|
|
| 3845 |
Downloading numpy
|
| 3846 |
Downloading sympy
|
| 3847 |
Downloading nvidia-nvjitlink-cu12
|
|
@@ -3850,19 +3885,19 @@ Downloading matplotlib (8.3MiB)
|
|
| 3850 |
Downloading triton
|
| 3851 |
Downloading nvidia-cufft-cu12
|
| 3852 |
Downloading nvidia-cusolver-cu12
|
| 3853 |
-
Downloading nvidia-cusparselt-cu12
|
| 3854 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 3855 |
Downloading nvidia-nccl-cu12
|
| 3856 |
Downloading nvidia-cublas-cu12
|
| 3857 |
Downloading nvidia-cudnn-cu12
|
| 3858 |
Downloading torch
|
| 3859 |
-
Installed 47 packages in
|
| 3860 |
</div>
|
| 3861 |
</div>
|
| 3862 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 3863 |
-
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:
|
| 3864 |
-
Fetching 20 files: 10%|█ | 2/20 [00:01<00:
|
| 3865 |
-
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00,
|
| 3866 |
<div class="cell-artifacts">
|
| 3867 |
<h4>Artifacts:</h4>
|
| 3868 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
|
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
| 722 |
+
/* CSV table styling */
|
| 723 |
+
.artifact-csv {
|
| 724 |
+
margin-top: 1rem;
|
| 725 |
+
overflow-x: auto;
|
| 726 |
+
}
|
| 727 |
+
.csv-table {
|
| 728 |
+
width: 100%;
|
| 729 |
+
border-collapse: collapse;
|
| 730 |
+
font-size: 0.9rem;
|
| 731 |
+
background: var(--bg-secondary);
|
| 732 |
+
border: 1px solid var(--border-primary);
|
| 733 |
+
border-radius: 1px;
|
| 734 |
+
}
|
| 735 |
+
.csv-table th,
|
| 736 |
+
.csv-table td {
|
| 737 |
+
padding: 0.5rem 0.75rem;
|
| 738 |
+
text-align: left;
|
| 739 |
+
border: 1px solid var(--border-primary);
|
| 740 |
+
}
|
| 741 |
+
.csv-table th {
|
| 742 |
+
background: var(--bg-tertiary);
|
| 743 |
+
font-weight: 600;
|
| 744 |
+
color: var(--text-primary);
|
| 745 |
+
}
|
| 746 |
+
.csv-table tbody tr:hover {
|
| 747 |
+
background: var(--bg-artifact-hover);
|
| 748 |
+
}
|
| 749 |
+
.artifact-csv-error {
|
| 750 |
+
margin-top: 1rem;
|
| 751 |
+
padding: 1rem;
|
| 752 |
+
background: var(--bg-error);
|
| 753 |
+
color: var(--text-error);
|
| 754 |
+
border: 1px solid var(--border-error);
|
| 755 |
+
border-radius: 1px;
|
| 756 |
+
}
|
| 757 |
.cell-failed {
|
| 758 |
border-color: var(--border-cell-failed);
|
| 759 |
}
|
|
|
|
| 3745 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3746 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3747 |
</span> |
|
| 3748 |
+
Cell: benchmark | 39.43s
|
| 3749 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3750 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3751 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3770 |
<span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
|
| 3771 |
<span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>
|
| 3772 |
|
| 3773 |
+
<span class="n">hf_kernels_flash_attn</span> <span class="o">=</span> <span class="n">get_kernel</span><span class="p">(</span><span class="s2">"kernels-community/flash-attn"</span><span class="p">,</span> <span class="n">revision</span><span class="o">=</span><span class="s2">"v0.0.2"</span><span class="p">)</span>
|
| 3774 |
|
| 3775 |
|
| 3776 |
<span class="k">def</span><span class="w"> </span><span class="nf">hf_flash_attention</span><span class="p">(</span><span class="n">query</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
|
|
|
|
| 3832 |
<div id="output-benchmark" class="cell-output">
|
| 3833 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3834 |
hf_kernels_flash_attn flux_L128 0.34 True
|
| 3835 |
+
hf_kernels_flash_attn flux_L256 0.38 True
|
| 3836 |
hf_kernels_flash_attn flux_L320 0.49 True
|
| 3837 |
hf_kernels_flash_attn flux_L384 0.51 True
|
| 3838 |
+
hf_kernels_flash_attn flux_L448 0.54 True
|
| 3839 |
+
hf_kernels_flash_attn flux_L512 0.55 True
|
| 3840 |
</div>
|
| 3841 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3842 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3843 |
<div class="uv-logs-content" style="display: none;">
|
| 3844 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3845 |
+
Downloading sympy (6.0MiB)
|
| 3846 |
+
Downloading matplotlib (8.3MiB)
|
| 3847 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3848 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
| 3849 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3850 |
Downloading networkx (1.9MiB)
|
|
|
|
| 3851 |
Downloading torch (846.9MiB)
|
| 3852 |
+
Downloading setuptools (1.1MiB)
|
|
|
|
|
|
|
|
|
|
| 3853 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3854 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3855 |
+
Downloading triton (148.3MiB)
|
| 3856 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3857 |
+
Downloading numpy (16.2MiB)
|
| 3858 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3859 |
Downloading kiwisolver (1.4MiB)
|
| 3860 |
+
Downloading hf-xet (3.0MiB)
|
| 3861 |
Downloading pillow (6.3MiB)
|
| 3862 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3863 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3864 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3865 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3866 |
Downloading fonttools (4.7MiB)
|
| 3867 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3868 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3869 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3870 |
Downloading nvidia-cufile-cu12
|
|
|
|
| 3875 |
Downloading fonttools
|
| 3876 |
Downloading pillow
|
| 3877 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
|
|
|
| 3878 |
Downloading matplotlib
|
| 3879 |
+
Downloading nvidia-cuda-cupti-cu12
|
| 3880 |
Downloading numpy
|
| 3881 |
Downloading sympy
|
| 3882 |
Downloading nvidia-nvjitlink-cu12
|
|
|
|
| 3885 |
Downloading triton
|
| 3886 |
Downloading nvidia-cufft-cu12
|
| 3887 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 3888 |
Downloading nvidia-cusparse-cu12
|
| 3889 |
+
Downloading nvidia-cusparselt-cu12
|
| 3890 |
Downloading nvidia-nccl-cu12
|
| 3891 |
Downloading nvidia-cublas-cu12
|
| 3892 |
Downloading nvidia-cudnn-cu12
|
| 3893 |
Downloading torch
|
| 3894 |
+
Installed 47 packages in 552ms
|
| 3895 |
</div>
|
| 3896 |
</div>
|
| 3897 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 3898 |
+
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:03, 5.41it/s]
|
| 3899 |
+
Fetching 20 files: 10%|█ | 2/20 [00:01<00:16, 1.09it/s]
|
| 3900 |
+
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 12.37it/s]</div>
|
| 3901 |
<div class="cell-artifacts">
|
| 3902 |
<h4>Artifacts:</h4>
|
| 3903 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
flash_attn/impls/hf_kernels_flash_attn3.html
CHANGED
|
@@ -719,6 +719,41 @@
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
.cell-failed {
|
| 723 |
border-color: var(--border-cell-failed);
|
| 724 |
}
|
|
@@ -3710,7 +3745,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
-
Cell: benchmark |
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3798,7 +3833,7 @@ Cell: benchmark | 41.76s
|
|
| 3798 |
hf_kernels_flash_attn3 flux_L128 0.36 True
|
| 3799 |
hf_kernels_flash_attn3 flux_L256 0.39 True
|
| 3800 |
hf_kernels_flash_attn3 flux_L320 0.52 True
|
| 3801 |
-
hf_kernels_flash_attn3 flux_L384 0.
|
| 3802 |
hf_kernels_flash_attn3 flux_L448 0.57 True
|
| 3803 |
hf_kernels_flash_attn3 flux_L512 0.57 True
|
| 3804 |
</div>
|
|
@@ -3806,29 +3841,29 @@ hf_kernels_flash_attn3 flux_L512 0.57 True
|
|
| 3806 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3807 |
<div class="uv-logs-content" style="display: none;">
|
| 3808 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3809 |
-
Downloading sympy (6.0MiB)
|
| 3810 |
-
Downloading networkx (1.9MiB)
|
| 3811 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3812 |
-
Downloading
|
| 3813 |
Downloading setuptools (1.1MiB)
|
| 3814 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3815 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3816 |
-
Downloading
|
|
|
|
| 3817 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3818 |
-
Downloading nvidia-
|
| 3819 |
-
Downloading
|
| 3820 |
-
Downloading
|
| 3821 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3822 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3823 |
-
Downloading hf-xet (3.0MiB)
|
| 3824 |
-
Downloading pillow (6.3MiB)
|
| 3825 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
|
|
|
| 3826 |
Downloading kiwisolver (1.4MiB)
|
| 3827 |
-
Downloading
|
| 3828 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3829 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3830 |
-
Downloading triton (148.3MiB)
|
| 3831 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3832 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3833 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3834 |
Downloading nvidia-cufile-cu12
|
|
@@ -3849,19 +3884,19 @@ Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
| 3849 |
Downloading triton
|
| 3850 |
Downloading nvidia-cufft-cu12
|
| 3851 |
Downloading nvidia-cusolver-cu12
|
| 3852 |
-
Downloading nvidia-cusparselt-cu12
|
| 3853 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 3854 |
Downloading nvidia-nccl-cu12
|
| 3855 |
Downloading nvidia-cublas-cu12
|
| 3856 |
Downloading nvidia-cudnn-cu12
|
| 3857 |
Downloading torch
|
| 3858 |
-
Installed 47 packages in
|
| 3859 |
</div>
|
| 3860 |
</div>
|
| 3861 |
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 3862 |
-
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00,
|
| 3863 |
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.09it/s]
|
| 3864 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.
|
| 3865 |
<div class="cell-artifacts">
|
| 3866 |
<h4>Artifacts:</h4>
|
| 3867 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
|
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
| 722 |
+
/* CSV table styling */
|
| 723 |
+
.artifact-csv {
|
| 724 |
+
margin-top: 1rem;
|
| 725 |
+
overflow-x: auto;
|
| 726 |
+
}
|
| 727 |
+
.csv-table {
|
| 728 |
+
width: 100%;
|
| 729 |
+
border-collapse: collapse;
|
| 730 |
+
font-size: 0.9rem;
|
| 731 |
+
background: var(--bg-secondary);
|
| 732 |
+
border: 1px solid var(--border-primary);
|
| 733 |
+
border-radius: 1px;
|
| 734 |
+
}
|
| 735 |
+
.csv-table th,
|
| 736 |
+
.csv-table td {
|
| 737 |
+
padding: 0.5rem 0.75rem;
|
| 738 |
+
text-align: left;
|
| 739 |
+
border: 1px solid var(--border-primary);
|
| 740 |
+
}
|
| 741 |
+
.csv-table th {
|
| 742 |
+
background: var(--bg-tertiary);
|
| 743 |
+
font-weight: 600;
|
| 744 |
+
color: var(--text-primary);
|
| 745 |
+
}
|
| 746 |
+
.csv-table tbody tr:hover {
|
| 747 |
+
background: var(--bg-artifact-hover);
|
| 748 |
+
}
|
| 749 |
+
.artifact-csv-error {
|
| 750 |
+
margin-top: 1rem;
|
| 751 |
+
padding: 1rem;
|
| 752 |
+
background: var(--bg-error);
|
| 753 |
+
color: var(--text-error);
|
| 754 |
+
border: 1px solid var(--border-error);
|
| 755 |
+
border-radius: 1px;
|
| 756 |
+
}
|
| 757 |
.cell-failed {
|
| 758 |
border-color: var(--border-cell-failed);
|
| 759 |
}
|
|
|
|
| 3745 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3746 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3747 |
</span> |
|
| 3748 |
+
Cell: benchmark | 39.41s
|
| 3749 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3750 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3751 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3833 |
hf_kernels_flash_attn3 flux_L128 0.36 True
|
| 3834 |
hf_kernels_flash_attn3 flux_L256 0.39 True
|
| 3835 |
hf_kernels_flash_attn3 flux_L320 0.52 True
|
| 3836 |
+
hf_kernels_flash_attn3 flux_L384 0.52 True
|
| 3837 |
hf_kernels_flash_attn3 flux_L448 0.57 True
|
| 3838 |
hf_kernels_flash_attn3 flux_L512 0.57 True
|
| 3839 |
</div>
|
|
|
|
| 3841 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3842 |
<div class="uv-logs-content" style="display: none;">
|
| 3843 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
|
|
|
|
|
|
| 3844 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3845 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3846 |
Downloading setuptools (1.1MiB)
|
| 3847 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3848 |
+
Downloading pillow (6.3MiB)
|
| 3849 |
+
Downloading numpy (16.2MiB)
|
| 3850 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3851 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3852 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3853 |
+
Downloading networkx (1.9MiB)
|
| 3854 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3855 |
+
Downloading sympy (6.0MiB)
|
| 3856 |
+
Downloading hf-xet (3.0MiB)
|
| 3857 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3858 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3859 |
+
Downloading torch (846.9MiB)
|
| 3860 |
+
Downloading triton (148.3MiB)
|
|
|
|
| 3861 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
| 3862 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3863 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3864 |
+
Downloading fonttools (4.7MiB)
|
| 3865 |
Downloading kiwisolver (1.4MiB)
|
| 3866 |
+
Downloading matplotlib (8.3MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3867 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3868 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3869 |
Downloading nvidia-cufile-cu12
|
|
|
|
| 3884 |
Downloading triton
|
| 3885 |
Downloading nvidia-cufft-cu12
|
| 3886 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 3887 |
Downloading nvidia-cusparse-cu12
|
| 3888 |
+
Downloading nvidia-cusparselt-cu12
|
| 3889 |
Downloading nvidia-nccl-cu12
|
| 3890 |
Downloading nvidia-cublas-cu12
|
| 3891 |
Downloading nvidia-cudnn-cu12
|
| 3892 |
Downloading torch
|
| 3893 |
+
Installed 47 packages in 529ms
|
| 3894 |
</div>
|
| 3895 |
</div>
|
| 3896 |
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 3897 |
+
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 6.35it/s]
|
| 3898 |
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.09it/s]
|
| 3899 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.48it/s]</div>
|
| 3900 |
<div class="cell-artifacts">
|
| 3901 |
<h4>Artifacts:</h4>
|
| 3902 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
flash_attn/impls/mem_efficient_attention.html
CHANGED
|
@@ -719,6 +719,41 @@
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
.cell-failed {
|
| 723 |
border-color: var(--border-cell-failed);
|
| 724 |
}
|
|
@@ -3710,7 +3745,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
-
Cell: benchmark |
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3803,35 +3838,35 @@ torch_mem_eff flux_L512 0.95 True
|
|
| 3803 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3804 |
<div class="uv-logs-content" style="display: none;">
|
| 3805 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3806 |
-
Downloading
|
| 3807 |
-
Downloading
|
| 3808 |
-
Downloading
|
| 3809 |
Downloading kiwisolver (1.4MiB)
|
| 3810 |
-
Downloading nvidia-
|
|
|
|
|
|
|
| 3811 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3812 |
-
Downloading
|
| 3813 |
-
Downloading sympy (6.0MiB)
|
| 3814 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3815 |
-
Downloading
|
| 3816 |
-
Downloading
|
| 3817 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
| 3818 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3819 |
-
Downloading
|
| 3820 |
-
Downloading matplotlib (8.3MiB)
|
| 3821 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3822 |
-
Downloading
|
| 3823 |
-
Downloading nvidia-
|
| 3824 |
-
Downloading nvidia-
|
| 3825 |
-
Downloading torch (846.9MiB)
|
| 3826 |
Downloading triton (148.3MiB)
|
| 3827 |
-
Downloading pillow (6.3MiB)
|
| 3828 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3829 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3830 |
Downloading nvidia-cufile-cu12
|
| 3831 |
Downloading kiwisolver
|
| 3832 |
Downloading setuptools
|
| 3833 |
-
Downloading fonttools
|
| 3834 |
Downloading networkx
|
|
|
|
| 3835 |
Downloading pillow
|
| 3836 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3837 |
Downloading nvidia-cuda-cupti-cu12
|
|
@@ -3850,7 +3885,7 @@ Downloading pillow (6.3MiB)
|
|
| 3850 |
Downloading nvidia-cublas-cu12
|
| 3851 |
Downloading nvidia-cudnn-cu12
|
| 3852 |
Downloading torch
|
| 3853 |
-
Installed 37 packages in
|
| 3854 |
</div>
|
| 3855 |
</div>
|
| 3856 |
<div class="cell-artifacts">
|
|
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
| 722 |
+
/* CSV table styling */
|
| 723 |
+
.artifact-csv {
|
| 724 |
+
margin-top: 1rem;
|
| 725 |
+
overflow-x: auto;
|
| 726 |
+
}
|
| 727 |
+
.csv-table {
|
| 728 |
+
width: 100%;
|
| 729 |
+
border-collapse: collapse;
|
| 730 |
+
font-size: 0.9rem;
|
| 731 |
+
background: var(--bg-secondary);
|
| 732 |
+
border: 1px solid var(--border-primary);
|
| 733 |
+
border-radius: 1px;
|
| 734 |
+
}
|
| 735 |
+
.csv-table th,
|
| 736 |
+
.csv-table td {
|
| 737 |
+
padding: 0.5rem 0.75rem;
|
| 738 |
+
text-align: left;
|
| 739 |
+
border: 1px solid var(--border-primary);
|
| 740 |
+
}
|
| 741 |
+
.csv-table th {
|
| 742 |
+
background: var(--bg-tertiary);
|
| 743 |
+
font-weight: 600;
|
| 744 |
+
color: var(--text-primary);
|
| 745 |
+
}
|
| 746 |
+
.csv-table tbody tr:hover {
|
| 747 |
+
background: var(--bg-artifact-hover);
|
| 748 |
+
}
|
| 749 |
+
.artifact-csv-error {
|
| 750 |
+
margin-top: 1rem;
|
| 751 |
+
padding: 1rem;
|
| 752 |
+
background: var(--bg-error);
|
| 753 |
+
color: var(--text-error);
|
| 754 |
+
border: 1px solid var(--border-error);
|
| 755 |
+
border-radius: 1px;
|
| 756 |
+
}
|
| 757 |
.cell-failed {
|
| 758 |
border-color: var(--border-cell-failed);
|
| 759 |
}
|
|
|
|
| 3745 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3746 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3747 |
</span> |
|
| 3748 |
+
Cell: benchmark | 36.09s
|
| 3749 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3750 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3751 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3838 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3839 |
<div class="uv-logs-content" style="display: none;">
|
| 3840 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3841 |
+
Downloading sympy (6.0MiB)
|
| 3842 |
+
Downloading setuptools (1.1MiB)
|
| 3843 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3844 |
Downloading kiwisolver (1.4MiB)
|
| 3845 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3846 |
+
Downloading torch (846.9MiB)
|
| 3847 |
+
Downloading matplotlib (8.3MiB)
|
| 3848 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3849 |
+
Downloading pillow (6.3MiB)
|
|
|
|
| 3850 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3851 |
+
Downloading networkx (1.9MiB)
|
| 3852 |
+
Downloading numpy (16.2MiB)
|
| 3853 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3854 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3855 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3856 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3857 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
| 3858 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3859 |
+
Downloading fonttools (4.7MiB)
|
| 3860 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3861 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
| 3862 |
Downloading triton (148.3MiB)
|
|
|
|
| 3863 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3864 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3865 |
Downloading nvidia-cufile-cu12
|
| 3866 |
Downloading kiwisolver
|
| 3867 |
Downloading setuptools
|
|
|
|
| 3868 |
Downloading networkx
|
| 3869 |
+
Downloading fonttools
|
| 3870 |
Downloading pillow
|
| 3871 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3872 |
Downloading nvidia-cuda-cupti-cu12
|
|
|
|
| 3885 |
Downloading nvidia-cublas-cu12
|
| 3886 |
Downloading nvidia-cudnn-cu12
|
| 3887 |
Downloading torch
|
| 3888 |
+
Installed 37 packages in 447ms
|
| 3889 |
</div>
|
| 3890 |
</div>
|
| 3891 |
<div class="cell-artifacts">
|
flash_attn/impls/sage_attention.html
CHANGED
|
@@ -719,6 +719,41 @@
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
.cell-failed {
|
| 723 |
border-color: var(--border-cell-failed);
|
| 724 |
}
|
|
@@ -3710,7 +3745,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
-
Cell: benchmark | 40.
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3804,53 +3839,53 @@ Cell: benchmark | 40.43s
|
|
| 3804 |
<div id="output-benchmark" class="cell-output">
|
| 3805 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3806 |
sage_int8_fp16 flux_L128 FAIL False
|
| 3807 |
-
Error: module '
|
| 3808 |
sage_int8_fp16 flux_L256 FAIL False
|
| 3809 |
-
Error: module '
|
| 3810 |
sage_int8_fp16 flux_L320 FAIL False
|
| 3811 |
-
Error: module '
|
| 3812 |
sage_int8_fp16 flux_L384 FAIL False
|
| 3813 |
-
Error: module '
|
| 3814 |
sage_int8_fp16 flux_L448 FAIL False
|
| 3815 |
-
Error: module '
|
| 3816 |
sage_int8_fp16 flux_L512 FAIL False
|
| 3817 |
-
Error: module '
|
| 3818 |
</div>
|
| 3819 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3820 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3821 |
<div class="uv-logs-content" style="display: none;">
|
| 3822 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
|
|
|
| 3823 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3824 |
-
Downloading nvidia-
|
| 3825 |
-
Downloading networkx (1.9MiB)
|
| 3826 |
-
Downloading setuptools (1.1MiB)
|
| 3827 |
Downloading numpy (16.2MiB)
|
| 3828 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3829 |
-
Downloading
|
| 3830 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3831 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3832 |
Downloading hf-xet (3.0MiB)
|
| 3833 |
-
Downloading torch (846.9MiB)
|
| 3834 |
-
Downloading triton (148.3MiB)
|
| 3835 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3836 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3837 |
-
Downloading
|
| 3838 |
Downloading pillow (6.3MiB)
|
| 3839 |
-
Downloading sympy (6.0MiB)
|
| 3840 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3841 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3842 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3843 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
| 3844 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3845 |
Downloading fonttools (4.7MiB)
|
|
|
|
|
|
|
| 3846 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3847 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3848 |
Downloading nvidia-cufile-cu12
|
| 3849 |
Downloading kiwisolver
|
| 3850 |
Downloading hf-xet
|
| 3851 |
Downloading setuptools
|
| 3852 |
-
Downloading networkx
|
| 3853 |
Downloading fonttools
|
|
|
|
| 3854 |
Downloading pillow
|
| 3855 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3856 |
Downloading nvidia-cuda-cupti-cu12
|
|
@@ -3863,19 +3898,20 @@ Downloading fonttools (4.7MiB)
|
|
| 3863 |
Downloading triton
|
| 3864 |
Downloading nvidia-cufft-cu12
|
| 3865 |
Downloading nvidia-cusolver-cu12
|
| 3866 |
-
Downloading nvidia-cusparse-cu12
|
| 3867 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 3868 |
Downloading nvidia-nccl-cu12
|
| 3869 |
Downloading nvidia-cublas-cu12
|
| 3870 |
Downloading nvidia-cudnn-cu12
|
| 3871 |
Downloading torch
|
| 3872 |
-
Installed 48 packages in
|
| 3873 |
</div>
|
| 3874 |
</div>
|
| 3875 |
<div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
|
| 3876 |
-
Fetching 11 files: 9%|▉ | 1/11 [00:00<00:
|
| 3877 |
-
Fetching 11 files:
|
| 3878 |
-
Fetching 11 files:
|
|
|
|
| 3879 |
<div class="cell-artifacts">
|
| 3880 |
<h4>Artifacts:</h4>
|
| 3881 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
|
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
| 722 |
+
/* CSV table styling */
|
| 723 |
+
.artifact-csv {
|
| 724 |
+
margin-top: 1rem;
|
| 725 |
+
overflow-x: auto;
|
| 726 |
+
}
|
| 727 |
+
.csv-table {
|
| 728 |
+
width: 100%;
|
| 729 |
+
border-collapse: collapse;
|
| 730 |
+
font-size: 0.9rem;
|
| 731 |
+
background: var(--bg-secondary);
|
| 732 |
+
border: 1px solid var(--border-primary);
|
| 733 |
+
border-radius: 1px;
|
| 734 |
+
}
|
| 735 |
+
.csv-table th,
|
| 736 |
+
.csv-table td {
|
| 737 |
+
padding: 0.5rem 0.75rem;
|
| 738 |
+
text-align: left;
|
| 739 |
+
border: 1px solid var(--border-primary);
|
| 740 |
+
}
|
| 741 |
+
.csv-table th {
|
| 742 |
+
background: var(--bg-tertiary);
|
| 743 |
+
font-weight: 600;
|
| 744 |
+
color: var(--text-primary);
|
| 745 |
+
}
|
| 746 |
+
.csv-table tbody tr:hover {
|
| 747 |
+
background: var(--bg-artifact-hover);
|
| 748 |
+
}
|
| 749 |
+
.artifact-csv-error {
|
| 750 |
+
margin-top: 1rem;
|
| 751 |
+
padding: 1rem;
|
| 752 |
+
background: var(--bg-error);
|
| 753 |
+
color: var(--text-error);
|
| 754 |
+
border: 1px solid var(--border-error);
|
| 755 |
+
border-radius: 1px;
|
| 756 |
+
}
|
| 757 |
.cell-failed {
|
| 758 |
border-color: var(--border-cell-failed);
|
| 759 |
}
|
|
|
|
| 3745 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3746 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3747 |
</span> |
|
| 3748 |
+
Cell: benchmark | 40.08s
|
| 3749 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3750 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3751 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3839 |
<div id="output-benchmark" class="cell-output">
|
| 3840 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3841 |
sage_int8_fp16 flux_L128 FAIL False
|
| 3842 |
+
Error: module 'sage_attention_ba12545b014364be' has no attribute 'fwd'
|
| 3843 |
sage_int8_fp16 flux_L256 FAIL False
|
| 3844 |
+
Error: module 'sage_attention_ba12545b014364be' has no attribute 'fwd'
|
| 3845 |
sage_int8_fp16 flux_L320 FAIL False
|
| 3846 |
+
Error: module 'sage_attention_ba12545b014364be' has no attribute 'fwd'
|
| 3847 |
sage_int8_fp16 flux_L384 FAIL False
|
| 3848 |
+
Error: module 'sage_attention_ba12545b014364be' has no attribute 'fwd'
|
| 3849 |
sage_int8_fp16 flux_L448 FAIL False
|
| 3850 |
+
Error: module 'sage_attention_ba12545b014364be' has no attribute 'fwd'
|
| 3851 |
sage_int8_fp16 flux_L512 FAIL False
|
| 3852 |
+
Error: module 'sage_attention_ba12545b014364be' has no attribute 'fwd'
|
| 3853 |
</div>
|
| 3854 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3855 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3856 |
<div class="uv-logs-content" style="display: none;">
|
| 3857 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3858 |
+
Downloading sympy (6.0MiB)
|
| 3859 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3860 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
| 3861 |
Downloading numpy (16.2MiB)
|
| 3862 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3863 |
+
Downloading networkx (1.9MiB)
|
|
|
|
|
|
|
| 3864 |
Downloading hf-xet (3.0MiB)
|
|
|
|
|
|
|
|
|
|
| 3865 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3866 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3867 |
Downloading pillow (6.3MiB)
|
|
|
|
|
|
|
|
|
|
| 3868 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3869 |
+
Downloading triton (148.3MiB)
|
| 3870 |
+
Downloading setuptools (1.1MiB)
|
| 3871 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3872 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3873 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3874 |
+
Downloading matplotlib (8.3MiB)
|
| 3875 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3876 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3877 |
+
Downloading kiwisolver (1.4MiB)
|
| 3878 |
Downloading fonttools (4.7MiB)
|
| 3879 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3880 |
+
Downloading torch (846.9MiB)
|
| 3881 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3882 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3883 |
Downloading nvidia-cufile-cu12
|
| 3884 |
Downloading kiwisolver
|
| 3885 |
Downloading hf-xet
|
| 3886 |
Downloading setuptools
|
|
|
|
| 3887 |
Downloading fonttools
|
| 3888 |
+
Downloading networkx
|
| 3889 |
Downloading pillow
|
| 3890 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3891 |
Downloading nvidia-cuda-cupti-cu12
|
|
|
|
| 3898 |
Downloading triton
|
| 3899 |
Downloading nvidia-cufft-cu12
|
| 3900 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 3901 |
Downloading nvidia-cusparselt-cu12
|
| 3902 |
+
Downloading nvidia-cusparse-cu12
|
| 3903 |
Downloading nvidia-nccl-cu12
|
| 3904 |
Downloading nvidia-cublas-cu12
|
| 3905 |
Downloading nvidia-cudnn-cu12
|
| 3906 |
Downloading torch
|
| 3907 |
+
Installed 48 packages in 531ms
|
| 3908 |
</div>
|
| 3909 |
</div>
|
| 3910 |
<div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
|
| 3911 |
+
Fetching 11 files: 9%|▉ | 1/11 [00:00<00:02, 4.42it/s]
|
| 3912 |
+
Fetching 11 files: 27%|██▋ | 3/11 [00:00<00:01, 5.95it/s]
|
| 3913 |
+
Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 12.27it/s]
|
| 3914 |
+
Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 14.37it/s]</div>
|
| 3915 |
<div class="cell-artifacts">
|
| 3916 |
<h4>Artifacts:</h4>
|
| 3917 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
flash_attn/impls/xformers.html
CHANGED
|
@@ -719,6 +719,41 @@
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
.cell-failed {
|
| 723 |
border-color: var(--border-cell-failed);
|
| 724 |
}
|
|
@@ -3710,7 +3745,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
-
Cell: benchmark | 40.
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3797,35 +3832,35 @@ xformers_meff flux_L256 0.47 True
|
|
| 3797 |
xformers_meff flux_L320 0.60 True
|
| 3798 |
xformers_meff flux_L384 0.60 True
|
| 3799 |
xformers_meff flux_L448 0.64 True
|
| 3800 |
-
xformers_meff flux_L512 0.
|
| 3801 |
</div>
|
| 3802 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3803 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3804 |
<div class="uv-logs-content" style="display: none;">
|
| 3805 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3806 |
-
Downloading networkx (1.9MiB)
|
| 3807 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
|
|
|
| 3808 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3809 |
-
Downloading nvidia-
|
| 3810 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3811 |
-
Downloading triton (148.3MiB)
|
| 3812 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3813 |
-
Downloading pillow (6.3MiB)
|
| 3814 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3815 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3816 |
-
Downloading nvidia-
|
| 3817 |
-
Downloading nvidia-
|
| 3818 |
Downloading numpy (16.2MiB)
|
| 3819 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3820 |
-
Downloading sympy (6.0MiB)
|
| 3821 |
-
Downloading matplotlib (8.3MiB)
|
| 3822 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3823 |
-
Downloading xformers (111.8MiB)
|
| 3824 |
Downloading setuptools (1.1MiB)
|
|
|
|
| 3825 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
| 3826 |
Downloading kiwisolver (1.4MiB)
|
| 3827 |
-
Downloading
|
| 3828 |
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3829 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3830 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3831 |
Downloading nvidia-cufile-cu12
|
|
@@ -3837,8 +3872,8 @@ Downloading torch (846.9MiB)
|
|
| 3837 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3838 |
Downloading nvidia-cuda-cupti-cu12
|
| 3839 |
Downloading matplotlib
|
| 3840 |
-
Downloading numpy
|
| 3841 |
Downloading sympy
|
|
|
|
| 3842 |
Downloading nvidia-nvjitlink-cu12
|
| 3843 |
Downloading nvidia-curand-cu12
|
| 3844 |
Downloading nvidia-cuda-nvrtc-cu12
|
|
@@ -3846,13 +3881,13 @@ Downloading torch (846.9MiB)
|
|
| 3846 |
Downloading triton
|
| 3847 |
Downloading nvidia-cufft-cu12
|
| 3848 |
Downloading nvidia-cusolver-cu12
|
| 3849 |
-
Downloading nvidia-cusparse-cu12
|
| 3850 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 3851 |
Downloading nvidia-nccl-cu12
|
| 3852 |
Downloading nvidia-cublas-cu12
|
| 3853 |
Downloading nvidia-cudnn-cu12
|
| 3854 |
Downloading torch
|
| 3855 |
-
Installed 38 packages in
|
| 3856 |
</div>
|
| 3857 |
</div>
|
| 3858 |
<div class="cell-artifacts">
|
|
|
|
| 719 |
.artifact-preview svg {
|
| 720 |
background: transparent;
|
| 721 |
}
|
| 722 |
+
/* CSV table styling */
|
| 723 |
+
.artifact-csv {
|
| 724 |
+
margin-top: 1rem;
|
| 725 |
+
overflow-x: auto;
|
| 726 |
+
}
|
| 727 |
+
.csv-table {
|
| 728 |
+
width: 100%;
|
| 729 |
+
border-collapse: collapse;
|
| 730 |
+
font-size: 0.9rem;
|
| 731 |
+
background: var(--bg-secondary);
|
| 732 |
+
border: 1px solid var(--border-primary);
|
| 733 |
+
border-radius: 1px;
|
| 734 |
+
}
|
| 735 |
+
.csv-table th,
|
| 736 |
+
.csv-table td {
|
| 737 |
+
padding: 0.5rem 0.75rem;
|
| 738 |
+
text-align: left;
|
| 739 |
+
border: 1px solid var(--border-primary);
|
| 740 |
+
}
|
| 741 |
+
.csv-table th {
|
| 742 |
+
background: var(--bg-tertiary);
|
| 743 |
+
font-weight: 600;
|
| 744 |
+
color: var(--text-primary);
|
| 745 |
+
}
|
| 746 |
+
.csv-table tbody tr:hover {
|
| 747 |
+
background: var(--bg-artifact-hover);
|
| 748 |
+
}
|
| 749 |
+
.artifact-csv-error {
|
| 750 |
+
margin-top: 1rem;
|
| 751 |
+
padding: 1rem;
|
| 752 |
+
background: var(--bg-error);
|
| 753 |
+
color: var(--text-error);
|
| 754 |
+
border: 1px solid var(--border-error);
|
| 755 |
+
border-radius: 1px;
|
| 756 |
+
}
|
| 757 |
.cell-failed {
|
| 758 |
border-color: var(--border-cell-failed);
|
| 759 |
}
|
|
|
|
| 3745 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3746 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3747 |
</span> |
|
| 3748 |
+
Cell: benchmark | 40.41s
|
| 3749 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3750 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3751 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3832 |
xformers_meff flux_L320 0.60 True
|
| 3833 |
xformers_meff flux_L384 0.60 True
|
| 3834 |
xformers_meff flux_L448 0.64 True
|
| 3835 |
+
xformers_meff flux_L512 0.65 True
|
| 3836 |
</div>
|
| 3837 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3838 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3839 |
<div class="uv-logs-content" style="display: none;">
|
| 3840 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
|
|
|
| 3841 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3842 |
+
Downloading pillow (6.3MiB)
|
| 3843 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3844 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3845 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3846 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3847 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3848 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3849 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3850 |
Downloading numpy (16.2MiB)
|
| 3851 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3852 |
Downloading setuptools (1.1MiB)
|
| 3853 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3854 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3855 |
+
Downloading networkx (1.9MiB)
|
| 3856 |
Downloading kiwisolver (1.4MiB)
|
| 3857 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3858 |
Downloading torch (846.9MiB)
|
| 3859 |
+
Downloading matplotlib (8.3MiB)
|
| 3860 |
+
Downloading triton (148.3MiB)
|
| 3861 |
+
Downloading sympy (6.0MiB)
|
| 3862 |
+
Downloading fonttools (4.7MiB)
|
| 3863 |
+
Downloading xformers (111.8MiB)
|
| 3864 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3865 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3866 |
Downloading nvidia-cufile-cu12
|
|
|
|
| 3872 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3873 |
Downloading nvidia-cuda-cupti-cu12
|
| 3874 |
Downloading matplotlib
|
|
|
|
| 3875 |
Downloading sympy
|
| 3876 |
+
Downloading numpy
|
| 3877 |
Downloading nvidia-nvjitlink-cu12
|
| 3878 |
Downloading nvidia-curand-cu12
|
| 3879 |
Downloading nvidia-cuda-nvrtc-cu12
|
|
|
|
| 3881 |
Downloading triton
|
| 3882 |
Downloading nvidia-cufft-cu12
|
| 3883 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 3884 |
Downloading nvidia-cusparselt-cu12
|
| 3885 |
+
Downloading nvidia-cusparse-cu12
|
| 3886 |
Downloading nvidia-nccl-cu12
|
| 3887 |
Downloading nvidia-cublas-cu12
|
| 3888 |
Downloading nvidia-cudnn-cu12
|
| 3889 |
Downloading torch
|
| 3890 |
+
Installed 38 packages in 452ms
|
| 3891 |
</div>
|
| 3892 |
</div>
|
| 3893 |
<div class="cell-artifacts">
|
flash_attn/results/artifacts/combine/latency.csv
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Implementation,Impl ID,Workload,Batch,Seq Length,Heads,Head Dim,Dtype,Mean (ms),P10 (ms),P50 (ms),P90 (ms),Reps,Peak Mem (MB),Backend,Family
|
| 2 |
+
Flash (PyTorch SDPA),torch_flash_ma,flux_L128,1,1152,24,128,bfloat16,0.48577280044555665,0.47836801409721375,0.4803520143032074,0.4827199876308441,5,83.38,FLASH,torch-sdpa
|
| 3 |
+
Flash (PyTorch SDPA),torch_flash_ma,flux_L256,1,1280,24,128,bfloat16,0.5229184031486511,0.521727979183197,0.5228800177574158,0.5234559774398804,5,90.62,FLASH,torch-sdpa
|
| 4 |
+
Flash (PyTorch SDPA),torch_flash_ma,flux_L320,1,1344,24,128,bfloat16,0.6515967845916748,0.6503999829292297,0.650879979133606,0.6513599753379822,5,95.06,FLASH,torch-sdpa
|
| 5 |
+
Flash (PyTorch SDPA),torch_flash_ma,flux_L384,1,1408,24,128,bfloat16,0.6807615995407105,0.6797440052032471,0.6808639764785767,0.6815680265426636,5,99.88,FLASH,torch-sdpa
|
| 6 |
+
Flash (PyTorch SDPA),torch_flash_ma,flux_L448,1,1472,24,128,bfloat16,0.711027193069458,0.7058879733085632,0.7121919989585876,0.7131519913673401,5,103.81,FLASH,torch-sdpa
|
| 7 |
+
Flash (PyTorch SDPA),torch_flash_ma,flux_L512,1,1536,24,128,bfloat16,0.7391423940658569,0.7369279861450195,0.7383999824523926,0.7408959865570068,5,109.12,FLASH,torch-sdpa
|
| 8 |
+
MemEff (PyTorch SDPA),torch_mem_eff,flux_L128,1,1152,24,128,bfloat16,0.5875647902488709,0.5863680243492126,0.5874559879302979,0.5876479744911194,5,83.38,EFFICIENT,torch-sdpa
|
| 9 |
+
MemEff (PyTorch SDPA),torch_mem_eff,flux_L256,1,1280,24,128,bfloat16,0.653657603263855,0.6485440135002136,0.6537600159645081,0.656544029712677,5,90.62,EFFICIENT,torch-sdpa
|
| 10 |
+
MemEff (PyTorch SDPA),torch_mem_eff,flux_L320,1,1344,24,128,bfloat16,0.7784512042999268,0.774944007396698,0.778656005859375,0.7801600098609924,5,95.94,EFFICIENT,torch-sdpa
|
| 11 |
+
MemEff (PyTorch SDPA),torch_mem_eff,flux_L384,1,1408,24,128,bfloat16,0.7922943949699401,0.791263997554779,0.7924799919128418,0.7927039861679077,5,100.0,EFFICIENT,torch-sdpa
|
| 12 |
+
MemEff (PyTorch SDPA),torch_mem_eff,flux_L448,1,1472,24,128,bfloat16,0.848089587688446,0.8444799780845642,0.8470079898834229,0.8499199748039246,5,103.81,EFFICIENT,torch-sdpa
|
| 13 |
+
MemEff (PyTorch SDPA),torch_mem_eff,flux_L512,1,1536,24,128,bfloat16,0.9523776054382325,0.95004802942276,0.9519039988517761,0.9541119933128357,5,109.12,EFFICIENT,torch-sdpa
|
| 14 |
+
xFormers,xformers_meff,flux_L128,1,1152,24,128,bfloat16,0.45066879987716674,0.4474239945411682,0.44921600818634033,0.45241600275039673,5,83.38,memory_efficient,xformers
|
| 15 |
+
xFormers,xformers_meff,flux_L256,1,1280,24,128,bfloat16,0.47004159688949587,0.4652479887008667,0.4705919921398163,0.4716799855232239,5,90.62,memory_efficient,xformers
|
| 16 |
+
xFormers,xformers_meff,flux_L320,1,1344,24,128,bfloat16,0.6022783994674683,0.5987840294837952,0.6021760106086731,0.6045759916305542,5,95.06,memory_efficient,xformers
|
| 17 |
+
xFormers,xformers_meff,flux_L384,1,1408,24,128,bfloat16,0.6013055920600892,0.6000319719314575,0.600383996963501,0.6016640067100525,5,99.88,memory_efficient,xformers
|
| 18 |
+
xFormers,xformers_meff,flux_L448,1,1472,24,128,bfloat16,0.6408192038536071,0.639136016368866,0.6404479742050171,0.6416320204734802,5,103.81,memory_efficient,xformers
|
| 19 |
+
xFormers,xformers_meff,flux_L512,1,1536,24,128,bfloat16,0.6466111898422241,0.6447359919548035,0.6462399959564209,0.6483839750289917,5,109.12,memory_efficient,xformers
|
| 20 |
+
Compiled (default),torch_flash_compiled_default,flux_L128,1,1152,24,128,bfloat16,0.527347207069397,0.5194560289382935,0.5272960066795349,0.5312960147857666,5,83.38,FLASH,torch-sdpa
|
| 21 |
+
Compiled (default),torch_flash_compiled_default,flux_L256,1,1280,24,128,bfloat16,0.5586367964744567,0.5560640096664429,0.5571519732475281,0.5611839890480042,5,90.62,FLASH,torch-sdpa
|
| 22 |
+
Compiled (default),torch_flash_compiled_default,flux_L320,1,1344,24,128,bfloat16,0.6860736012458801,0.6841920018196106,0.6860160231590271,0.6869760155677795,5,95.25,FLASH,torch-sdpa
|
| 23 |
+
Compiled (default),torch_flash_compiled_default,flux_L384,1,1408,24,128,bfloat16,0.7167360067367554,0.7152000069618225,0.7161920070648193,0.7164160013198853,5,99.88,FLASH,torch-sdpa
|
| 24 |
+
Compiled (default),torch_flash_compiled_default,flux_L448,1,1472,24,128,bfloat16,0.7423295855522156,0.7400959730148315,0.742143988609314,0.7431039810180664,5,103.81,FLASH,torch-sdpa
|
| 25 |
+
Compiled (default),torch_flash_compiled_default,flux_L512,1,1536,24,128,bfloat16,0.7743871927261352,0.7718080282211304,0.7745919823646545,0.7748159766197205,5,109.12,FLASH,torch-sdpa
|
| 26 |
+
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L128,1,1152,24,128,bfloat16,0.6489088058471679,0.6148160099983215,0.6296960115432739,0.6522240042686462,5,67.5,FLASH,torch-sdpa
|
| 27 |
+
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L256,1,1280,24,128,bfloat16,0.700761592388153,0.6615359783172607,0.6821119785308838,0.7128959894180298,5,75.0,FLASH,torch-sdpa
|
| 28 |
+
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L320,1,1344,24,128,bfloat16,0.834444797039032,0.7967039942741394,0.8164799809455872,0.8463680148124695,5,80.38,FLASH,torch-sdpa
|
| 29 |
+
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L384,1,1408,24,128,bfloat16,0.8709375977516174,0.8432319760322571,0.8498560190200806,0.8750079870223999,5,82.5,FLASH,torch-sdpa
|
| 30 |
+
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L448,1,1472,24,128,bfloat16,0.9069631934165955,0.8775359988212585,0.9030719995498657,0.903872013092041,5,86.25,FLASH,torch-sdpa
|
| 31 |
+
Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L512,1,1536,24,128,bfloat16,0.9371584057807922,0.9145920276641846,0.9164159893989563,0.9357439875602722,5,90.0,FLASH,torch-sdpa
|
| 32 |
+
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L128,1,1152,24,128,bfloat16,0.34446719884872434,0.3438720107078552,0.3445119857788086,0.34457600116729736,5,83.38,flash-attn,hf-kernels
|
| 33 |
+
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L256,1,1280,24,128,bfloat16,0.37571839094161985,0.37404799461364746,0.3763839900493622,0.3766399919986725,5,90.62,flash-attn,hf-kernels
|
| 34 |
+
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L320,1,1344,24,128,bfloat16,0.4945920050144196,0.4925439953804016,0.493120014667511,0.4938240051269531,5,95.06,flash-attn,hf-kernels
|
| 35 |
+
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L384,1,1408,24,128,bfloat16,0.5139647841453552,0.5123199820518494,0.5142719745635986,0.5147839784622192,5,99.88,flash-attn,hf-kernels
|
| 36 |
+
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L448,1,1472,24,128,bfloat16,0.5353855967521668,0.5339199900627136,0.5350080132484436,0.5352320075035095,5,103.81,flash-attn,hf-kernels
|
| 37 |
+
HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L512,1,1536,24,128,bfloat16,0.5548800110816956,0.5538560152053833,0.5548800230026245,0.5553280115127563,5,109.12,flash-attn,hf-kernels
|
| 38 |
+
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L128,1,1152,24,128,bfloat16,0.3617343962192535,0.36102399230003357,0.3616960048675537,0.36211198568344116,5,83.38,flash-attn3,hf-kernels
|
| 39 |
+
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L256,1,1280,24,128,bfloat16,0.3907967984676361,0.3885439932346344,0.39056000113487244,0.3906239867210388,5,90.62,flash-attn3,hf-kernels
|
| 40 |
+
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L320,1,1344,24,128,bfloat16,0.5228991985321045,0.521344006061554,0.5230720043182373,0.5232319831848145,5,95.06,flash-attn3,hf-kernels
|
| 41 |
+
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L384,1,1408,24,128,bfloat16,0.5254656076431274,0.523904025554657,0.5249919891357422,0.526528000831604,5,99.88,flash-attn3,hf-kernels
|
| 42 |
+
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L448,1,1472,24,128,bfloat16,0.5646592020988465,0.5627840161323547,0.565343976020813,0.565343976020813,5,103.81,flash-attn3,hf-kernels
|
| 43 |
+
HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L512,1,1536,24,128,bfloat16,0.5698879957199097,0.567359983921051,0.5696640014648438,0.5698559880256653,5,109.12,flash-attn3,hf-kernels
|
flash_attn/results/artifacts/combine/latency.svg
ADDED
|
|
Git LFS Details
|
flash_attn/results/cells/combine.py
CHANGED
|
@@ -10,13 +10,173 @@
|
|
| 10 |
# [tool.uv.sources]
|
| 11 |
# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
|
| 12 |
# ///
|
| 13 |
-
import torch
|
| 14 |
-
import sys
|
| 15 |
import os
|
| 16 |
-
import
|
| 17 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
#
|
| 20 |
cache_dirs = {
|
| 21 |
"Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
|
| 22 |
"MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
|
|
@@ -34,8 +194,6 @@ for name, cache_dir in cache_dirs.items():
|
|
| 34 |
print(f"{name:30s}: {cache_dir}")
|
| 35 |
print()
|
| 36 |
|
| 37 |
-
# Collect all JSONL paths
|
| 38 |
-
all_paths = []
|
| 39 |
file_mapping = {
|
| 40 |
"Flash (PyTorch SDPA)": "attn.jsonl",
|
| 41 |
"MemEff (PyTorch SDPA)": "attn.jsonl",
|
|
@@ -48,10 +206,10 @@ file_mapping = {
|
|
| 48 |
"HF Kernels Flash Attn3": "attn.jsonl",
|
| 49 |
}
|
| 50 |
|
|
|
|
| 51 |
for name, cache_dir in cache_dirs.items():
|
| 52 |
if cache_dir:
|
| 53 |
-
|
| 54 |
-
path = Path(cache_dir) / jsonl_file
|
| 55 |
if path.exists() and path.stat().st_size > 0:
|
| 56 |
all_paths.append(str(path))
|
| 57 |
print(f"✓ Found {name}: {path}")
|
|
@@ -59,30 +217,40 @@ for name, cache_dir in cache_dirs.items():
|
|
| 59 |
print(f"⊘ Empty/Missing {name}: {path}")
|
| 60 |
else:
|
| 61 |
print(f"✗ No cache dir for {name}")
|
| 62 |
-
|
| 63 |
print()
|
| 64 |
|
| 65 |
if not all_paths:
|
| 66 |
print("ERROR: No benchmark data files found!")
|
|
|
|
|
|
|
|
|
|
| 67 |
sys.exit(1)
|
| 68 |
|
| 69 |
-
#
|
| 70 |
-
print("COMBINED BENCHMARK SUMMARY")
|
| 71 |
-
print()
|
| 72 |
-
|
| 73 |
kbt.summarize(all_paths)
|
| 74 |
-
|
| 75 |
-
print()
|
| 76 |
-
print("GENERATING COMBINED VISUALIZATION")
|
| 77 |
-
print()
|
| 78 |
|
| 79 |
try:
|
|
|
|
|
|
|
| 80 |
kbt.viz(all_paths)
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
except ImportError as e:
|
| 83 |
print(f"✗ Visualization requires matplotlib: {e}")
|
| 84 |
except Exception as e:
|
| 85 |
print(f"✗ Visualization failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
print()
|
| 88 |
print("ANALYSIS COMPLETE")
|
|
@@ -90,7 +258,62 @@ print(f"Total implementations analyzed: {len(all_paths)}")
|
|
| 90 |
print(f"\nImplementations included:")
|
| 91 |
for name, cache_dir in cache_dirs.items():
|
| 92 |
if cache_dir:
|
| 93 |
-
|
| 94 |
-
path = Path(cache_dir) / jsonl_file
|
| 95 |
if path.exists() and path.stat().st_size > 0:
|
| 96 |
-
print(f" ✓ {name}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
# [tool.uv.sources]
|
| 11 |
# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
|
| 12 |
# ///
|
|
|
|
|
|
|
| 13 |
import os
|
| 14 |
+
import sys
|
| 15 |
from pathlib import Path
|
| 16 |
+
import json
|
| 17 |
+
import torch # noqa: F401 # imported because upstream may expect torch to be importable
|
| 18 |
+
import kernels_benchmark_tools as kbt
|
| 19 |
+
|
| 20 |
+
# --- Matplotlib setup and helpers ------------------------------------------------
|
| 21 |
+
import matplotlib as mpl
|
| 22 |
+
import matplotlib.pyplot as plt
|
| 23 |
+
import csv
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# Global Matplotlib settings for the exported SVG.
mpl.rcParams.update({
    # Emit <text> elements instead of glyph outlines so CSS can style fonts.
    "svg.fonttype": "none",
    # Fixed salt so generated SVG ids are identical from one build to the next.
    "svg.hashsalt": "latency-benchmark-combined",
    # Have Matplotlib fit the layout automatically (tight-layout behaviour).
    "figure.autolayout": True,
    # Transparent backgrounds, both on the figure/axes and at save time.
    "figure.facecolor": "none",
    "axes.facecolor": "none",
    "savefig.facecolor": "none",
    "savefig.edgecolor": "none",
})
|
| 37 |
+
|
| 38 |
+
def _slugify(s: str) -> str:
|
| 39 |
+
s = (s or "").strip().lower()
|
| 40 |
+
keep = []
|
| 41 |
+
for ch in s:
|
| 42 |
+
if ch.isalnum():
|
| 43 |
+
keep.append(ch)
|
| 44 |
+
elif ch in (" ", "-", "_", "/", ".", ":"):
|
| 45 |
+
keep.append("-")
|
| 46 |
+
else:
|
| 47 |
+
keep.append("")
|
| 48 |
+
out = "".join(keep)
|
| 49 |
+
while "--" in out:
|
| 50 |
+
out = out.replace("--", "-")
|
| 51 |
+
return out.strip("-") or "unnamed"
|
| 52 |
+
|
| 53 |
+
def _next_series_gid(seen, slug):
    """Return a ``series--<slug>`` gid, suffixed ``-2``, ``-3``, ... on repeats.

    ``seen`` is a dict counting how often each slug has been used; it is
    updated in place.
    """
    seen[slug] = seen.get(slug, 0) + 1
    count = seen[slug]
    suffix = "" if count == 1 else f"-{count}"
    return f"series--{slug}{suffix}"


def _tag_current_figure(default_series_prefix="series"):
    """Attach SVG ids (gid) to key artists so they can be targeted from CSS.

    Operates on the figure returned by ``plt.gcf()`` and assigns:

    * ``figure--latency`` to the figure, ``axes--<n>`` to each axes (1-based)
    * ``title--main`` to the axes title, ``label--x`` / ``label--y`` to the
      axis labels
    * ``grid-x--<n>`` / ``grid-y--<n>`` to gridlines
    * ``legend`` and ``legend-label--<slug>`` to the legend and its texts
    * ``series--<slug>[-<k>]`` to lines and patches, where ``<slug>`` comes
      from the artist label and ``default_series_prefix`` replaces labels
      Matplotlib marks as non-legendable (leading ``_``).

    NOTE(review): ids other than ``axes--<n>`` are not namespaced per axes, so
    a figure with several axes will get repeated ids — confirm whether
    multi-axes figures are expected here.
    """
    fig = plt.gcf()

    # Tag the figure itself
    fig.set_gid("figure--latency")

    for ax_idx, ax in enumerate(fig.get_axes(), start=1):
        ax.set_gid(f"axes--{ax_idx}")

        # Axis labels & title.  The title artist lives on ``ax.title``; it is
        # not part of ``ax.texts``, so it has to be tagged directly.
        if ax.get_title():
            ax.title.set_gid("title--main")
        if ax.xaxis and ax.xaxis.get_label():
            ax.xaxis.label.set_gid("label--x")
        if ax.yaxis and ax.yaxis.get_label():
            ax.yaxis.label.set_gid("label--y")

        # Gridlines
        for i, gl in enumerate(ax.get_xgridlines(), start=1):
            gl.set_gid(f"grid-x--{i}")
        for i, gl in enumerate(ax.get_ygridlines(), start=1):
            gl.set_gid(f"grid-y--{i}")

        # Legend block & entries
        leg = ax.get_legend()
        if leg is not None:
            leg.set_gid("legend")
            for i, txt in enumerate(leg.get_texts(), start=1):
                label_slug = _slugify(txt.get_text())
                txt.set_gid(f"legend-label--{label_slug or i}")

        # Series: lines
        line_seen = {}
        for ln in getattr(ax, "lines", []):
            raw_label = ln.get_label() or ""
            # Matplotlib uses labels beginning with "_" for non-legendable items
            label = default_series_prefix if raw_label.startswith("_") else raw_label
            ln.set_gid(_next_series_gid(line_seen, _slugify(label)))

        # Series: patches (bars, areas)
        patch_seen = {}
        for pt in getattr(ax, "patches", []):
            label = getattr(pt, "get_label", lambda: "")() or f"{default_series_prefix}"
            if isinstance(label, str) and label.startswith("_"):
                label = default_series_prefix
            pt.set_gid(_next_series_gid(patch_seen, _slugify(label)))
|
| 111 |
+
|
| 112 |
+
def _postprocess_svg_add_classes(svg_path: Path):
|
| 113 |
+
"""Add convenient CSS classes alongside ids (e.g., class='series grid grid-x')."""
|
| 114 |
+
try:
|
| 115 |
+
import xml.etree.ElementTree as ET
|
| 116 |
+
ET.register_namespace("", "http://www.w3.org/2000/svg")
|
| 117 |
+
tree = ET.parse(svg_path)
|
| 118 |
+
root = tree.getroot()
|
| 119 |
+
for el in root.iter():
|
| 120 |
+
el_id = el.attrib.get("id", "")
|
| 121 |
+
if not el_id:
|
| 122 |
+
continue
|
| 123 |
+
cls = []
|
| 124 |
+
if el_id.startswith("figure--"):
|
| 125 |
+
cls.append("figure")
|
| 126 |
+
elif el_id.startswith("axes--"):
|
| 127 |
+
cls.append("axes")
|
| 128 |
+
elif el_id.startswith("grid-x--"):
|
| 129 |
+
cls += ["grid", "grid-x"]
|
| 130 |
+
elif el_id.startswith("grid-y--"):
|
| 131 |
+
cls += ["grid", "grid-y"]
|
| 132 |
+
elif el_id.startswith("legend"):
|
| 133 |
+
cls.append("legend")
|
| 134 |
+
elif el_id.startswith("label--x"):
|
| 135 |
+
cls.append("xlabel")
|
| 136 |
+
elif el_id.startswith("label--y"):
|
| 137 |
+
cls.append("ylabel")
|
| 138 |
+
elif el_id.startswith("title--"):
|
| 139 |
+
cls.append("title")
|
| 140 |
+
elif el_id.startswith("series--"):
|
| 141 |
+
cls.append("series")
|
| 142 |
+
if cls:
|
| 143 |
+
# Preserve any existing class (unlikely from Matplotlib)
|
| 144 |
+
existing = el.attrib.get("class", "")
|
| 145 |
+
el.set("class", (existing + " " + " ".join(cls)).strip())
|
| 146 |
+
tree.write(svg_path, encoding="utf-8", xml_declaration=True)
|
| 147 |
+
except Exception as e:
|
| 148 |
+
print(f"✗ SVG postprocess (classes) skipped: {e}")
|
| 149 |
+
|
| 150 |
+
# Replace plt.savefig so that any save — including one made inside kbt.viz —
# is written as SVG and goes through the tagging/class post-processing.
_orig_savefig = plt.savefig


def _savefig_svg(fname, *args, **kwargs):
    """Save the current figure to ``latency.svg`` with ids and CSS classes.

    ``fname`` is accepted for signature compatibility but not used: output
    always goes to ``latency.svg`` in the working directory, and ``format``
    is forced to ``"svg"``. Remaining positional and keyword arguments are
    forwarded to the original ``plt.savefig``, whose return value is returned.
    """
    # Always save as SVG at a stable path for the artifact system
    out = Path("latency.svg")
    kwargs.update(format="svg")

    # Ids must be on the artists before the file is rendered ...
    _tag_current_figure()
    result = _orig_savefig(out, *args, **kwargs)
    # ... while classes are added to the written file afterwards.
    _postprocess_svg_add_classes(out)

    print(f"✓ Combined visualization saved as {out}")
    return result


plt.savefig = _savefig_svg  # apply patch
|
| 165 |
+
|
| 166 |
+
# Capture close calls in case kbt.viz() closes figures before we re-save
_orig_close = plt.close
_last_closed = {"fig": None}


def _capture_close(arg=None, *, fig=None):
    """Remember which figure is being closed, then delegate to ``plt.close``.

    Args:
        arg: Whatever the caller passed positionally to ``plt.close``.
        fig: Same value passed by keyword; accepted because the function this
            replaces names its parameter ``fig``.

    Returns:
        The return value of the original ``plt.close``.

    The figure is stored in ``_last_closed["fig"]``: the argument itself when
    it looks like a Figure, otherwise the current figure if one is open.
    Recording is best-effort and never prevents the close from happening.
    """
    target = arg if fig is None else fig
    try:
        if hasattr(target, "savefig"):  # looks like a Figure
            _last_closed["fig"] = target
        elif plt.get_fignums():
            # Only ask for the current figure when one exists; plt.gcf()
            # would otherwise create a new, empty figure as a side effect.
            _last_closed["fig"] = plt.gcf()
    except Exception:
        # Bookkeeping only: keep the previously recorded figure and go on to
        # close as requested.
        pass
    return _orig_close(target)


plt.close = _capture_close
|
| 178 |
|
| 179 |
+
# --- Locate benchmark artifacts --------------------------------------------------
|
| 180 |
cache_dirs = {
|
| 181 |
"Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
|
| 182 |
"MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
|
|
|
|
| 194 |
print(f"{name:30s}: {cache_dir}")
|
| 195 |
print()
|
| 196 |
|
|
|
|
|
|
|
| 197 |
file_mapping = {
|
| 198 |
"Flash (PyTorch SDPA)": "attn.jsonl",
|
| 199 |
"MemEff (PyTorch SDPA)": "attn.jsonl",
|
|
|
|
| 206 |
"HF Kernels Flash Attn3": "attn.jsonl",
|
| 207 |
}
|
| 208 |
|
| 209 |
+
all_paths = []
|
| 210 |
for name, cache_dir in cache_dirs.items():
|
| 211 |
if cache_dir:
|
| 212 |
+
path = Path(cache_dir) / file_mapping[name]
|
|
|
|
| 213 |
if path.exists() and path.stat().st_size > 0:
|
| 214 |
all_paths.append(str(path))
|
| 215 |
print(f"✓ Found {name}: {path}")
|
|
|
|
| 217 |
print(f"⊘ Empty/Missing {name}: {path}")
|
| 218 |
else:
|
| 219 |
print(f"✗ No cache dir for {name}")
|
|
|
|
| 220 |
print()
|
| 221 |
|
| 222 |
if not all_paths:
|
| 223 |
print("ERROR: No benchmark data files found!")
|
| 224 |
+
# restore patched functions before exiting
|
| 225 |
+
plt.savefig = _orig_savefig
|
| 226 |
+
plt.close = _orig_close
|
| 227 |
sys.exit(1)
|
| 228 |
|
| 229 |
+
# --- Summary + Visualization -----------------------------------------------------
|
| 230 |
+
print("COMBINED BENCHMARK SUMMARY\n")
|
|
|
|
|
|
|
| 231 |
kbt.summarize(all_paths)
|
| 232 |
+
print("\nGENERATING COMBINED VISUALIZATION\n")
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
try:
|
| 235 |
+
# If kbt.viz saves internally, our patched savefig ensures SVG gets written,
|
| 236 |
+
# and it will carry ids/classes for CSS styling.
|
| 237 |
kbt.viz(all_paths)
|
| 238 |
+
# Safety net: if kbt.viz didn't save, save now.
|
| 239 |
+
# if not Path("latency.svg").exists():
|
| 240 |
+
# _tag_current_figure()
|
| 241 |
+
# plt.savefig("latency.svg")
|
| 242 |
+
|
| 243 |
+
plt.savefig("latency.svg") # ensure saved with tagging
|
| 244 |
+
|
| 245 |
+
print("✓ SVG visualization ready: latency.svg!")
|
| 246 |
except ImportError as e:
|
| 247 |
print(f"✗ Visualization requires matplotlib: {e}")
|
| 248 |
except Exception as e:
|
| 249 |
print(f"✗ Visualization failed: {e}")
|
| 250 |
+
finally:
|
| 251 |
+
# Clean up patches to avoid side effects in later cells
|
| 252 |
+
plt.savefig = _orig_savefig
|
| 253 |
+
plt.close = _orig_close
|
| 254 |
|
| 255 |
print()
|
| 256 |
print("ANALYSIS COMPLETE")
|
|
|
|
| 258 |
print(f"\nImplementations included:")
|
| 259 |
for name, cache_dir in cache_dirs.items():
|
| 260 |
if cache_dir:
|
| 261 |
+
path = Path(cache_dir) / file_mapping[name]
|
|
|
|
| 262 |
if path.exists() and path.stat().st_size > 0:
|
| 263 |
+
print(f" ✓ {name}")
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
# Collect all benchmark data and export to CSV
def _read_jsonl(path):
    """Return the JSON objects stored one per line in *path*.

    Blank lines are skipped so a stray empty line does not abort the export.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def _record_to_csv_row(impl_name, record):
    """Flatten one benchmark record into the column order of the CSV header.

    Missing (or null) sections and fields are written as empty strings.
    """
    # ``or {}`` also covers sections that are present but null.
    wl = record.get('wl') or {}
    lat = record.get('lat_ms') or {}
    tags = record.get('tags') or {}
    peak_bytes = record.get('peak_bytes')

    return [
        impl_name,
        record.get('impl', ''),
        wl.get('name', ''),
        wl.get('batch', ''),
        wl.get('seq_len', ''),
        wl.get('heads', ''),
        wl.get('head_dim', ''),
        wl.get('dtype', ''),
        lat.get('mean', ''),
        lat.get('p10', ''),
        lat.get('p50', ''),
        lat.get('p90', ''),
        lat.get('reps', ''),
        # Bytes -> MiB, rounded to two decimals; blank when not recorded.
        round(peak_bytes / 1024 / 1024, 2) if peak_bytes else '',
        tags.get('backend', ''),
        tags.get('family', ''),
    ]


all_data = {}
for name, cache_dir in cache_dirs.items():
    if not cache_dir:
        continue
    path = Path(cache_dir) / file_mapping[name]
    if path.exists() and path.stat().st_size > 0:
        all_data[name] = _read_jsonl(path)

# Export to CSV
csv_path = Path("latency.csv")
with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)

    # Header; a "Compile (ms)" column (record['compile_ms']) is currently
    # left out of the export.
    header = ["Implementation", "Impl ID", "Workload", "Batch", "Seq Length", "Heads", "Head Dim", "Dtype",
              "Mean (ms)", "P10 (ms)", "P50 (ms)", "P90 (ms)", "Reps",
              "Peak Mem (MB)", "Backend", "Family"]
    writer.writerow(header)

    # One row per benchmark record, grouped by implementation
    for impl_name, records in all_data.items():
        writer.writerows(_record_to_csv_row(impl_name, record) for record in records)

print(f"✓ CSV export complete: {csv_path}")
print(f"Total implementations: {len(all_data)}")
print(f"Total records: {sum(len(records) for records in all_data.values())}")
|
flash_attn/results/combined_results.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|