drbh HF Staff commited on
Commit
352017c
·
verified ·
1 Parent(s): 9ad2ef6

Upload folder using huggingface_hub

Browse files
flash_attn/impls/artifacts/benchmark/attn.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4424000084400177, "p50": 0.4480000138282776, "p90": 0.45020800828933716, "mean": 0.448172801733017, "reps": 5, "warmup": 2}, "compile_ms": 1.8151999711990356, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.46480000019073486, "p50": 0.4689280092716217, "p90": 0.47071999311447144, "mean": 0.46839680075645446, "reps": 5, "warmup": 2}, "compile_ms": 0.35923200845718384, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5970879793167114, "p50": 0.5986559987068176, "p90": 0.6020799875259399, "mean": 0.6001919984817505, "reps": 5, "warmup": 2}, "compile_ms": 0.48611199855804443, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5994560122489929, "p50": 0.6028159856796265, "p90": 0.6028800010681152, "mean": 0.6018815994262695, "reps": 5, "warmup": 2}, "compile_ms": 0.49404799938201904, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6379839777946472, "p50": 0.6402559876441956, "p90": 0.6423360109329224, "mean": 0.6404095888137817, "reps": 5, "warmup": 2}, "compile_ms": 0.531391978263855, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6431040167808533, "p50": 0.6442880034446716, "p90": 0.6445119976997375, "mean": 0.644704008102417, "reps": 5, "warmup": 2}, "compile_ms": 0.5358719825744629, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4474239945411682, "p50": 0.44921600818634033, "p90": 0.45241600275039673, "mean": 0.45066879987716674, "reps": 5, "warmup": 2}, "compile_ms": 1.7530560493469238, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4652479887008667, "p50": 0.4705919921398163, "p90": 0.4716799855232239, "mean": 0.47004159688949587, "reps": 5, "warmup": 2}, "compile_ms": 0.36032000184059143, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5987840294837952, "p50": 0.6021760106086731, "p90": 0.6045759916305542, "mean": 0.6022783994674683, "reps": 5, "warmup": 2}, "compile_ms": 0.4950079917907715, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6000319719314575, "p50": 0.600383996963501, "p90": 0.6016640067100525, "mean": 0.6013055920600892, "reps": 5, "warmup": 2}, "compile_ms": 0.49647998809814453, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.639136016368866, "p50": 0.6404479742050171, "p90": 0.6416320204734802, "mean": 0.6408192038536071, "reps": 5, "warmup": 2}, "compile_ms": 0.530239999294281, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-02T18:12:20Z", "run": "a2d84c2da2864a0ead6e0da36e5784e9", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6447359919548035, "p50": 0.6462399959564209, "p90": 0.6483839750289917, "mean": 0.6466111898422241, "reps": 5, "warmup": 2}, "compile_ms": 0.5342720150947571, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-02T15:50:57Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5143679976463318, "p50": 0.5232959985733032, "p90": 0.5257599949836731, "mean": 0.5211328029632568, "reps": 5, "warmup": 2}, "compile_ms": 3112.67236328125, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-02T15:50:57Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5569279789924622, "p50": 0.558784008026123, "p90": 0.5599679946899414, "mean": 0.5588735938072205, "reps": 5, "warmup": 2}, "compile_ms": 272.2660217285156, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.685375988483429, "p50": 0.6888960003852844, "p90": 0.6940159797668457, "mean": 0.6904960036277771, "reps": 5, "warmup": 2}, "compile_ms": 272.7831726074219, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7146559953689575, "p50": 0.7190399765968323, "p90": 0.7200639843940735, "mean": 0.7184319853782654, "reps": 5, "warmup": 2}, "compile_ms": 270.6763916015625, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.739359974861145, "p50": 0.7402240037918091, "p90": 0.7426239848136902, "mean": 0.741484797000885, "reps": 5, "warmup": 2}, "compile_ms": 270.3490295410156, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7703679800033569, "p50": 0.7723519802093506, "p90": 0.7728000283241272, "mean": 0.7723968029022217, "reps": 5, "warmup": 2}, "compile_ms": 269.7756652832031, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-02T18:08:46Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5194560289382935, "p50": 0.5272960066795349, "p90": 0.5312960147857666, "mean": 0.527347207069397, "reps": 5, "warmup": 2}, "compile_ms": 3354.235107421875, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-02T18:08:47Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5560640096664429, "p50": 0.5571519732475281, "p90": 0.5611839890480042, "mean": 0.5586367964744567, "reps": 5, "warmup": 2}, "compile_ms": 471.23529052734375, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-02T18:08:47Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6841920018196106, "p50": 0.6860160231590271, "p90": 0.6869760155677795, "mean": 0.6860736012458801, "reps": 5, "warmup": 2}, "compile_ms": 468.1533508300781, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-02T18:08:48Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7152000069618225, "p50": 0.7161920070648193, "p90": 0.7164160013198853, "mean": 0.7167360067367554, "reps": 5, "warmup": 2}, "compile_ms": 465.7891540527344, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-02T18:08:48Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7400959730148315, "p50": 0.742143988609314, "p90": 0.7431039810180664, "mean": 0.7423295855522156, "reps": 5, "warmup": 2}, "compile_ms": 468.6272888183594, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-02T18:08:49Z", "run": "3ea490632d4f4be2a19f477a48f12fc5", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7718080282211304, "p50": 0.7745919823646545, "p90": 0.7748159766197205, "mean": 0.7743871927261352, "reps": 5, "warmup": 2}, "compile_ms": 475.9334716796875, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-02T15:50:03Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.63155198097229, "p50": 0.6451839804649353, "p90": 0.665727972984314, "mean": 0.6618239879608154, "reps": 5, "warmup": 2}, "compile_ms": 4977.1767578125, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-02T15:50:05Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6628159880638123, "p50": 0.6843519806861877, "p90": 0.7063680291175842, "mean": 0.7008576035499573, "reps": 5, "warmup": 2}, "compile_ms": 1701.4315185546875, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-02T15:50:07Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8050559759140015, "p50": 0.8155199885368347, "p90": 0.8389120101928711, "mean": 0.833843195438385, "reps": 5, "warmup": 2}, "compile_ms": 1701.230712890625, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-02T15:50:09Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8313599824905396, "p50": 0.849407970905304, "p90": 0.8810880184173584, "mean": 0.8694527983665467, "reps": 5, "warmup": 2}, "compile_ms": 2027.875, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-02T15:50:11Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8768960237503052, "p50": 0.8824639916419983, "p90": 0.9011520147323608, "mean": 0.9017536044120789, "reps": 5, "warmup": 2}, "compile_ms": 2269.297607421875, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-02T15:50:13Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9179520010948181, "p50": 0.9188479781150818, "p90": 0.9378560185432434, "mean": 0.9400512099266052, "reps": 5, "warmup": 2}, "compile_ms": 1835.313720703125, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-02T18:09:34Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6148160099983215, "p50": 0.6296960115432739, "p90": 0.6522240042686462, "mean": 0.6489088058471679, "reps": 5, "warmup": 2}, "compile_ms": 4649.109375, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-02T18:09:35Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6615359783172607, "p50": 0.6821119785308838, "p90": 0.7128959894180298, "mean": 0.700761592388153, "reps": 5, "warmup": 2}, "compile_ms": 1487.6849365234375, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-02T18:09:37Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7967039942741394, "p50": 0.8164799809455872, "p90": 0.8463680148124695, "mean": 0.834444797039032, "reps": 5, "warmup": 2}, "compile_ms": 1492.66748046875, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-02T18:09:39Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8432319760322571, "p50": 0.8498560190200806, "p90": 0.8750079870223999, "mean": 0.8709375977516174, "reps": 5, "warmup": 2}, "compile_ms": 1477.6558837890625, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-02T18:09:41Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8775359988212585, "p50": 0.9030719995498657, "p90": 0.903872013092041, "mean": 0.9069631934165955, "reps": 5, "warmup": 2}, "compile_ms": 1919.1016845703125, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-02T18:09:43Z", "run": "02313c2372ed4884add9b1c644335af9", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9145920276641846, "p50": 0.9164159893989563, "p90": 0.9357439875602722, "mean": 0.9371584057807922, "reps": 5, "warmup": 2}, "compile_ms": 1487.1219482421875, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/compiled_variants.html CHANGED
@@ -719,6 +719,41 @@
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
  .cell-failed {
723
  border-color: var(--border-cell-failed);
724
  }
@@ -3711,7 +3746,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3711
  <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
3712
  <span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
3713
  </span> |
3714
- Cell: benchmark_default | 44.25s
3715
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
3716
  <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
3717
  <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
@@ -3795,7 +3830,7 @@ Cell: benchmark_default | 44.25s
3795
  </div>
3796
  <div id="output-benchmark_default" class="cell-output">
3797
  <div class="cell-stdout">impl wl p50(ms) ok
3798
- torch_flash_compiled_default flux_L128 0.52 True
3799
  torch_flash_compiled_default flux_L256 0.56 True
3800
  torch_flash_compiled_default flux_L320 0.69 True
3801
  torch_flash_compiled_default flux_L384 0.72 True
@@ -3806,28 +3841,28 @@ torch_flash_compiled_default flux_L512 0.77 True
3806
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3807
  <div class="uv-logs-content" style="display: none;">
3808
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3809
- Downloading triton (148.3MiB)
3810
- Downloading torch (846.9MiB)
3811
- Downloading kiwisolver (1.4MiB)
3812
- Downloading fonttools (4.7MiB)
3813
- Downloading nvidia-cublas-cu12 (566.8MiB)
3814
- Downloading nvidia-cufile-cu12 (1.1MiB)
3815
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3816
- Downloading nvidia-cufft-cu12 (184.2MiB)
3817
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3818
- Downloading setuptools (1.1MiB)
3819
  Downloading matplotlib (8.3MiB)
 
 
 
 
 
 
 
3820
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3821
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
 
 
 
 
3822
  Downloading nvidia-cusolver-cu12 (255.1MiB)
 
 
 
 
3823
  Downloading numpy (16.2MiB)
3824
- Downloading sympy (6.0MiB)
3825
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3826
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3827
- Downloading nvidia-curand-cu12 (60.7MiB)
3828
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3829
- Downloading networkx (1.9MiB)
3830
- Downloading pillow (6.3MiB)
3831
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3832
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3833
  Downloading nvidia-cufile-cu12
@@ -3839,21 +3874,21 @@ Downloading pillow (6.3MiB)
3839
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3840
  Downloading nvidia-cuda-cupti-cu12
3841
  Downloading matplotlib
3842
- Downloading numpy
3843
  Downloading sympy
 
3844
  Downloading nvidia-nvjitlink-cu12
3845
  Downloading nvidia-curand-cu12
3846
  Downloading nvidia-cuda-nvrtc-cu12
3847
  Downloading triton
3848
  Downloading nvidia-cufft-cu12
3849
  Downloading nvidia-cusolver-cu12
3850
- Downloading nvidia-cusparselt-cu12
3851
  Downloading nvidia-cusparse-cu12
 
3852
  Downloading nvidia-nccl-cu12
3853
  Downloading nvidia-cublas-cu12
3854
  Downloading nvidia-cudnn-cu12
3855
  Downloading torch
3856
- Installed 37 packages in 516ms
3857
  </div>
3858
  </div>
3859
  <div class="cell-artifacts">
@@ -3871,7 +3906,7 @@ Installed 37 packages in 516ms
3871
  <span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark_max_autotune | 56.94s
3875
  | <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
3877
  <a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
@@ -3955,65 +3990,65 @@ Cell: benchmark_max_autotune | 56.94s
3955
  </div>
3956
  <div id="output-benchmark_max_autotune" class="cell-output">
3957
  <div class="cell-stdout">impl wl p50(ms) ok
3958
- torch_flash_compiled_max_autotune flux_L128 0.65 True
3959
  torch_flash_compiled_max_autotune flux_L256 0.68 True
3960
  torch_flash_compiled_max_autotune flux_L320 0.82 True
3961
  torch_flash_compiled_max_autotune flux_L384 0.85 True
3962
- torch_flash_compiled_max_autotune flux_L448 0.88 True
3963
  torch_flash_compiled_max_autotune flux_L512 0.92 True
3964
  </div>
3965
  <div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
3966
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3967
  <div class="uv-logs-content" style="display: none;">
3968
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3969
- Downloading matplotlib (8.3MiB)
3970
- Downloading setuptools (1.1MiB)
3971
  Downloading nvidia-cublas-cu12 (566.8MiB)
3972
- Downloading nvidia-cusparse-cu12 (274.9MiB)
 
 
 
3973
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
 
 
3974
  Downloading fonttools (4.7MiB)
 
 
 
 
 
 
3975
  Downloading numpy (16.2MiB)
3976
  Downloading pillow (6.3MiB)
3977
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3978
- Downloading nvidia-nccl-cu12 (307.4MiB)
3979
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3980
- Downloading nvidia-cufft-cu12 (184.2MiB)
3981
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3982
- Downloading networkx (1.9MiB)
3983
- Downloading torch (846.9MiB)
3984
- Downloading triton (148.3MiB)
3985
- Downloading nvidia-cufile-cu12 (1.1MiB)
3986
- Downloading kiwisolver (1.4MiB)
3987
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3988
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3989
  Downloading nvidia-curand-cu12 (60.7MiB)
3990
- Downloading sympy (6.0MiB)
3991
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3992
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3993
  Downloading nvidia-cufile-cu12
3994
  Downloading kiwisolver
3995
  Downloading setuptools
3996
- Downloading fonttools
3997
  Downloading networkx
 
3998
  Downloading pillow
3999
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
4000
- Downloading matplotlib
4001
  Downloading nvidia-cuda-cupti-cu12
4002
- Downloading sympy
4003
  Downloading numpy
 
4004
  Downloading nvidia-nvjitlink-cu12
4005
  Downloading nvidia-curand-cu12
4006
  Downloading nvidia-cuda-nvrtc-cu12
4007
  Downloading triton
4008
  Downloading nvidia-cufft-cu12
4009
  Downloading nvidia-cusolver-cu12
4010
- Downloading nvidia-cusparse-cu12
4011
  Downloading nvidia-cusparselt-cu12
 
4012
  Downloading nvidia-nccl-cu12
4013
  Downloading nvidia-cublas-cu12
4014
  Downloading nvidia-cudnn-cu12
4015
  Downloading torch
4016
- Installed 37 packages in 547ms
4017
  </div>
4018
  </div>
4019
  <div class="cell-artifacts">
 
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
722
+ /* CSV table styling */
723
+ .artifact-csv {
724
+ margin-top: 1rem;
725
+ overflow-x: auto;
726
+ }
727
+ .csv-table {
728
+ width: 100%;
729
+ border-collapse: collapse;
730
+ font-size: 0.9rem;
731
+ background: var(--bg-secondary);
732
+ border: 1px solid var(--border-primary);
733
+ border-radius: 1px;
734
+ }
735
+ .csv-table th,
736
+ .csv-table td {
737
+ padding: 0.5rem 0.75rem;
738
+ text-align: left;
739
+ border: 1px solid var(--border-primary);
740
+ }
741
+ .csv-table th {
742
+ background: var(--bg-tertiary);
743
+ font-weight: 600;
744
+ color: var(--text-primary);
745
+ }
746
+ .csv-table tbody tr:hover {
747
+ background: var(--bg-artifact-hover);
748
+ }
749
+ .artifact-csv-error {
750
+ margin-top: 1rem;
751
+ padding: 1rem;
752
+ background: var(--bg-error);
753
+ color: var(--text-error);
754
+ border: 1px solid var(--border-error);
755
+ border-radius: 1px;
756
+ }
757
  .cell-failed {
758
  border-color: var(--border-cell-failed);
759
  }
 
3746
  <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
3747
  <span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
3748
  </span> |
3749
+ Cell: benchmark_default | 46.78s
3750
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
3751
  <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
3752
  <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
 
3830
  </div>
3831
  <div id="output-benchmark_default" class="cell-output">
3832
  <div class="cell-stdout">impl wl p50(ms) ok
3833
+ torch_flash_compiled_default flux_L128 0.53 True
3834
  torch_flash_compiled_default flux_L256 0.56 True
3835
  torch_flash_compiled_default flux_L320 0.69 True
3836
  torch_flash_compiled_default flux_L384 0.72 True
 
3841
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3842
  <div class="uv-logs-content" style="display: none;">
3843
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3844
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
 
 
 
 
 
 
 
3845
  Downloading matplotlib (8.3MiB)
3846
+ Downloading networkx (1.9MiB)
3847
+ Downloading setuptools (1.1MiB)
3848
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
3849
+ Downloading pillow (6.3MiB)
3850
+ Downloading sympy (6.0MiB)
3851
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3852
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3853
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3854
  Downloading nvidia-nccl-cu12 (307.4MiB)
3855
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3856
+ Downloading nvidia-curand-cu12 (60.7MiB)
3857
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3858
+ Downloading nvidia-cufile-cu12 (1.1MiB)
3859
+ Downloading torch (846.9MiB)
3860
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3861
+ Downloading fonttools (4.7MiB)
3862
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3863
+ Downloading kiwisolver (1.4MiB)
3864
+ Downloading triton (148.3MiB)
3865
  Downloading numpy (16.2MiB)
 
 
 
 
 
 
 
3866
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3867
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3868
  Downloading nvidia-cufile-cu12
 
3874
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3875
  Downloading nvidia-cuda-cupti-cu12
3876
  Downloading matplotlib
 
3877
  Downloading sympy
3878
+ Downloading numpy
3879
  Downloading nvidia-nvjitlink-cu12
3880
  Downloading nvidia-curand-cu12
3881
  Downloading nvidia-cuda-nvrtc-cu12
3882
  Downloading triton
3883
  Downloading nvidia-cufft-cu12
3884
  Downloading nvidia-cusolver-cu12
 
3885
  Downloading nvidia-cusparse-cu12
3886
+ Downloading nvidia-cusparselt-cu12
3887
  Downloading nvidia-nccl-cu12
3888
  Downloading nvidia-cublas-cu12
3889
  Downloading nvidia-cudnn-cu12
3890
  Downloading torch
3891
+ Installed 37 packages in 557ms
3892
  </div>
3893
  </div>
3894
  <div class="cell-artifacts">
 
3906
  <span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
3907
  <span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
3908
  </span> |
3909
+ Cell: benchmark_max_autotune | 53.65s
3910
  | <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
3911
  <button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
3912
  <a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
 
3990
  </div>
3991
  <div id="output-benchmark_max_autotune" class="cell-output">
3992
  <div class="cell-stdout">impl wl p50(ms) ok
3993
+ torch_flash_compiled_max_autotune flux_L128 0.63 True
3994
  torch_flash_compiled_max_autotune flux_L256 0.68 True
3995
  torch_flash_compiled_max_autotune flux_L320 0.82 True
3996
  torch_flash_compiled_max_autotune flux_L384 0.85 True
3997
+ torch_flash_compiled_max_autotune flux_L448 0.90 True
3998
  torch_flash_compiled_max_autotune flux_L512 0.92 True
3999
  </div>
4000
  <div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
4001
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4002
  <div class="uv-logs-content" style="display: none;">
4003
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
4004
+ Downloading nvidia-cufile-cu12 (1.1MiB)
 
4005
  Downloading nvidia-cublas-cu12 (566.8MiB)
4006
+ Downloading sympy (6.0MiB)
4007
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4008
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4009
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4010
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4011
+ Downloading matplotlib (8.3MiB)
4012
+ Downloading triton (148.3MiB)
4013
+ Downloading networkx (1.9MiB)
4014
  Downloading fonttools (4.7MiB)
4015
+ Downloading torch (846.9MiB)
4016
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4017
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4018
+ Downloading kiwisolver (1.4MiB)
4019
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4020
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4021
  Downloading numpy (16.2MiB)
4022
  Downloading pillow (6.3MiB)
4023
  Downloading nvidia-cudnn-cu12 (674.0MiB)
4024
+ Downloading setuptools (1.1MiB)
 
 
 
 
 
 
 
 
 
 
4025
  Downloading nvidia-curand-cu12 (60.7MiB)
 
4026
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
4027
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
4028
  Downloading nvidia-cufile-cu12
4029
  Downloading kiwisolver
4030
  Downloading setuptools
 
4031
  Downloading networkx
4032
+ Downloading fonttools
4033
  Downloading pillow
4034
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
 
4035
  Downloading nvidia-cuda-cupti-cu12
4036
+ Downloading matplotlib
4037
  Downloading numpy
4038
+ Downloading sympy
4039
  Downloading nvidia-nvjitlink-cu12
4040
  Downloading nvidia-curand-cu12
4041
  Downloading nvidia-cuda-nvrtc-cu12
4042
  Downloading triton
4043
  Downloading nvidia-cufft-cu12
4044
  Downloading nvidia-cusolver-cu12
 
4045
  Downloading nvidia-cusparselt-cu12
4046
+ Downloading nvidia-cusparse-cu12
4047
  Downloading nvidia-nccl-cu12
4048
  Downloading nvidia-cublas-cu12
4049
  Downloading nvidia-cudnn-cu12
4050
  Downloading torch
4051
+ Installed 37 packages in 525ms
4052
  </div>
4053
  </div>
4054
  <div class="cell-artifacts">
flash_attn/impls/flash_attention.html CHANGED
@@ -719,6 +719,41 @@
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
  .cell-failed {
723
  border-color: var(--border-cell-failed);
724
  }
@@ -3710,7 +3745,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3710
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3712
  </span> |
3713
- Cell: nv | 0.66s
3714
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3716
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3726,7 +3761,7 @@ Cell: nv | 0.66s
3726
  </div>
3727
  </div>
3728
  <div id="output-nv" class="cell-output">
3729
- <div class="cell-stdout">Thu Oct 2 15:53:02 2025
3730
  +-----------------------------------------------------------------------------------------+
3731
  | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3732
  |-----------------------------------------+------------------------+----------------------+
@@ -3735,19 +3770,19 @@ Cell: nv | 0.66s
3735
  | | | MIG M. |
3736
  |=========================================+========================+======================|
3737
  | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3738
- | 0% 29C P0 87W / 300W | 0MiB / 23028MiB | 0% Default |
3739
  | | | N/A |
3740
  +-----------------------------------------+------------------------+----------------------+
3741
  | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3742
- | 0% 25C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3743
  | | | N/A |
3744
  +-----------------------------------------+------------------------+----------------------+
3745
  | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3746
- | 0% 25C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
3747
  | | | N/A |
3748
  +-----------------------------------------+------------------------+----------------------+
3749
  | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3750
- | 0% 25C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
3751
  | | | N/A |
3752
  +-----------------------------------------+------------------------+----------------------+
3753
 
@@ -3771,7 +3806,7 @@ Cell: nv | 0.66s
3771
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3772
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3773
  </span> |
3774
- Cell: benchmark | 37.94s
3775
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3776
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3777
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3852,7 +3887,7 @@ Cell: benchmark | 37.94s
3852
  <div id="output-benchmark" class="cell-output">
3853
  <div class="cell-stdout">impl wl p50(ms) ok
3854
  torch_flash_ma flux_L128 0.48 True
3855
- torch_flash_ma flux_L256 0.53 True
3856
  torch_flash_ma flux_L320 0.65 True
3857
  torch_flash_ma flux_L384 0.68 True
3858
  torch_flash_ma flux_L448 0.71 True
@@ -3862,35 +3897,35 @@ torch_flash_ma flux_L512 0.74 True
3862
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3863
  <div class="uv-logs-content" style="display: none;">
3864
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3865
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3866
- Downloading sympy (6.0MiB)
3867
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3868
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3869
- Downloading nvidia-nccl-cu12 (307.4MiB)
3870
  Downloading networkx (1.9MiB)
3871
- Downloading fonttools (4.7MiB)
3872
- Downloading matplotlib (8.3MiB)
3873
- Downloading nvidia-cufft-cu12 (184.2MiB)
3874
- Downloading setuptools (1.1MiB)
3875
- Downloading pillow (6.3MiB)
3876
- Downloading nvidia-cublas-cu12 (566.8MiB)
3877
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3878
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3879
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3880
- Downloading numpy (16.2MiB)
3881
- Downloading nvidia-cufile-cu12 (1.1MiB)
3882
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3883
- Downloading nvidia-curand-cu12 (60.7MiB)
3884
- Downloading kiwisolver (1.4MiB)
3885
- Downloading torch (846.9MiB)
 
 
 
 
 
3886
  Downloading triton (148.3MiB)
 
 
 
3887
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3888
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3889
  Downloading nvidia-cufile-cu12
3890
  Downloading kiwisolver
3891
  Downloading setuptools
3892
- Downloading fonttools
3893
  Downloading networkx
 
3894
  Downloading pillow
3895
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3896
  Downloading nvidia-cuda-cupti-cu12
@@ -3906,10 +3941,10 @@ Downloading triton (148.3MiB)
3906
  Downloading nvidia-cusparselt-cu12
3907
  Downloading nvidia-cusparse-cu12
3908
  Downloading nvidia-nccl-cu12
3909
- Downloading nvidia-cublas-cu12
3910
  Downloading nvidia-cudnn-cu12
 
3911
  Downloading torch
3912
- Installed 37 packages in 567ms
3913
  </div>
3914
  </div>
3915
  <div class="cell-artifacts">
 
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
722
+ /* CSV table styling */
723
+ .artifact-csv {
724
+ margin-top: 1rem;
725
+ overflow-x: auto;
726
+ }
727
+ .csv-table {
728
+ width: 100%;
729
+ border-collapse: collapse;
730
+ font-size: 0.9rem;
731
+ background: var(--bg-secondary);
732
+ border: 1px solid var(--border-primary);
733
+ border-radius: 1px;
734
+ }
735
+ .csv-table th,
736
+ .csv-table td {
737
+ padding: 0.5rem 0.75rem;
738
+ text-align: left;
739
+ border: 1px solid var(--border-primary);
740
+ }
741
+ .csv-table th {
742
+ background: var(--bg-tertiary);
743
+ font-weight: 600;
744
+ color: var(--text-primary);
745
+ }
746
+ .csv-table tbody tr:hover {
747
+ background: var(--bg-artifact-hover);
748
+ }
749
+ .artifact-csv-error {
750
+ margin-top: 1rem;
751
+ padding: 1rem;
752
+ background: var(--bg-error);
753
+ color: var(--text-error);
754
+ border: 1px solid var(--border-error);
755
+ border-radius: 1px;
756
+ }
757
  .cell-failed {
758
  border-color: var(--border-cell-failed);
759
  }
 
3745
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3746
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3747
  </span> |
3748
+ Cell: nv | 0.70s
3749
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3750
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3751
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3761
  </div>
3762
  </div>
3763
  <div id="output-nv" class="cell-output">
3764
+ <div class="cell-stdout">Thu Oct 2 18:06:49 2025
3765
  +-----------------------------------------------------------------------------------------+
3766
  | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3767
  |-----------------------------------------+------------------------+----------------------+
 
3770
  | | | MIG M. |
3771
  |=========================================+========================+======================|
3772
  | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3773
+ | 0% 26C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3774
  | | | N/A |
3775
  +-----------------------------------------+------------------------+----------------------+
3776
  | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3777
+ | 0% 26C P8 26W / 300W | 0MiB / 23028MiB | 0% Default |
3778
  | | | N/A |
3779
  +-----------------------------------------+------------------------+----------------------+
3780
  | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3781
+ | 0% 26C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3782
  | | | N/A |
3783
  +-----------------------------------------+------------------------+----------------------+
3784
  | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3785
+ | 0% 27C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3786
  | | | N/A |
3787
  +-----------------------------------------+------------------------+----------------------+
3788
 
 
3806
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3807
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3808
  </span> |
3809
+ Cell: benchmark | 36.63s
3810
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3811
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3812
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3887
  <div id="output-benchmark" class="cell-output">
3888
  <div class="cell-stdout">impl wl p50(ms) ok
3889
  torch_flash_ma flux_L128 0.48 True
3890
+ torch_flash_ma flux_L256 0.52 True
3891
  torch_flash_ma flux_L320 0.65 True
3892
  torch_flash_ma flux_L384 0.68 True
3893
  torch_flash_ma flux_L448 0.71 True
 
3897
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3898
  <div class="uv-logs-content" style="display: none;">
3899
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 
 
 
3900
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
3901
  Downloading networkx (1.9MiB)
3902
+ Downloading kiwisolver (1.4MiB)
3903
+ Downloading nvidia-cufile-cu12 (1.1MiB)
3904
+ Downloading sympy (6.0MiB)
3905
+ Downloading nvidia-curand-cu12 (60.7MiB)
 
 
3906
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3907
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3908
  Downloading nvidia-cusolver-cu12 (255.1MiB)
 
 
3909
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3910
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3911
+ Downloading pillow (6.3MiB)
3912
+ Downloading numpy (16.2MiB)
3913
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3914
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3915
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
3916
+ Downloading setuptools (1.1MiB)
3917
+ Downloading matplotlib (8.3MiB)
3918
  Downloading triton (148.3MiB)
3919
+ Downloading fonttools (4.7MiB)
3920
+ Downloading torch (846.9MiB)
3921
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3922
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3923
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3924
  Downloading nvidia-cufile-cu12
3925
  Downloading kiwisolver
3926
  Downloading setuptools
 
3927
  Downloading networkx
3928
+ Downloading fonttools
3929
  Downloading pillow
3930
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3931
  Downloading nvidia-cuda-cupti-cu12
 
3941
  Downloading nvidia-cusparselt-cu12
3942
  Downloading nvidia-cusparse-cu12
3943
  Downloading nvidia-nccl-cu12
 
3944
  Downloading nvidia-cudnn-cu12
3945
+ Downloading nvidia-cublas-cu12
3946
  Downloading torch
3947
+ Installed 37 packages in 548ms
3948
  </div>
3949
  </div>
3950
  <div class="cell-artifacts">
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -719,6 +719,41 @@
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
  .cell-failed {
723
  border-color: var(--border-cell-failed);
724
  }
@@ -3710,7 +3745,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
- Cell: benchmark | 38.08s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3735,7 +3770,7 @@ Cell: benchmark | 38.08s
3735
  <span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
3736
  <span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>
3737
 
3738
- <span class="n">hf_kernels_flash_attn</span> <span class="o">=</span> <span class="n">get_kernel</span><span class="p">(</span><span class="s2">&quot;kernels-community/flash-attn&quot;</span><span class="p">)</span>
3739
 
3740
 
3741
  <span class="k">def</span><span class="w"> </span><span class="nf">hf_flash_attention</span><span class="p">(</span><span class="n">query</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
@@ -3797,39 +3832,39 @@ Cell: benchmark | 38.08s
3797
  <div id="output-benchmark" class="cell-output">
3798
  <div class="cell-stdout">impl wl p50(ms) ok
3799
  hf_kernels_flash_attn flux_L128 0.34 True
3800
- hf_kernels_flash_attn flux_L256 0.37 True
3801
  hf_kernels_flash_attn flux_L320 0.49 True
3802
  hf_kernels_flash_attn flux_L384 0.51 True
3803
- hf_kernels_flash_attn flux_L448 0.53 True
3804
- hf_kernels_flash_attn flux_L512 0.56 True
3805
  </div>
3806
  <div class="uv-install-logs" id="uv-logs-benchmark">
3807
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3808
  <div class="uv-logs-content" style="display: none;">
3809
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3810
- Downloading nvidia-cufft-cu12 (184.2MiB)
3811
- Downloading numpy (16.2MiB)
3812
- Downloading setuptools (1.1MiB)
3813
- Downloading hf-xet (3.0MiB)
3814
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3815
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3816
  Downloading nvidia-curand-cu12 (60.7MiB)
3817
  Downloading networkx (1.9MiB)
3818
- Downloading nvidia-nccl-cu12 (307.4MiB)
3819
  Downloading torch (846.9MiB)
3820
- Downloading nvidia-cufile-cu12 (1.1MiB)
3821
- Downloading triton (148.3MiB)
3822
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3823
- Downloading sympy (6.0MiB)
3824
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3825
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
3826
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3827
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
3828
  Downloading kiwisolver (1.4MiB)
3829
- Downloading nvidia-cublas-cu12 (566.8MiB)
3830
  Downloading pillow (6.3MiB)
 
 
 
 
3831
  Downloading fonttools (4.7MiB)
3832
- Downloading matplotlib (8.3MiB)
3833
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3834
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3835
  Downloading nvidia-cufile-cu12
@@ -3840,8 +3875,8 @@ Downloading matplotlib (8.3MiB)
3840
  Downloading fonttools
3841
  Downloading pillow
3842
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3843
- Downloading nvidia-cuda-cupti-cu12
3844
  Downloading matplotlib
 
3845
  Downloading numpy
3846
  Downloading sympy
3847
  Downloading nvidia-nvjitlink-cu12
@@ -3850,19 +3885,19 @@ Downloading matplotlib (8.3MiB)
3850
  Downloading triton
3851
  Downloading nvidia-cufft-cu12
3852
  Downloading nvidia-cusolver-cu12
3853
- Downloading nvidia-cusparselt-cu12
3854
  Downloading nvidia-cusparse-cu12
 
3855
  Downloading nvidia-nccl-cu12
3856
  Downloading nvidia-cublas-cu12
3857
  Downloading nvidia-cudnn-cu12
3858
  Downloading torch
3859
- Installed 47 packages in 519ms
3860
  </div>
3861
  </div>
3862
  <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
3863
- Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:06, 2.87it/s]
3864
- Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:12, 1.49it/s]
3865
- Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 16.01it/s]</div>
3866
  <div class="cell-artifacts">
3867
  <h4>Artifacts:</h4>
3868
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
722
+ /* CSV table styling */
723
+ .artifact-csv {
724
+ margin-top: 1rem;
725
+ overflow-x: auto;
726
+ }
727
+ .csv-table {
728
+ width: 100%;
729
+ border-collapse: collapse;
730
+ font-size: 0.9rem;
731
+ background: var(--bg-secondary);
732
+ border: 1px solid var(--border-primary);
733
+ border-radius: 1px;
734
+ }
735
+ .csv-table th,
736
+ .csv-table td {
737
+ padding: 0.5rem 0.75rem;
738
+ text-align: left;
739
+ border: 1px solid var(--border-primary);
740
+ }
741
+ .csv-table th {
742
+ background: var(--bg-tertiary);
743
+ font-weight: 600;
744
+ color: var(--text-primary);
745
+ }
746
+ .csv-table tbody tr:hover {
747
+ background: var(--bg-artifact-hover);
748
+ }
749
+ .artifact-csv-error {
750
+ margin-top: 1rem;
751
+ padding: 1rem;
752
+ background: var(--bg-error);
753
+ color: var(--text-error);
754
+ border: 1px solid var(--border-error);
755
+ border-radius: 1px;
756
+ }
757
  .cell-failed {
758
  border-color: var(--border-cell-failed);
759
  }
 
3745
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3746
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3747
  </span> |
3748
+ Cell: benchmark | 39.43s
3749
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3750
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3751
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3770
  <span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
3771
  <span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>
3772
 
3773
+ <span class="n">hf_kernels_flash_attn</span> <span class="o">=</span> <span class="n">get_kernel</span><span class="p">(</span><span class="s2">&quot;kernels-community/flash-attn&quot;</span><span class="p">,</span> <span class="n">revision</span><span class="o">=</span><span class="s2">&quot;v0.0.2&quot;</span><span class="p">)</span>
3774
 
3775
 
3776
  <span class="k">def</span><span class="w"> </span><span class="nf">hf_flash_attention</span><span class="p">(</span><span class="n">query</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
 
3832
  <div id="output-benchmark" class="cell-output">
3833
  <div class="cell-stdout">impl wl p50(ms) ok
3834
  hf_kernels_flash_attn flux_L128 0.34 True
3835
+ hf_kernels_flash_attn flux_L256 0.38 True
3836
  hf_kernels_flash_attn flux_L320 0.49 True
3837
  hf_kernels_flash_attn flux_L384 0.51 True
3838
+ hf_kernels_flash_attn flux_L448 0.54 True
3839
+ hf_kernels_flash_attn flux_L512 0.55 True
3840
  </div>
3841
  <div class="uv-install-logs" id="uv-logs-benchmark">
3842
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3843
  <div class="uv-logs-content" style="display: none;">
3844
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3845
+ Downloading sympy (6.0MiB)
3846
+ Downloading matplotlib (8.3MiB)
3847
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3848
+ Downloading nvidia-cufile-cu12 (1.1MiB)
 
 
3849
  Downloading nvidia-curand-cu12 (60.7MiB)
3850
  Downloading networkx (1.9MiB)
 
3851
  Downloading torch (846.9MiB)
3852
+ Downloading setuptools (1.1MiB)
 
 
 
3853
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3854
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3855
+ Downloading triton (148.3MiB)
3856
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3857
+ Downloading numpy (16.2MiB)
3858
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3859
  Downloading kiwisolver (1.4MiB)
3860
+ Downloading hf-xet (3.0MiB)
3861
  Downloading pillow (6.3MiB)
3862
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3863
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3864
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3865
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3866
  Downloading fonttools (4.7MiB)
3867
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3868
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3869
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3870
  Downloading nvidia-cufile-cu12
 
3875
  Downloading fonttools
3876
  Downloading pillow
3877
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
 
3878
  Downloading matplotlib
3879
+ Downloading nvidia-cuda-cupti-cu12
3880
  Downloading numpy
3881
  Downloading sympy
3882
  Downloading nvidia-nvjitlink-cu12
 
3885
  Downloading triton
3886
  Downloading nvidia-cufft-cu12
3887
  Downloading nvidia-cusolver-cu12
 
3888
  Downloading nvidia-cusparse-cu12
3889
+ Downloading nvidia-cusparselt-cu12
3890
  Downloading nvidia-nccl-cu12
3891
  Downloading nvidia-cublas-cu12
3892
  Downloading nvidia-cudnn-cu12
3893
  Downloading torch
3894
+ Installed 47 packages in 552ms
3895
  </div>
3896
  </div>
3897
  <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
3898
+ Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:03, 5.41it/s]
3899
+ Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:16, 1.09it/s]
3900
+ Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 12.37it/s]</div>
3901
  <div class="cell-artifacts">
3902
  <h4>Artifacts:</h4>
3903
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -719,6 +719,41 @@
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
  .cell-failed {
723
  border-color: var(--border-cell-failed);
724
  }
@@ -3710,7 +3745,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
- Cell: benchmark | 41.76s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3798,7 +3833,7 @@ Cell: benchmark | 41.76s
3798
  hf_kernels_flash_attn3 flux_L128 0.36 True
3799
  hf_kernels_flash_attn3 flux_L256 0.39 True
3800
  hf_kernels_flash_attn3 flux_L320 0.52 True
3801
- hf_kernels_flash_attn3 flux_L384 0.53 True
3802
  hf_kernels_flash_attn3 flux_L448 0.57 True
3803
  hf_kernels_flash_attn3 flux_L512 0.57 True
3804
  </div>
@@ -3806,29 +3841,29 @@ hf_kernels_flash_attn3 flux_L512 0.57 True
3806
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3807
  <div class="uv-logs-content" style="display: none;">
3808
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3809
- Downloading sympy (6.0MiB)
3810
- Downloading networkx (1.9MiB)
3811
  Downloading nvidia-cufile-cu12 (1.1MiB)
3812
- Downloading matplotlib (8.3MiB)
3813
  Downloading setuptools (1.1MiB)
3814
- Downloading fonttools (4.7MiB)
 
 
 
 
 
 
3815
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3816
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
3817
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3818
- Downloading nvidia-cublas-cu12 (566.8MiB)
3819
- Downloading nvidia-cufft-cu12 (184.2MiB)
3820
- Downloading numpy (16.2MiB)
3821
- Downloading nvidia-curand-cu12 (60.7MiB)
3822
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3823
- Downloading hf-xet (3.0MiB)
3824
- Downloading pillow (6.3MiB)
3825
  Downloading nvidia-cusolver-cu12 (255.1MiB)
 
 
3826
  Downloading kiwisolver (1.4MiB)
3827
- Downloading torch (846.9MiB)
3828
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3829
- Downloading nvidia-nccl-cu12 (307.4MiB)
3830
- Downloading triton (148.3MiB)
3831
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3832
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3833
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3834
  Downloading nvidia-cufile-cu12
@@ -3849,19 +3884,19 @@ Downloading nvidia-cusparse-cu12 (274.9MiB)
3849
  Downloading triton
3850
  Downloading nvidia-cufft-cu12
3851
  Downloading nvidia-cusolver-cu12
3852
- Downloading nvidia-cusparselt-cu12
3853
  Downloading nvidia-cusparse-cu12
 
3854
  Downloading nvidia-nccl-cu12
3855
  Downloading nvidia-cublas-cu12
3856
  Downloading nvidia-cudnn-cu12
3857
  Downloading torch
3858
- Installed 47 packages in 515ms
3859
  </div>
3860
  </div>
3861
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
3862
- Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 4.20it/s]
3863
  Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.09it/s]
3864
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.44it/s]</div>
3865
  <div class="cell-artifacts">
3866
  <h4>Artifacts:</h4>
3867
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
722
+ /* CSV table styling */
723
+ .artifact-csv {
724
+ margin-top: 1rem;
725
+ overflow-x: auto;
726
+ }
727
+ .csv-table {
728
+ width: 100%;
729
+ border-collapse: collapse;
730
+ font-size: 0.9rem;
731
+ background: var(--bg-secondary);
732
+ border: 1px solid var(--border-primary);
733
+ border-radius: 1px;
734
+ }
735
+ .csv-table th,
736
+ .csv-table td {
737
+ padding: 0.5rem 0.75rem;
738
+ text-align: left;
739
+ border: 1px solid var(--border-primary);
740
+ }
741
+ .csv-table th {
742
+ background: var(--bg-tertiary);
743
+ font-weight: 600;
744
+ color: var(--text-primary);
745
+ }
746
+ .csv-table tbody tr:hover {
747
+ background: var(--bg-artifact-hover);
748
+ }
749
+ .artifact-csv-error {
750
+ margin-top: 1rem;
751
+ padding: 1rem;
752
+ background: var(--bg-error);
753
+ color: var(--text-error);
754
+ border: 1px solid var(--border-error);
755
+ border-radius: 1px;
756
+ }
757
  .cell-failed {
758
  border-color: var(--border-cell-failed);
759
  }
 
3745
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3746
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3747
  </span> |
3748
+ Cell: benchmark | 39.41s
3749
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3750
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3751
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3833
  hf_kernels_flash_attn3 flux_L128 0.36 True
3834
  hf_kernels_flash_attn3 flux_L256 0.39 True
3835
  hf_kernels_flash_attn3 flux_L320 0.52 True
3836
+ hf_kernels_flash_attn3 flux_L384 0.52 True
3837
  hf_kernels_flash_attn3 flux_L448 0.57 True
3838
  hf_kernels_flash_attn3 flux_L512 0.57 True
3839
  </div>
 
3841
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3842
  <div class="uv-logs-content" style="display: none;">
3843
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 
 
3844
  Downloading nvidia-cufile-cu12 (1.1MiB)
3845
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3846
  Downloading setuptools (1.1MiB)
3847
+ Downloading nvidia-curand-cu12 (60.7MiB)
3848
+ Downloading pillow (6.3MiB)
3849
+ Downloading numpy (16.2MiB)
3850
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3851
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
3852
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3853
+ Downloading networkx (1.9MiB)
3854
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3855
+ Downloading sympy (6.0MiB)
3856
+ Downloading hf-xet (3.0MiB)
3857
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3858
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3859
+ Downloading torch (846.9MiB)
3860
+ Downloading triton (148.3MiB)
 
3861
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
3862
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3863
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3864
+ Downloading fonttools (4.7MiB)
3865
  Downloading kiwisolver (1.4MiB)
3866
+ Downloading matplotlib (8.3MiB)
 
 
 
 
3867
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3868
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3869
  Downloading nvidia-cufile-cu12
 
3884
  Downloading triton
3885
  Downloading nvidia-cufft-cu12
3886
  Downloading nvidia-cusolver-cu12
 
3887
  Downloading nvidia-cusparse-cu12
3888
+ Downloading nvidia-cusparselt-cu12
3889
  Downloading nvidia-nccl-cu12
3890
  Downloading nvidia-cublas-cu12
3891
  Downloading nvidia-cudnn-cu12
3892
  Downloading torch
3893
+ Installed 47 packages in 529ms
3894
  </div>
3895
  </div>
3896
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
3897
+ Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 6.35it/s]
3898
  Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.09it/s]
3899
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.48it/s]</div>
3900
  <div class="cell-artifacts">
3901
  <h4>Artifacts:</h4>
3902
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -719,6 +719,41 @@
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
  .cell-failed {
723
  border-color: var(--border-cell-failed);
724
  }
@@ -3710,7 +3745,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
- Cell: benchmark | 35.95s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3803,35 +3838,35 @@ torch_mem_eff flux_L512 0.95 True
3803
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3804
  <div class="uv-logs-content" style="display: none;">
3805
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3806
- Downloading nvidia-curand-cu12 (60.7MiB)
3807
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3808
- Downloading networkx (1.9MiB)
3809
  Downloading kiwisolver (1.4MiB)
3810
- Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
3811
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3812
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3813
- Downloading sympy (6.0MiB)
3814
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3815
- Downloading fonttools (4.7MiB)
3816
- Downloading nvidia-cublas-cu12 (566.8MiB)
3817
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
3818
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3819
- Downloading setuptools (1.1MiB)
3820
- Downloading matplotlib (8.3MiB)
3821
  Downloading nvidia-cufile-cu12 (1.1MiB)
3822
- Downloading numpy (16.2MiB)
3823
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3824
- Downloading nvidia-nccl-cu12 (307.4MiB)
3825
- Downloading torch (846.9MiB)
3826
  Downloading triton (148.3MiB)
3827
- Downloading pillow (6.3MiB)
3828
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3829
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3830
  Downloading nvidia-cufile-cu12
3831
  Downloading kiwisolver
3832
  Downloading setuptools
3833
- Downloading fonttools
3834
  Downloading networkx
 
3835
  Downloading pillow
3836
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3837
  Downloading nvidia-cuda-cupti-cu12
@@ -3850,7 +3885,7 @@ Downloading pillow (6.3MiB)
3850
  Downloading nvidia-cublas-cu12
3851
  Downloading nvidia-cudnn-cu12
3852
  Downloading torch
3853
- Installed 37 packages in 556ms
3854
  </div>
3855
  </div>
3856
  <div class="cell-artifacts">
 
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
722
+ /* CSV table styling */
723
+ .artifact-csv {
724
+ margin-top: 1rem;
725
+ overflow-x: auto;
726
+ }
727
+ .csv-table {
728
+ width: 100%;
729
+ border-collapse: collapse;
730
+ font-size: 0.9rem;
731
+ background: var(--bg-secondary);
732
+ border: 1px solid var(--border-primary);
733
+ border-radius: 1px;
734
+ }
735
+ .csv-table th,
736
+ .csv-table td {
737
+ padding: 0.5rem 0.75rem;
738
+ text-align: left;
739
+ border: 1px solid var(--border-primary);
740
+ }
741
+ .csv-table th {
742
+ background: var(--bg-tertiary);
743
+ font-weight: 600;
744
+ color: var(--text-primary);
745
+ }
746
+ .csv-table tbody tr:hover {
747
+ background: var(--bg-artifact-hover);
748
+ }
749
+ .artifact-csv-error {
750
+ margin-top: 1rem;
751
+ padding: 1rem;
752
+ background: var(--bg-error);
753
+ color: var(--text-error);
754
+ border: 1px solid var(--border-error);
755
+ border-radius: 1px;
756
+ }
757
  .cell-failed {
758
  border-color: var(--border-cell-failed);
759
  }
 
3745
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3746
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3747
  </span> |
3748
+ Cell: benchmark | 36.09s
3749
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3750
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3751
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3838
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3839
  <div class="uv-logs-content" style="display: none;">
3840
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3841
+ Downloading sympy (6.0MiB)
3842
+ Downloading setuptools (1.1MiB)
3843
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3844
  Downloading kiwisolver (1.4MiB)
3845
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3846
+ Downloading torch (846.9MiB)
3847
+ Downloading matplotlib (8.3MiB)
3848
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3849
+ Downloading pillow (6.3MiB)
 
3850
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3851
+ Downloading networkx (1.9MiB)
3852
+ Downloading numpy (16.2MiB)
3853
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3854
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3855
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3856
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3857
+ Downloading nvidia-curand-cu12 (60.7MiB)
 
3858
  Downloading nvidia-cufile-cu12 (1.1MiB)
3859
+ Downloading fonttools (4.7MiB)
3860
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3861
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
 
3862
  Downloading triton (148.3MiB)
 
3863
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3864
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3865
  Downloading nvidia-cufile-cu12
3866
  Downloading kiwisolver
3867
  Downloading setuptools
 
3868
  Downloading networkx
3869
+ Downloading fonttools
3870
  Downloading pillow
3871
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3872
  Downloading nvidia-cuda-cupti-cu12
 
3885
  Downloading nvidia-cublas-cu12
3886
  Downloading nvidia-cudnn-cu12
3887
  Downloading torch
3888
+ Installed 37 packages in 447ms
3889
  </div>
3890
  </div>
3891
  <div class="cell-artifacts">
flash_attn/impls/sage_attention.html CHANGED
@@ -719,6 +719,41 @@
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
  .cell-failed {
723
  border-color: var(--border-cell-failed);
724
  }
@@ -3710,7 +3745,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
- Cell: benchmark | 40.43s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3804,53 +3839,53 @@ Cell: benchmark | 40.43s
3804
  <div id="output-benchmark" class="cell-output">
3805
  <div class="cell-stdout">impl wl p50(ms) ok
3806
  sage_int8_fp16 flux_L128 FAIL False
3807
- Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
3808
  sage_int8_fp16 flux_L256 FAIL False
3809
- Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
3810
  sage_int8_fp16 flux_L320 FAIL False
3811
- Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
3812
  sage_int8_fp16 flux_L384 FAIL False
3813
- Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
3814
  sage_int8_fp16 flux_L448 FAIL False
3815
- Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
3816
  sage_int8_fp16 flux_L512 FAIL False
3817
- Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
3818
  </div>
3819
  <div class="uv-install-logs" id="uv-logs-benchmark">
3820
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3821
  <div class="uv-logs-content" style="display: none;">
3822
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 
3823
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3824
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3825
- Downloading networkx (1.9MiB)
3826
- Downloading setuptools (1.1MiB)
3827
  Downloading numpy (16.2MiB)
3828
  Downloading nvidia-cufile-cu12 (1.1MiB)
3829
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3830
- Downloading nvidia-curand-cu12 (60.7MiB)
3831
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3832
  Downloading hf-xet (3.0MiB)
3833
- Downloading torch (846.9MiB)
3834
- Downloading triton (148.3MiB)
3835
- Downloading nvidia-nccl-cu12 (307.4MiB)
3836
  Downloading nvidia-cublas-cu12 (566.8MiB)
3837
- Downloading kiwisolver (1.4MiB)
3838
  Downloading pillow (6.3MiB)
3839
- Downloading sympy (6.0MiB)
3840
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3841
- Downloading nvidia-cufft-cu12 (184.2MiB)
3842
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3843
- Downloading matplotlib (8.3MiB)
 
 
 
3844
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
 
 
3845
  Downloading fonttools (4.7MiB)
 
 
3846
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3847
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3848
  Downloading nvidia-cufile-cu12
3849
  Downloading kiwisolver
3850
  Downloading hf-xet
3851
  Downloading setuptools
3852
- Downloading networkx
3853
  Downloading fonttools
 
3854
  Downloading pillow
3855
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3856
  Downloading nvidia-cuda-cupti-cu12
@@ -3863,19 +3898,20 @@ Downloading fonttools (4.7MiB)
3863
  Downloading triton
3864
  Downloading nvidia-cufft-cu12
3865
  Downloading nvidia-cusolver-cu12
3866
- Downloading nvidia-cusparse-cu12
3867
  Downloading nvidia-cusparselt-cu12
 
3868
  Downloading nvidia-nccl-cu12
3869
  Downloading nvidia-cublas-cu12
3870
  Downloading nvidia-cudnn-cu12
3871
  Downloading torch
3872
- Installed 48 packages in 525ms
3873
  </div>
3874
  </div>
3875
  <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3876
- Fetching 11 files: 9%|▉ | 1/11 [00:00&lt;00:01, 5.55it/s]
3877
- Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 12.93it/s]
3878
- Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 16.93it/s]</div>
 
3879
  <div class="cell-artifacts">
3880
  <h4>Artifacts:</h4>
3881
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
722
+ /* CSV table styling */
723
+ .artifact-csv {
724
+ margin-top: 1rem;
725
+ overflow-x: auto;
726
+ }
727
+ .csv-table {
728
+ width: 100%;
729
+ border-collapse: collapse;
730
+ font-size: 0.9rem;
731
+ background: var(--bg-secondary);
732
+ border: 1px solid var(--border-primary);
733
+ border-radius: 1px;
734
+ }
735
+ .csv-table th,
736
+ .csv-table td {
737
+ padding: 0.5rem 0.75rem;
738
+ text-align: left;
739
+ border: 1px solid var(--border-primary);
740
+ }
741
+ .csv-table th {
742
+ background: var(--bg-tertiary);
743
+ font-weight: 600;
744
+ color: var(--text-primary);
745
+ }
746
+ .csv-table tbody tr:hover {
747
+ background: var(--bg-artifact-hover);
748
+ }
749
+ .artifact-csv-error {
750
+ margin-top: 1rem;
751
+ padding: 1rem;
752
+ background: var(--bg-error);
753
+ color: var(--text-error);
754
+ border: 1px solid var(--border-error);
755
+ border-radius: 1px;
756
+ }
757
  .cell-failed {
758
  border-color: var(--border-cell-failed);
759
  }
 
3745
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3746
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3747
  </span> |
3748
+ Cell: benchmark | 40.08s
3749
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3750
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3751
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3839
  <div id="output-benchmark" class="cell-output">
3840
  <div class="cell-stdout">impl wl p50(ms) ok
3841
  sage_int8_fp16 flux_L128 FAIL False
3842
+ Error: module &#x27;sage_attention_ba12545b014364be&#x27; has no attribute &#x27;fwd&#x27;
3843
  sage_int8_fp16 flux_L256 FAIL False
3844
+ Error: module &#x27;sage_attention_ba12545b014364be&#x27; has no attribute &#x27;fwd&#x27;
3845
  sage_int8_fp16 flux_L320 FAIL False
3846
+ Error: module &#x27;sage_attention_ba12545b014364be&#x27; has no attribute &#x27;fwd&#x27;
3847
  sage_int8_fp16 flux_L384 FAIL False
3848
+ Error: module &#x27;sage_attention_ba12545b014364be&#x27; has no attribute &#x27;fwd&#x27;
3849
  sage_int8_fp16 flux_L448 FAIL False
3850
+ Error: module &#x27;sage_attention_ba12545b014364be&#x27; has no attribute &#x27;fwd&#x27;
3851
  sage_int8_fp16 flux_L512 FAIL False
3852
+ Error: module &#x27;sage_attention_ba12545b014364be&#x27; has no attribute &#x27;fwd&#x27;
3853
  </div>
3854
  <div class="uv-install-logs" id="uv-logs-benchmark">
3855
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3856
  <div class="uv-logs-content" style="display: none;">
3857
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3858
+ Downloading sympy (6.0MiB)
3859
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3860
+ Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
3861
  Downloading numpy (16.2MiB)
3862
  Downloading nvidia-cufile-cu12 (1.1MiB)
3863
+ Downloading networkx (1.9MiB)
 
 
3864
  Downloading hf-xet (3.0MiB)
 
 
 
3865
  Downloading nvidia-cublas-cu12 (566.8MiB)
3866
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3867
  Downloading pillow (6.3MiB)
 
 
 
3868
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3869
+ Downloading triton (148.3MiB)
3870
+ Downloading setuptools (1.1MiB)
3871
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3872
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3873
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3874
+ Downloading matplotlib (8.3MiB)
3875
+ Downloading nvidia-curand-cu12 (60.7MiB)
3876
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3877
+ Downloading kiwisolver (1.4MiB)
3878
  Downloading fonttools (4.7MiB)
3879
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3880
+ Downloading torch (846.9MiB)
3881
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3882
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3883
  Downloading nvidia-cufile-cu12
3884
  Downloading kiwisolver
3885
  Downloading hf-xet
3886
  Downloading setuptools
 
3887
  Downloading fonttools
3888
+ Downloading networkx
3889
  Downloading pillow
3890
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3891
  Downloading nvidia-cuda-cupti-cu12
 
3898
  Downloading triton
3899
  Downloading nvidia-cufft-cu12
3900
  Downloading nvidia-cusolver-cu12
 
3901
  Downloading nvidia-cusparselt-cu12
3902
+ Downloading nvidia-cusparse-cu12
3903
  Downloading nvidia-nccl-cu12
3904
  Downloading nvidia-cublas-cu12
3905
  Downloading nvidia-cudnn-cu12
3906
  Downloading torch
3907
+ Installed 48 packages in 531ms
3908
  </div>
3909
  </div>
3910
  <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3911
+ Fetching 11 files: 9%|▉ | 1/11 [00:00&lt;00:02, 4.42it/s]
3912
+ Fetching 11 files: 27%|██▋ | 3/11 [00:00&lt;00:01, 5.95it/s]
3913
+ Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 12.27it/s]
3914
+ Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 14.37it/s]</div>
3915
  <div class="cell-artifacts">
3916
  <h4>Artifacts:</h4>
3917
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
flash_attn/impls/xformers.html CHANGED
@@ -719,6 +719,41 @@
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
  .cell-failed {
723
  border-color: var(--border-cell-failed);
724
  }
@@ -3710,7 +3745,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
- Cell: benchmark | 40.64s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3797,35 +3832,35 @@ xformers_meff flux_L256 0.47 True
3797
  xformers_meff flux_L320 0.60 True
3798
  xformers_meff flux_L384 0.60 True
3799
  xformers_meff flux_L448 0.64 True
3800
- xformers_meff flux_L512 0.64 True
3801
  </div>
3802
  <div class="uv-install-logs" id="uv-logs-benchmark">
3803
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3804
  <div class="uv-logs-content" style="display: none;">
3805
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3806
- Downloading networkx (1.9MiB)
3807
  Downloading nvidia-cusparse-cu12 (274.9MiB)
 
 
3808
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3809
- Downloading nvidia-cufft-cu12 (184.2MiB)
3810
  Downloading nvidia-curand-cu12 (60.7MiB)
3811
- Downloading triton (148.3MiB)
3812
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3813
- Downloading pillow (6.3MiB)
3814
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3815
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3816
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3817
- Downloading nvidia-cublas-cu12 (566.8MiB)
3818
  Downloading numpy (16.2MiB)
3819
  Downloading nvidia-nccl-cu12 (307.4MiB)
3820
- Downloading sympy (6.0MiB)
3821
- Downloading matplotlib (8.3MiB)
3822
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3823
- Downloading xformers (111.8MiB)
3824
  Downloading setuptools (1.1MiB)
 
3825
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
3826
  Downloading kiwisolver (1.4MiB)
3827
- Downloading fonttools (4.7MiB)
3828
  Downloading torch (846.9MiB)
 
 
 
 
 
3829
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3830
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3831
  Downloading nvidia-cufile-cu12
@@ -3837,8 +3872,8 @@ Downloading torch (846.9MiB)
3837
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3838
  Downloading nvidia-cuda-cupti-cu12
3839
  Downloading matplotlib
3840
- Downloading numpy
3841
  Downloading sympy
 
3842
  Downloading nvidia-nvjitlink-cu12
3843
  Downloading nvidia-curand-cu12
3844
  Downloading nvidia-cuda-nvrtc-cu12
@@ -3846,13 +3881,13 @@ Downloading torch (846.9MiB)
3846
  Downloading triton
3847
  Downloading nvidia-cufft-cu12
3848
  Downloading nvidia-cusolver-cu12
3849
- Downloading nvidia-cusparse-cu12
3850
  Downloading nvidia-cusparselt-cu12
 
3851
  Downloading nvidia-nccl-cu12
3852
  Downloading nvidia-cublas-cu12
3853
  Downloading nvidia-cudnn-cu12
3854
  Downloading torch
3855
- Installed 38 packages in 562ms
3856
  </div>
3857
  </div>
3858
  <div class="cell-artifacts">
 
719
  .artifact-preview svg {
720
  background: transparent;
721
  }
722
+ /* CSV table styling */
723
+ .artifact-csv {
724
+ margin-top: 1rem;
725
+ overflow-x: auto;
726
+ }
727
+ .csv-table {
728
+ width: 100%;
729
+ border-collapse: collapse;
730
+ font-size: 0.9rem;
731
+ background: var(--bg-secondary);
732
+ border: 1px solid var(--border-primary);
733
+ border-radius: 1px;
734
+ }
735
+ .csv-table th,
736
+ .csv-table td {
737
+ padding: 0.5rem 0.75rem;
738
+ text-align: left;
739
+ border: 1px solid var(--border-primary);
740
+ }
741
+ .csv-table th {
742
+ background: var(--bg-tertiary);
743
+ font-weight: 600;
744
+ color: var(--text-primary);
745
+ }
746
+ .csv-table tbody tr:hover {
747
+ background: var(--bg-artifact-hover);
748
+ }
749
+ .artifact-csv-error {
750
+ margin-top: 1rem;
751
+ padding: 1rem;
752
+ background: var(--bg-error);
753
+ color: var(--text-error);
754
+ border: 1px solid var(--border-error);
755
+ border-radius: 1px;
756
+ }
757
  .cell-failed {
758
  border-color: var(--border-cell-failed);
759
  }
 
3745
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3746
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3747
  </span> |
3748
+ Cell: benchmark | 40.41s
3749
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3750
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3751
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3832
  xformers_meff flux_L320 0.60 True
3833
  xformers_meff flux_L384 0.60 True
3834
  xformers_meff flux_L448 0.64 True
3835
+ xformers_meff flux_L512 0.65 True
3836
  </div>
3837
  <div class="uv-install-logs" id="uv-logs-benchmark">
3838
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3839
  <div class="uv-logs-content" style="display: none;">
3840
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 
3841
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3842
+ Downloading pillow (6.3MiB)
3843
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3844
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3845
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3846
  Downloading nvidia-curand-cu12 (60.7MiB)
 
 
 
 
3847
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3848
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3849
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3850
  Downloading numpy (16.2MiB)
3851
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
 
 
 
3852
  Downloading setuptools (1.1MiB)
3853
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3854
  Downloading nvidia-cufile-cu12 (1.1MiB)
3855
+ Downloading networkx (1.9MiB)
3856
  Downloading kiwisolver (1.4MiB)
3857
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3858
  Downloading torch (846.9MiB)
3859
+ Downloading matplotlib (8.3MiB)
3860
+ Downloading triton (148.3MiB)
3861
+ Downloading sympy (6.0MiB)
3862
+ Downloading fonttools (4.7MiB)
3863
+ Downloading xformers (111.8MiB)
3864
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3865
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3866
  Downloading nvidia-cufile-cu12
 
3872
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3873
  Downloading nvidia-cuda-cupti-cu12
3874
  Downloading matplotlib
 
3875
  Downloading sympy
3876
+ Downloading numpy
3877
  Downloading nvidia-nvjitlink-cu12
3878
  Downloading nvidia-curand-cu12
3879
  Downloading nvidia-cuda-nvrtc-cu12
 
3881
  Downloading triton
3882
  Downloading nvidia-cufft-cu12
3883
  Downloading nvidia-cusolver-cu12
 
3884
  Downloading nvidia-cusparselt-cu12
3885
+ Downloading nvidia-cusparse-cu12
3886
  Downloading nvidia-nccl-cu12
3887
  Downloading nvidia-cublas-cu12
3888
  Downloading nvidia-cudnn-cu12
3889
  Downloading torch
3890
+ Installed 38 packages in 452ms
3891
  </div>
3892
  </div>
3893
  <div class="cell-artifacts">
flash_attn/results/artifacts/combine/latency.csv ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Implementation,Impl ID,Workload,Batch,Seq Length,Heads,Head Dim,Dtype,Mean (ms),P10 (ms),P50 (ms),P90 (ms),Reps,Peak Mem (MB),Backend,Family
2
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L128,1,1152,24,128,bfloat16,0.48577280044555665,0.47836801409721375,0.4803520143032074,0.4827199876308441,5,83.38,FLASH,torch-sdpa
3
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L256,1,1280,24,128,bfloat16,0.5229184031486511,0.521727979183197,0.5228800177574158,0.5234559774398804,5,90.62,FLASH,torch-sdpa
4
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L320,1,1344,24,128,bfloat16,0.6515967845916748,0.6503999829292297,0.650879979133606,0.6513599753379822,5,95.06,FLASH,torch-sdpa
5
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L384,1,1408,24,128,bfloat16,0.6807615995407105,0.6797440052032471,0.6808639764785767,0.6815680265426636,5,99.88,FLASH,torch-sdpa
6
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L448,1,1472,24,128,bfloat16,0.711027193069458,0.7058879733085632,0.7121919989585876,0.7131519913673401,5,103.81,FLASH,torch-sdpa
7
+ Flash (PyTorch SDPA),torch_flash_ma,flux_L512,1,1536,24,128,bfloat16,0.7391423940658569,0.7369279861450195,0.7383999824523926,0.7408959865570068,5,109.12,FLASH,torch-sdpa
8
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L128,1,1152,24,128,bfloat16,0.5875647902488709,0.5863680243492126,0.5874559879302979,0.5876479744911194,5,83.38,EFFICIENT,torch-sdpa
9
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L256,1,1280,24,128,bfloat16,0.653657603263855,0.6485440135002136,0.6537600159645081,0.656544029712677,5,90.62,EFFICIENT,torch-sdpa
10
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L320,1,1344,24,128,bfloat16,0.7784512042999268,0.774944007396698,0.778656005859375,0.7801600098609924,5,95.94,EFFICIENT,torch-sdpa
11
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L384,1,1408,24,128,bfloat16,0.7922943949699401,0.791263997554779,0.7924799919128418,0.7927039861679077,5,100.0,EFFICIENT,torch-sdpa
12
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L448,1,1472,24,128,bfloat16,0.848089587688446,0.8444799780845642,0.8470079898834229,0.8499199748039246,5,103.81,EFFICIENT,torch-sdpa
13
+ MemEff (PyTorch SDPA),torch_mem_eff,flux_L512,1,1536,24,128,bfloat16,0.9523776054382325,0.95004802942276,0.9519039988517761,0.9541119933128357,5,109.12,EFFICIENT,torch-sdpa
14
+ xFormers,xformers_meff,flux_L128,1,1152,24,128,bfloat16,0.45066879987716674,0.4474239945411682,0.44921600818634033,0.45241600275039673,5,83.38,memory_efficient,xformers
15
+ xFormers,xformers_meff,flux_L256,1,1280,24,128,bfloat16,0.47004159688949587,0.4652479887008667,0.4705919921398163,0.4716799855232239,5,90.62,memory_efficient,xformers
16
+ xFormers,xformers_meff,flux_L320,1,1344,24,128,bfloat16,0.6022783994674683,0.5987840294837952,0.6021760106086731,0.6045759916305542,5,95.06,memory_efficient,xformers
17
+ xFormers,xformers_meff,flux_L384,1,1408,24,128,bfloat16,0.6013055920600892,0.6000319719314575,0.600383996963501,0.6016640067100525,5,99.88,memory_efficient,xformers
18
+ xFormers,xformers_meff,flux_L448,1,1472,24,128,bfloat16,0.6408192038536071,0.639136016368866,0.6404479742050171,0.6416320204734802,5,103.81,memory_efficient,xformers
19
+ xFormers,xformers_meff,flux_L512,1,1536,24,128,bfloat16,0.6466111898422241,0.6447359919548035,0.6462399959564209,0.6483839750289917,5,109.12,memory_efficient,xformers
20
+ Compiled (default),torch_flash_compiled_default,flux_L128,1,1152,24,128,bfloat16,0.527347207069397,0.5194560289382935,0.5272960066795349,0.5312960147857666,5,83.38,FLASH,torch-sdpa
21
+ Compiled (default),torch_flash_compiled_default,flux_L256,1,1280,24,128,bfloat16,0.5586367964744567,0.5560640096664429,0.5571519732475281,0.5611839890480042,5,90.62,FLASH,torch-sdpa
22
+ Compiled (default),torch_flash_compiled_default,flux_L320,1,1344,24,128,bfloat16,0.6860736012458801,0.6841920018196106,0.6860160231590271,0.6869760155677795,5,95.25,FLASH,torch-sdpa
23
+ Compiled (default),torch_flash_compiled_default,flux_L384,1,1408,24,128,bfloat16,0.7167360067367554,0.7152000069618225,0.7161920070648193,0.7164160013198853,5,99.88,FLASH,torch-sdpa
24
+ Compiled (default),torch_flash_compiled_default,flux_L448,1,1472,24,128,bfloat16,0.7423295855522156,0.7400959730148315,0.742143988609314,0.7431039810180664,5,103.81,FLASH,torch-sdpa
25
+ Compiled (default),torch_flash_compiled_default,flux_L512,1,1536,24,128,bfloat16,0.7743871927261352,0.7718080282211304,0.7745919823646545,0.7748159766197205,5,109.12,FLASH,torch-sdpa
26
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L128,1,1152,24,128,bfloat16,0.6489088058471679,0.6148160099983215,0.6296960115432739,0.6522240042686462,5,67.5,FLASH,torch-sdpa
27
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L256,1,1280,24,128,bfloat16,0.700761592388153,0.6615359783172607,0.6821119785308838,0.7128959894180298,5,75.0,FLASH,torch-sdpa
28
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L320,1,1344,24,128,bfloat16,0.834444797039032,0.7967039942741394,0.8164799809455872,0.8463680148124695,5,80.38,FLASH,torch-sdpa
29
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L384,1,1408,24,128,bfloat16,0.8709375977516174,0.8432319760322571,0.8498560190200806,0.8750079870223999,5,82.5,FLASH,torch-sdpa
30
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L448,1,1472,24,128,bfloat16,0.9069631934165955,0.8775359988212585,0.9030719995498657,0.903872013092041,5,86.25,FLASH,torch-sdpa
31
+ Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L512,1,1536,24,128,bfloat16,0.9371584057807922,0.9145920276641846,0.9164159893989563,0.9357439875602722,5,90.0,FLASH,torch-sdpa
32
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L128,1,1152,24,128,bfloat16,0.34446719884872434,0.3438720107078552,0.3445119857788086,0.34457600116729736,5,83.38,flash-attn,hf-kernels
33
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L256,1,1280,24,128,bfloat16,0.37571839094161985,0.37404799461364746,0.3763839900493622,0.3766399919986725,5,90.62,flash-attn,hf-kernels
34
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L320,1,1344,24,128,bfloat16,0.4945920050144196,0.4925439953804016,0.493120014667511,0.4938240051269531,5,95.06,flash-attn,hf-kernels
35
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L384,1,1408,24,128,bfloat16,0.5139647841453552,0.5123199820518494,0.5142719745635986,0.5147839784622192,5,99.88,flash-attn,hf-kernels
36
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L448,1,1472,24,128,bfloat16,0.5353855967521668,0.5339199900627136,0.5350080132484436,0.5352320075035095,5,103.81,flash-attn,hf-kernels
37
+ HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L512,1,1536,24,128,bfloat16,0.5548800110816956,0.5538560152053833,0.5548800230026245,0.5553280115127563,5,109.12,flash-attn,hf-kernels
38
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L128,1,1152,24,128,bfloat16,0.3617343962192535,0.36102399230003357,0.3616960048675537,0.36211198568344116,5,83.38,flash-attn3,hf-kernels
39
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L256,1,1280,24,128,bfloat16,0.3907967984676361,0.3885439932346344,0.39056000113487244,0.3906239867210388,5,90.62,flash-attn3,hf-kernels
40
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L320,1,1344,24,128,bfloat16,0.5228991985321045,0.521344006061554,0.5230720043182373,0.5232319831848145,5,95.06,flash-attn3,hf-kernels
41
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L384,1,1408,24,128,bfloat16,0.5254656076431274,0.523904025554657,0.5249919891357422,0.526528000831604,5,99.88,flash-attn3,hf-kernels
42
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L448,1,1472,24,128,bfloat16,0.5646592020988465,0.5627840161323547,0.565343976020813,0.565343976020813,5,103.81,flash-attn3,hf-kernels
43
+ HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L512,1,1536,24,128,bfloat16,0.5698879957199097,0.567359983921051,0.5696640014648438,0.5698559880256653,5,109.12,flash-attn3,hf-kernels
flash_attn/results/artifacts/combine/latency.svg ADDED

Git LFS Details

  • SHA256: 587d477bc9dc161a51c012142295b5a6efa33e48054fc657106ca27ba64b8683
  • Pointer size: 130 Bytes
  • Size of remote file: 28.3 kB
flash_attn/results/cells/combine.py CHANGED
@@ -10,13 +10,173 @@
10
  # [tool.uv.sources]
11
  # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
12
  # ///
13
- import torch
14
- import sys
15
  import os
16
- import kernels_benchmark_tools as kbt
17
  from pathlib import Path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- # Discover the upstream artifact directories from environment variables
20
  cache_dirs = {
21
  "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
22
  "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
@@ -34,8 +194,6 @@ for name, cache_dir in cache_dirs.items():
34
  print(f"{name:30s}: {cache_dir}")
35
  print()
36
 
37
- # Collect all JSONL paths
38
- all_paths = []
39
  file_mapping = {
40
  "Flash (PyTorch SDPA)": "attn.jsonl",
41
  "MemEff (PyTorch SDPA)": "attn.jsonl",
@@ -48,10 +206,10 @@ file_mapping = {
48
  "HF Kernels Flash Attn3": "attn.jsonl",
49
  }
50
 
 
51
  for name, cache_dir in cache_dirs.items():
52
  if cache_dir:
53
- jsonl_file = file_mapping[name]
54
- path = Path(cache_dir) / jsonl_file
55
  if path.exists() and path.stat().st_size > 0:
56
  all_paths.append(str(path))
57
  print(f"✓ Found {name}: {path}")
@@ -59,30 +217,40 @@ for name, cache_dir in cache_dirs.items():
59
  print(f"⊘ Empty/Missing {name}: {path}")
60
  else:
61
  print(f"✗ No cache dir for {name}")
62
-
63
  print()
64
 
65
  if not all_paths:
66
  print("ERROR: No benchmark data files found!")
 
 
 
67
  sys.exit(1)
68
 
69
- # Generate combined summary
70
- print("COMBINED BENCHMARK SUMMARY")
71
- print()
72
-
73
  kbt.summarize(all_paths)
74
-
75
- print()
76
- print("GENERATING COMBINED VISUALIZATION")
77
- print()
78
 
79
  try:
 
 
80
  kbt.viz(all_paths)
81
- print("✓ Combined visualization saved as latency.png")
 
 
 
 
 
 
 
82
  except ImportError as e:
83
  print(f"✗ Visualization requires matplotlib: {e}")
84
  except Exception as e:
85
  print(f"✗ Visualization failed: {e}")
 
 
 
 
86
 
87
  print()
88
  print("ANALYSIS COMPLETE")
@@ -90,7 +258,62 @@ print(f"Total implementations analyzed: {len(all_paths)}")
90
  print(f"\nImplementations included:")
91
  for name, cache_dir in cache_dirs.items():
92
  if cache_dir:
93
- jsonl_file = file_mapping[name]
94
- path = Path(cache_dir) / jsonl_file
95
  if path.exists() and path.stat().st_size > 0:
96
- print(f" ✓ {name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  # [tool.uv.sources]
11
  # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
12
  # ///
 
 
13
  import os
14
+ import sys
15
  from pathlib import Path
16
+ import json
17
+ import torch # noqa: F401 # imported because upstream may expect torch to be importable
18
+ import kernels_benchmark_tools as kbt
19
+
20
+ # --- Matplotlib setup and helpers ------------------------------------------------
21
+ import matplotlib as mpl
22
+ import matplotlib.pyplot as plt
23
+ import csv
24
+
25
+
26
+ # Keep text as text (not paths) so CSS can style fonts, size, etc.
27
+ mpl.rcParams["svg.fonttype"] = "none"
28
+ # Make ids deterministic across builds
29
+ mpl.rcParams["svg.hashsalt"] = "latency-benchmark-combined"
30
+ # Apply tight layout automatically so labels and legends are not clipped
31
+ mpl.rcParams["figure.autolayout"] = True
32
+ # Make background transparent
33
+ mpl.rcParams["figure.facecolor"] = "none"
34
+ mpl.rcParams["axes.facecolor"] = "none"
35
+ mpl.rcParams["savefig.facecolor"] = "none"
36
+ mpl.rcParams["savefig.edgecolor"] = "none"
37
+
38
def _slugify(s: str) -> str:
    """Turn an arbitrary label into a lowercase, hyphen-separated id fragment.

    Alphanumeric characters are kept, the separator characters
    (space, ``-``, ``_``, ``/``, ``.``, ``:``) become hyphens, and every other
    character is dropped. Runs of hyphens are collapsed and leading/trailing
    hyphens are removed.

    Args:
        s: The label to convert. ``None`` and the empty string are accepted.

    Returns:
        The slug, or ``"unnamed"`` when nothing usable remains.
    """
    separators = " -_/.:"
    text = (s or "").strip().lower()
    mapped = "".join(
        ch if ch.isalnum() else "-"
        for ch in text
        if ch.isalnum() or ch in separators
    )
    # Splitting on "-" and discarding the empty pieces collapses hyphen runs
    # and trims both ends in a single pass.
    return "-".join(piece for piece in mapped.split("-") if piece) or "unnamed"
52
+
53
def _tag_current_figure(default_series_prefix="series"):
    """Attach SVG ids (gid) to key artists so they can be targeted from CSS.

    Operates on the figure returned by ``plt.gcf()`` and sets a gid on the
    figure, each axes, the axes title and axis labels, grid lines, the legend
    and its text entries, and every line and patch.

    Args:
        default_series_prefix: Name used for lines/patches whose label starts
            with ``"_"`` (Matplotlib's convention for non-legend artists) and
            for patches without a label.
    """
    # plt.gcf() always returns a figure (it creates one if needed), so no
    # None check is required here.
    fig = plt.gcf()
    fig.set_gid("figure--latency")

    for ax_idx, ax in enumerate(fig.get_axes(), start=1):
        ax.set_gid(f"axes--{ax_idx}")

        # The title is held by ``ax.title``; it is not part of ``ax.texts``,
        # so searching ``ax.texts`` for it would never find it.
        if ax.get_title():
            ax.title.set_gid("title--main")
        if ax.xaxis and ax.xaxis.get_label():
            ax.xaxis.label.set_gid("label--x")
        if ax.yaxis and ax.yaxis.get_label():
            ax.yaxis.label.set_gid("label--y")

        # Gridlines
        for i, gl in enumerate(ax.get_xgridlines(), start=1):
            gl.set_gid(f"grid-x--{i}")
        for i, gl in enumerate(ax.get_ygridlines(), start=1):
            gl.set_gid(f"grid-y--{i}")

        # Legend block & entries
        leg = ax.get_legend()
        if leg is not None:
            leg.set_gid("legend")
            for i, txt in enumerate(leg.get_texts(), start=1):
                label_slug = _slugify(txt.get_text())
                txt.set_gid(f"legend-label--{label_slug or i}")

        # Lines: repeated slugs get a numeric suffix (-2, -3, ...) so ids
        # stay distinct within one axes.
        line_seen = {}
        for ln in getattr(ax, "lines", []):
            raw_label = ln.get_label() or ""
            # Matplotlib uses labels beginning with "_" for non-legendable items
            label = default_series_prefix if raw_label.startswith("_") else raw_label
            slug = _slugify(label)
            line_seen[slug] = line_seen.get(slug, 0) + 1
            suffix = "" if line_seen[slug] == 1 else f"-{line_seen[slug]}"
            ln.set_gid(f"series--{slug}{suffix}")

        # Patches (bars, areas): same scheme, with its own counter.
        patch_seen = {}
        for pt in getattr(ax, "patches", []):
            label = getattr(pt, "get_label", lambda: "")() or f"{default_series_prefix}"
            if isinstance(label, str) and label.startswith("_"):
                label = default_series_prefix
            slug = _slugify(label)
            patch_seen[slug] = patch_seen.get(slug, 0) + 1
            suffix = "" if patch_seen[slug] == 1 else f"-{patch_seen[slug]}"
            pt.set_gid(f"series--{slug}{suffix}")
111
+
112
def _postprocess_svg_add_classes(svg_path: Path):
    """Add convenient CSS classes alongside ids (e.g., class='series grid grid-x').

    Parses the SVG at ``svg_path``, appends a class to every element whose
    ``id`` starts with one of the known prefixes, and rewrites the file in
    place. Any failure is reported with a printed message and the file is
    left as it was; nothing is raised.

    Args:
        svg_path: Path of the SVG file to rewrite.
    """
    # First matching prefix wins; order mirrors the id scheme used when tagging.
    prefix_classes = (
        ("figure--", ["figure"]),
        ("axes--", ["axes"]),
        ("grid-x--", ["grid", "grid-x"]),
        ("grid-y--", ["grid", "grid-y"]),
        ("legend", ["legend"]),
        ("label--x", ["xlabel"]),
        ("label--y", ["ylabel"]),
        ("title--", ["title"]),
        ("series--", ["series"]),
    )
    try:
        import xml.etree.ElementTree as ET

        ET.register_namespace("", "http://www.w3.org/2000/svg")
        # Without this, ElementTree serializes xlink attributes with a
        # generated prefix (ns0:href), which breaks consumers that expect the
        # conventional "xlink:href" spelling.
        # NOTE(review): presumably the SVG is inlined into HTML, where only
        # the literal "xlink:" prefix is recognized - confirm.
        ET.register_namespace("xlink", "http://www.w3.org/1999/xlink")

        tree = ET.parse(svg_path)
        for el in tree.getroot().iter():
            el_id = el.attrib.get("id", "")
            if not el_id:
                continue
            cls = next(
                (names for prefix, names in prefix_classes if el_id.startswith(prefix)),
                None,
            )
            if cls:
                # Preserve any existing class (unlikely from Matplotlib)
                existing = el.attrib.get("class", "")
                el.set("class", (existing + " " + " ".join(cls)).strip())
        tree.write(svg_path, encoding="utf-8", xml_declaration=True)
    except Exception as e:
        print(f"✗ SVG postprocess (classes) skipped: {e}")
149
+
150
+ # Monkey-patch savefig to force SVG & ensure tagging occurs even if kbt.viz saves internally.
151
+ _orig_savefig = plt.savefig
152
def _savefig_svg(fname, *args, **kwargs):
    """Replacement for ``plt.savefig`` that always writes ``latency.svg``.

    The requested ``fname`` is ignored and the output format is forced to
    SVG. Before saving, the current figure's artists are tagged with ids;
    after saving, CSS classes are added to the written file.

    Returns:
        Whatever the original ``savefig`` returned.
    """
    # Stable output path for the artifact system, regardless of ``fname``.
    target = Path("latency.svg")
    # Ids must be attached before export so they end up in the SVG.
    _tag_current_figure()
    result = _orig_savefig(target, *args, **{**kwargs, "format": "svg"})
    # Layer CSS classes on top of the ids in the written file.
    _postprocess_svg_add_classes(target)
    print(f"✓ Combined visualization saved as {target}")
    return result
163
+
164
+ plt.savefig = _savefig_svg # apply patch
165
+
166
+ # Capture close calls in case kbt.viz() closes figures before we re-save
167
+ _orig_close = plt.close
168
+ _last_closed = {"fig": None}
169
def _capture_close(arg=None):
    """Record the figure being closed, then delegate to the original close.

    If ``arg`` looks like a Figure (has ``savefig``) it is remembered in
    ``_last_closed``; otherwise the current figure is remembered, but only
    when a figure is actually open. Recording is best-effort: a failure
    there is reported and never prevents the close itself.

    Returns:
        Whatever the original ``plt.close`` returned.
    """
    try:
        if hasattr(arg, "savefig"):  # looks like a Figure
            _last_closed["fig"] = arg
        elif plt.get_fignums():
            # Only consult gcf() when a figure exists; calling it with no
            # open figures would create a new empty one as a side effect.
            _last_closed["fig"] = plt.gcf()
    except Exception as e:
        # Report instead of silently discarding via return-in-finally.
        print(f"✗ Could not record closed figure: {e}")
    return _orig_close(arg)
177
+ plt.close = _capture_close
178
 
179
+ # --- Locate benchmark artifacts --------------------------------------------------
180
  cache_dirs = {
181
  "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
182
  "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
 
194
  print(f"{name:30s}: {cache_dir}")
195
  print()
196
 
 
 
197
  file_mapping = {
198
  "Flash (PyTorch SDPA)": "attn.jsonl",
199
  "MemEff (PyTorch SDPA)": "attn.jsonl",
 
206
  "HF Kernels Flash Attn3": "attn.jsonl",
207
  }
208
 
209
+ all_paths = []
210
  for name, cache_dir in cache_dirs.items():
211
  if cache_dir:
212
+ path = Path(cache_dir) / file_mapping[name]
 
213
  if path.exists() and path.stat().st_size > 0:
214
  all_paths.append(str(path))
215
  print(f"✓ Found {name}: {path}")
 
217
  print(f"⊘ Empty/Missing {name}: {path}")
218
  else:
219
  print(f"✗ No cache dir for {name}")
 
220
  print()
221
 
222
  if not all_paths:
223
  print("ERROR: No benchmark data files found!")
224
+ # restore patched functions before exiting
225
+ plt.savefig = _orig_savefig
226
+ plt.close = _orig_close
227
  sys.exit(1)
228
 
229
+ # --- Summary + Visualization -----------------------------------------------------
230
+ print("COMBINED BENCHMARK SUMMARY\n")
 
 
231
  kbt.summarize(all_paths)
232
+ print("\nGENERATING COMBINED VISUALIZATION\n")
 
 
 
233
 
234
  try:
235
+ # If kbt.viz saves internally, our patched savefig ensures SVG gets written,
236
+ # and it will carry ids/classes for CSS styling.
237
  kbt.viz(all_paths)
238
+ # Safety net: if kbt.viz didn't save, save now.
239
+ # if not Path("latency.svg").exists():
240
+ # _tag_current_figure()
241
+ # plt.savefig("latency.svg")
242
+
243
+ plt.savefig("latency.svg") # ensure saved with tagging
244
+
245
+ print("✓ SVG visualization ready: latency.svg!")
246
  except ImportError as e:
247
  print(f"✗ Visualization requires matplotlib: {e}")
248
  except Exception as e:
249
  print(f"✗ Visualization failed: {e}")
250
+ finally:
251
+ # Clean up patches to avoid side effects in later cells
252
+ plt.savefig = _orig_savefig
253
+ plt.close = _orig_close
254
 
255
  print()
256
  print("ANALYSIS COMPLETE")
 
258
  print(f"\nImplementations included:")
259
  for name, cache_dir in cache_dirs.items():
260
  if cache_dir:
261
+ path = Path(cache_dir) / file_mapping[name]
 
262
  if path.exists() and path.stat().st_size > 0:
263
+ print(f" ✓ {name}")
264
+
265
+
266
+
267
# Collect all benchmark records, keyed by implementation display name.
all_data = {}
for name, cache_dir in cache_dirs.items():
    if not cache_dir:
        continue
    path = Path(cache_dir) / file_mapping[name]
    if path.exists() and path.stat().st_size > 0:
        with open(path, "r", encoding="utf-8") as f:
            # Skip blank lines so a stray empty line does not abort the export.
            all_data[name] = [json.loads(line) for line in f if line.strip()]

# Export to CSV: one row per (implementation, workload) record.
csv_path = Path("latency.csv")
with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)

    # Write header
    header = ["Implementation", "Impl ID", "Workload", "Batch", "Seq Length", "Heads", "Head Dim", "Dtype",
              "Mean (ms)", "P10 (ms)", "P50 (ms)", "P90 (ms)", "Reps",
              "Peak Mem (MB)", "Backend", "Family"]
    writer.writerow(header)

    # Write data rows
    for impl_name, records in all_data.items():
        for record in records:
            # ``or {}`` also covers keys that are present but null.
            wl = record.get("wl") or {}
            lat = record.get("lat_ms") or {}
            tags = record.get("tags") or {}
            peak_bytes = record.get("peak_bytes")

            row = [
                impl_name,
                record.get("impl", ""),
                wl.get("name", ""),
                wl.get("batch", ""),
                wl.get("seq_len", ""),
                wl.get("heads", ""),
                wl.get("head_dim", ""),
                wl.get("dtype", ""),
                lat.get("mean", ""),
                lat.get("p10", ""),
                lat.get("p50", ""),
                lat.get("p90", ""),
                lat.get("reps", ""),
                # Bytes -> MiB, two decimals; blank when missing or zero.
                round(peak_bytes / 1024 / 1024, 2) if peak_bytes else "",
                tags.get("backend", ""),
                tags.get("family", ""),
            ]
            writer.writerow(row)

print(f"✓ CSV export complete: {csv_path}")
print(f"Total implementations: {len(all_data)}")
print(f"Total records: {sum(len(records) for records in all_data.values())}")
flash_attn/results/combined_results.html CHANGED
The diff for this file is too large to render. See raw diff