| {"ts": "2025-10-23T16:39:18Z", "run": "1329c4d4ac744bda875c16c4784efd4e", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.3605679958127439, "p50": 2.7313290047459304, "p90": 2.7496689872350544, "mean": 1.9229636003728956, "iqr": 1.743196975439787, "raw_times": [2.766780002275482, 2.7313290047459304, 2.7496689872350544, 1.0064720117952675, 0.3605679958127439], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 2.621017018100247, "peak_bytes": 101842944, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null} | |
| {"ts": "2025-10-23T16:39:18Z", "run": "1329c4d4ac744bda875c16c4784efd4e", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.789678015280515, "p50": 1.0204220016021281, "p90": 2.805980999255553, "mean": 1.6569862025789917, "iqr": 1.9910630071535707, "raw_times": [0.789678015280515, 0.8149179921019822, 1.0204220016021281, 2.805980999255553, 2.85393200465478], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.7163150003179908, "peak_bytes": 113639424, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null} | |
| {"ts": "2025-10-23T16:39:19Z", "run": "1329c4d4ac744bda875c16c4784efd4e", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.8689190144650638, "p50": 2.8084310179110616, "p90": 2.8393420216161758, "mean": 2.434495009947568, "iqr": 0.049981026677414775, "raw_times": [0.8689190144650638, 2.8664220008067787, 2.8084310179110616, 2.8393420216161758, 2.789360994938761], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.761545990826562, "peak_bytes": 116785152, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null} | |
| {"ts": "2025-10-23T16:39:20Z", "run": "1329c4d4ac744bda875c16c4784efd4e", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": {"p10": 0.5894530040677637, "p50": 0.5925440054852515, "p90": 0.6658439815510064, "mean": 1.0438627970870584, "iqr": 0.0752209743950516, "raw_times": [2.7808499871753156, 0.6658439815510064, 0.5925440054852515, 0.5894530040677637, 0.5906230071559548], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 2.9009229911025614, "peak_bytes": 119013376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null} | |
| {"ts": "2025-10-23T16:39:20Z", "run": "1329c4d4ac744bda875c16c4784efd4e", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}} | |
| {"ts": "2025-10-23T16:39:20Z", "run": "1329c4d4ac744bda875c16c4784efd4e", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L4", "sm": "8.9", "py": "3.12.7", "plat": "Linux-5.15.0-1084-aws-x86_64-with-glibc2.31"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}} | |