# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch

from kernels_benchmark_tools import KernelTypeEnum, run_benchmark

def torch_flash(q, k, v):
    # Inputs are assumed to be (batch, seq_len, num_heads, head_dim); SDPA
    # expects (batch, num_heads, seq_len, head_dim), so swap the middle dims.
    qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
    # Pin SDPA to the FlashAttention backend so the benchmark measures that
    # kernel specifically, rather than whichever backend the dispatcher picks.
    with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
        o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
    # Transpose back to the caller's original layout.
    return o.transpose(1, 2).contiguous()
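
# A minimal smoke test of the wrapper above, assuming run_benchmark feeds it
# (batch, seq_len, num_heads, head_dim) tensors. FlashAttention requires a
# CUDA device and half precision, so this is skipped on CPU-only machines;
# the shapes here are illustrative, not the benchmark's actual workload.
if torch.cuda.is_available():
    q, k, v = (torch.randn(1, 128, 8, 64, device="cuda", dtype=torch.float16) for _ in range(3))
    assert torch_flash(q, k, v).shape == q.shape
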
run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_name="torch_flash_ma",
    impl_tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
    impl_func=torch_flash,
)