# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Load the flash attention 3 kernel
hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
def hf_flash_attention3(query, key, value):
return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
run_benchmark(
kernel_type=KernelTypeEnum.ATTENTION,
impl_name="hf_kernels_flash_attn3",
impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
impl_func=hf_flash_attention3,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 3.53% 162.212us 48.20% 2.217ms 2.217ms 0.000us 0.00% 3.575ms 3.575ms 1
FlashAttnFunc 2.60% 119.532us 44.67% 2.055ms 684.947us 0.000us 0.00% 3.575ms 1.192ms 3
_flash_attn3_48fe103_dirty::fwd 1.56% 71.632us 42.08% 1.935ms 645.103us 2.671ms 100.00% 3.575ms 1.192ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.672ms 100.06% 2.672ms 2.672ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.671ms 100.00% 2.671ms 890.241us 3
Activity Buffer Request 38.25% 1.759ms 38.25% 1.759ms 1.759ms 904.001us 33.85% 904.001us 904.001us 1
aten::empty 0.93% 42.731us 0.93% 42.731us 7.122us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.32% 14.640us 0.32% 14.640us 4.880us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.03% 47.150us 1.03% 47.150us 15.717us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 51.80% 2.383ms 51.80% 2.383ms 2.383ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.600ms
Self CUDA time total: 2.671ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.14% 101.412us 45.76% 2.172ms 2.172ms 0.000us 0.00% 3.747ms 3.747ms 1
FlashAttnFunc 1.91% 90.691us 43.62% 2.071ms 690.247us 0.000us 0.00% 3.747ms 1.249ms 3
_flash_attn3_48fe103_dirty::fwd 1.11% 52.911us 41.71% 1.980ms 660.016us 2.794ms 100.00% 3.747ms 1.249ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.796ms 100.06% 2.796ms 2.796ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.794ms 100.00% 2.794ms 931.376us 3
Activity Buffer Request 39.32% 1.866ms 39.32% 1.866ms 1.866ms 953.126us 34.11% 953.126us 953.126us 1
aten::empty 0.55% 26.341us 0.55% 26.341us 4.390us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.160us 0.11% 5.160us 1.720us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.62% 29.260us 0.62% 29.260us 9.753us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 54.24% 2.575ms 54.24% 2.575ms 2.575ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.747ms
Self CUDA time total: 2.794ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.17% 102.652us 42.70% 2.019ms 2.019ms 0.000us 0.00% 3.920ms 3.920ms 1
FlashAttnFunc 1.91% 90.472us 40.53% 1.916ms 638.683us 0.000us 0.00% 3.920ms 1.307ms 3
_flash_attn3_48fe103_dirty::fwd 0.99% 47.030us 38.62% 1.826ms 608.525us 2.928ms 100.00% 3.920ms 1.307ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.930ms 100.05% 2.930ms 2.930ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.928ms 100.00% 2.928ms 976.037us 3
Activity Buffer Request 36.27% 1.715ms 36.27% 1.715ms 1.715ms 991.995us 33.88% 991.995us 991.995us 1
aten::empty 0.57% 26.980us 0.57% 26.980us 4.497us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 4.990us 0.11% 4.990us 1.663us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.68% 32.070us 0.68% 32.070us 10.690us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 57.30% 2.709ms 57.30% 2.709ms 2.709ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.728ms
Self CUDA time total: 2.928ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.33% 117.613us 45.39% 2.290ms 2.290ms 0.000us 0.00% 3.984ms 3.984ms 1
FlashAttnFunc 1.82% 91.609us 43.06% 2.172ms 724.120us 0.000us 0.00% 3.984ms 1.328ms 3
_flash_attn3_48fe103_dirty::fwd 0.95% 47.941us 41.24% 2.081ms 693.584us 2.967ms 100.00% 3.984ms 1.328ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.968ms 100.05% 2.968ms 2.968ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.967ms 100.00% 2.967ms 988.843us 3
Activity Buffer Request 35.42% 1.787ms 35.42% 1.787ms 1.787ms 1.017ms 34.30% 1.017ms 1.017ms 1
aten::empty 0.56% 28.180us 0.56% 28.180us 4.697us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.080us 0.10% 5.080us 1.693us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 4.21% 212.544us 4.21% 212.544us 70.848us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 54.61% 2.755ms 54.61% 2.755ms 2.755ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.045ms
Self CUDA time total: 2.967ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.35% 128.980us 39.64% 2.179ms 2.179ms 0.000us 0.00% 4.722ms 4.722ms 1
FlashAttnFunc 1.64% 90.214us 37.30% 2.050ms 683.484us 0.000us 0.00% 4.722ms 1.574ms 3
_flash_attn3_48fe103_dirty::fwd 0.87% 47.980us 35.66% 1.960ms 653.413us 3.530ms 100.00% 4.722ms 1.574ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.532ms 100.04% 3.532ms 3.532ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.530ms 100.00% 3.530ms 1.177ms 3
Activity Buffer Request 31.21% 1.716ms 31.21% 1.716ms 1.716ms 1.192ms 33.75% 1.192ms 1.192ms 1
aten::empty 0.49% 26.830us 0.49% 26.830us 4.472us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.09% 5.100us 0.09% 5.100us 1.700us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 2.99% 164.492us 2.99% 164.492us 54.831us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 60.36% 3.318ms 60.36% 3.318ms 3.318ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.497ms
Self CUDA time total: 3.530ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.16% 118.543us 39.14% 2.150ms 2.150ms 0.000us 0.00% 4.736ms 4.736ms 1
FlashAttnFunc 1.66% 91.361us 36.98% 2.032ms 677.186us 0.000us 0.00% 4.736ms 1.579ms 3
_flash_attn3_48fe103_dirty::fwd 0.85% 46.593us 35.32% 1.940ms 646.733us 3.555ms 100.00% 4.736ms 1.579ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.556ms 100.04% 3.556ms 3.556ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.555ms 100.00% 3.555ms 1.185ms 3
Activity Buffer Request 30.64% 1.683ms 30.64% 1.683ms 1.683ms 1.181ms 33.22% 1.181ms 1.181ms 1
aten::empty 0.50% 27.560us 0.50% 27.560us 4.593us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.09% 5.069us 0.09% 5.069us 1.690us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.23% 177.672us 3.23% 177.672us 59.224us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 60.86% 3.344ms 60.86% 3.344ms 3.344ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.494ms
Self CUDA time total: 3.555ms
impl wl p50(ms) ok
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.05 True
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.04 True
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.22 True
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.21 True
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.28it/s]
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.57it/s]