# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Load the flash attention kernel once at module import so every call to the
# benchmarked function reuses the same kernel object.
# NOTE(review): presumably `get_kernel` fetches/caches the build from the
# Hugging Face Hub repo "kernels-community/flash-attn2" — confirm against the
# `kernels` package documentation.
hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn2")
def hf_flash_attention(query, key, value):
    """Run the HuggingFace Kernels flash-attention forward pass.

    Calls ``fwd`` on the module-level ``hf_kernels_flash_attn`` kernel with
    ``is_causal=False`` and returns element ``0`` of whatever ``fwd`` gives
    back.

    NOTE(review): element 0 is presumably the attention output tensor and the
    remaining elements auxiliary values — confirm against the kernel's ``fwd``
    signature.
    """
    # Keep the full forward result in a named local so the indexing is explicit.
    fwd_result = hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)
    return fwd_result[0]
# Hand this implementation to the shared benchmark harness for the attention
# kernel type. `impl_name` is the label that identifies this implementation in
# the harness output; `impl_func` is the callable under test.
# NOTE(review): `impl_tags` looks like free-form metadata attached to the
# results ("compile": "none" presumably meaning no torch.compile) — confirm
# against kernels_benchmark_tools.
run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_name="hf_kernels_flash_attn",
    impl_tags={"family": "hf-kernels", "backend": "flash-attn", "compile": "none"},
    impl_func=hf_flash_attention,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 3.32% 167.194us 47.96% 2.415ms 2.415ms 0.000us 0.00% 3.817ms 3.817ms 1
_flash_attn_9e27194::fwd 1.37% 69.029us 44.64% 2.247ms 749.145us 2.847ms 100.00% 3.817ms 1.272ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.849ms 100.05% 2.849ms 2.849ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.847ms 100.00% 2.847ms 949.099us 3
Activity Buffer Request 39.70% 1.999ms 39.70% 1.999ms 1.999ms 970.081us 34.07% 970.081us 970.081us 1
cudaDeviceGetAttribute 0.09% 4.410us 0.09% 4.410us 0.294us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.38% 19.301us 1.08% 54.311us 18.104us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.70% 35.010us 0.70% 35.010us 11.670us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.51% 25.771us 0.51% 25.771us 2.863us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 1.06% 53.231us 1.06% 53.231us 17.744us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.83% 41.840us 0.83% 41.840us 13.947us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 52.04% 2.620ms 52.04% 2.620ms 2.620ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.035ms
Self CUDA time total: 2.847ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 1.71% 88.920us 43.78% 2.280ms 2.280ms 0.000us 0.00% 4.110ms 4.110ms 1
_flash_attn_9e27194::fwd 0.90% 46.653us 42.07% 2.191ms 730.229us 3.068ms 100.00% 4.110ms 1.370ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.070ms 100.05% 3.070ms 3.070ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.068ms 100.00% 3.068ms 1.023ms 3
Activity Buffer Request 39.69% 2.067ms 39.69% 2.067ms 2.067ms 1.041ms 33.93% 1.041ms 1.041ms 1
cudaDeviceGetAttribute 0.07% 3.649us 0.07% 3.649us 0.243us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.14% 7.310us 0.43% 22.581us 7.527us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.29% 15.271us 0.29% 15.271us 5.090us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.41% 21.500us 0.41% 21.500us 2.389us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.07% 3.620us 0.07% 3.620us 1.207us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.50% 25.800us 0.50% 25.800us 8.600us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 56.22% 2.927ms 56.22% 2.927ms 2.927ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.207ms
Self CUDA time total: 3.068ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 1.71% 88.010us 40.24% 2.065ms 2.065ms 0.000us 0.00% 4.290ms 4.290ms 1
_flash_attn_9e27194::fwd 1.03% 52.730us 38.53% 1.977ms 659.108us 3.209ms 100.00% 4.290ms 1.430ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.211ms 100.05% 3.211ms 3.211ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.209ms 100.00% 3.209ms 1.070ms 3
Activity Buffer Request 35.96% 1.846ms 35.96% 1.846ms 1.846ms 1.081ms 33.68% 1.081ms 1.081ms 1
cudaDeviceGetAttribute 0.07% 3.699us 0.07% 3.699us 0.247us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.13% 6.760us 0.45% 22.961us 7.654us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.32% 16.201us 0.32% 16.201us 5.400us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.41% 20.833us 0.41% 20.833us 2.315us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.07% 3.580us 0.07% 3.580us 1.193us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.54% 27.851us 0.54% 27.851us 9.284us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 59.76% 3.067ms 59.76% 3.067ms 3.067ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.132ms
Self CUDA time total: 3.209ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.41% 90.762us 19.01% 717.141us 717.141us 0.000us 0.00% 4.279ms 4.279ms 1
_flash_attn_9e27194::fwd 1.23% 46.533us 16.60% 626.379us 208.793us 3.197ms 100.00% 4.279ms 1.426ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.199ms 100.05% 3.199ms 3.199ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.197ms 100.00% 3.197ms 1.066ms 3
Activity Buffer Request 7.66% 288.965us 7.66% 288.965us 288.965us 1.082ms 33.83% 1.082ms 1.082ms 1
cudaDeviceGetAttribute 0.10% 3.648us 0.10% 3.648us 0.243us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.18% 6.920us 0.61% 22.930us 7.643us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.42% 16.010us 0.42% 16.010us 5.337us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.56% 21.260us 0.56% 21.260us 2.362us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.10% 3.650us 0.10% 3.650us 1.217us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 6.35% 239.393us 6.35% 239.393us 79.798us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 80.99% 3.055ms 80.99% 3.055ms 3.055ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.772ms
Self CUDA time total: 3.197ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 1.57% 90.561us 37.72% 2.178ms 2.178ms 0.000us 0.00% 4.999ms 4.999ms 1
_flash_attn_9e27194::fwd 0.83% 48.040us 36.16% 2.087ms 695.661us 3.741ms 100.00% 4.999ms 1.666ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.743ms 100.05% 3.743ms 3.743ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.741ms 100.00% 3.741ms 1.247ms 3
Activity Buffer Request 30.45% 1.758ms 30.45% 1.758ms 1.758ms 1.258ms 33.63% 1.258ms 1.258ms 1
cudaDeviceGetAttribute 0.06% 3.722us 0.06% 3.722us 0.248us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.14% 7.831us 0.41% 23.771us 7.924us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.28% 15.940us 0.28% 15.940us 5.313us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.36% 20.578us 0.36% 20.578us 2.286us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.06% 3.590us 0.06% 3.590us 1.197us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.98% 229.604us 3.98% 229.604us 76.535us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 62.28% 3.595ms 62.28% 3.595ms 3.595ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.772ms
Self CUDA time total: 3.741ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.13% 89.030us 15.70% 656.370us 656.370us 0.000us 0.00% 4.900ms 4.900ms 1
_flash_attn_9e27194::fwd 1.15% 48.015us 13.57% 567.340us 189.113us 3.667ms 100.00% 4.900ms 1.633ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.669ms 100.04% 3.669ms 3.669ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.667ms 100.00% 3.667ms 1.222ms 3
Activity Buffer Request 5.94% 248.154us 5.94% 248.154us 248.154us 1.233ms 33.62% 1.233ms 1.233ms 1
cudaDeviceGetAttribute 0.08% 3.539us 0.08% 3.539us 0.236us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.16% 6.860us 0.56% 23.209us 7.736us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.39% 16.349us 0.39% 16.349us 5.450us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.49% 20.571us 0.49% 20.571us 2.286us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.09% 3.630us 0.09% 3.630us 1.210us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 5.27% 220.222us 5.27% 220.222us 73.407us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 84.30% 3.524ms 84.30% 3.524ms 3.524ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.180ms
Self CUDA time total: 3.667ms
impl wl p50(ms) ok
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.99 True
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.04 True
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.07 True
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.26 True
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.25 True
▶ UV Install Logs
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
Fetching 20 files: 10%|█ | 2/20 [00:01<00:17, 1.01it/s]
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 10.06it/s]