# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Resolve the flash-attention kernel by its repository id via `kernels.get_kernel`.
# The returned object is kept at module level so the wrapper below can call it.
hf_kernels_flash_attn = get_kernel(
    "kernels-community/flash-attn",
)
def hf_flash_attention(query, key, value):
    """Run non-causal flash attention through the loaded HF kernel.

    Forwards ``query``, ``key`` and ``value`` unchanged to the kernel's
    ``fwd`` entry point with ``is_causal=False`` and returns the first
    element of what ``fwd`` gives back.
    """
    fwd_outputs = hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)
    # Only element 0 is handed back; presumably this is the attention
    # output and the remaining elements are auxiliary -- TODO confirm.
    return fwd_outputs[0]
# Hand the wrapper defined above to the benchmark harness as an attention
# implementation, identified by its name and a small set of string tags.
run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_name="hf_kernels_flash_attn",
    impl_tags={
        "family": "hf-kernels",
        "backend": "flash-attn",
        "compile": "none",
    },
    impl_func=hf_flash_attention,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 3.89% 173.532us 41.54% 1.852ms 1.852ms 0.000us 0.00% 3.821ms 3.821ms 1
_flash_attn_9e27194::fwd 1.71% 76.382us 37.65% 1.679ms 559.513us 2.851ms 100.00% 3.821ms 1.274ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.852ms 100.05% 2.852ms 2.852ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.851ms 100.00% 2.851ms 950.289us 3
Activity Buffer Request 32.53% 1.450ms 32.53% 1.450ms 1.450ms 970.364us 34.04% 970.364us 970.364us 1
cudaDeviceGetAttribute 0.10% 4.520us 0.10% 4.520us 0.301us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.46% 20.440us 1.29% 57.461us 19.154us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.83% 37.021us 0.83% 37.021us 12.340us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.76% 33.730us 0.76% 33.730us 3.748us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.29% 12.870us 0.29% 12.870us 4.290us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.97% 43.280us 0.97% 43.280us 14.427us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 58.46% 2.606ms 58.46% 2.606ms 2.606ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.458ms
Self CUDA time total: 2.851ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.32% 104.162us 37.24% 1.676ms 1.676ms 0.000us 0.00% 4.000ms 4.000ms 1
_flash_attn_9e27194::fwd 1.05% 47.052us 34.93% 1.571ms 523.812us 2.988ms 100.00% 4.000ms 1.333ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.989ms 100.04% 2.989ms 2.989ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.988ms 100.00% 2.988ms 995.942us 3
Activity Buffer Request 32.02% 1.441ms 32.02% 1.441ms 1.441ms 1.012ms 33.87% 1.012ms 1.012ms 1
cudaDeviceGetAttribute 0.10% 4.331us 0.10% 4.331us 0.289us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.16% 7.210us 0.52% 23.350us 7.783us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.36% 16.140us 0.36% 16.140us 5.380us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.47% 21.320us 0.47% 21.320us 2.369us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.10% 4.349us 0.10% 4.349us 1.450us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.67% 30.329us 0.67% 30.329us 10.110us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 62.76% 2.824ms 62.76% 2.824ms 2.824ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.499ms
Self CUDA time total: 2.988ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.58% 116.241us 37.17% 1.677ms 1.677ms 0.000us 0.00% 4.040ms 4.040ms 1
_flash_attn_9e27194::fwd 1.11% 49.909us 34.60% 1.561ms 520.326us 3.012ms 100.00% 4.040ms 1.347ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.013ms 100.04% 3.013ms 3.013ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.012ms 100.00% 3.012ms 1.004ms 3
Activity Buffer Request 31.60% 1.426ms 31.60% 1.426ms 1.426ms 1.029ms 34.16% 1.029ms 1.029ms 1
cudaDeviceGetAttribute 0.08% 3.801us 0.08% 3.801us 0.253us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.18% 8.151us 0.55% 24.960us 8.320us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.37% 16.809us 0.37% 16.809us 5.603us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.47% 21.201us 0.47% 21.201us 2.356us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.09% 3.950us 0.09% 3.950us 1.317us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.69% 31.260us 0.69% 31.260us 10.420us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 62.83% 2.835ms 62.83% 2.835ms 2.835ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.512ms
Self CUDA time total: 3.012ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.01% 99.212us 38.53% 1.898ms 1.898ms 0.000us 0.00% 4.264ms 4.264ms 1
_flash_attn_9e27194::fwd 1.06% 52.152us 36.51% 1.799ms 599.723us 3.190ms 100.00% 4.264ms 1.421ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.191ms 100.05% 3.191ms 3.191ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.190ms 100.00% 3.190ms 1.063ms 3
Activity Buffer Request 28.82% 1.420ms 28.82% 1.420ms 1.420ms 1.074ms 33.68% 1.074ms 1.074ms 1
cudaDeviceGetAttribute 0.09% 4.479us 0.09% 4.479us 0.299us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.16% 7.900us 0.54% 26.470us 8.823us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.38% 18.570us 0.38% 18.570us 6.190us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.46% 22.430us 0.46% 22.430us 2.492us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.08% 3.830us 0.08% 3.830us 1.277us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 5.47% 269.763us 5.47% 269.763us 89.921us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 61.47% 3.029ms 61.47% 3.029ms 3.029ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.928ms
Self CUDA time total: 3.190ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.16% 88.971us 14.91% 614.057us 614.057us 0.000us 0.00% 4.875ms 4.875ms 1
_flash_attn_9e27194::fwd 1.23% 50.539us 12.75% 525.086us 175.029us 3.652ms 100.00% 4.875ms 1.625ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.653ms 100.04% 3.653ms 3.653ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.652ms 100.00% 3.652ms 1.217ms 3
Activity Buffer Request 5.08% 209.112us 5.08% 209.112us 209.112us 1.223ms 33.50% 1.223ms 1.223ms 1
cudaDeviceGetAttribute 0.10% 3.960us 0.10% 3.960us 0.264us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.19% 7.749us 0.60% 24.700us 8.233us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.41% 16.951us 0.41% 16.951us 5.650us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.54% 22.121us 0.54% 22.121us 2.458us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.10% 4.190us 0.10% 4.190us 1.397us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 5.11% 210.464us 5.11% 210.464us 70.155us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 85.09% 3.504ms 85.09% 3.504ms 3.504ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.118ms
Self CUDA time total: 3.652ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.23% 91.402us 14.65% 600.857us 600.857us 0.000us 0.00% 4.881ms 4.881ms 1
_flash_attn_9e27194::fwd 1.15% 47.191us 12.42% 509.455us 169.818us 3.654ms 100.00% 4.881ms 1.627ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 100.04% 3.655ms 3.655ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.654ms 100.00% 3.654ms 1.218ms 3
Activity Buffer Request 5.38% 220.623us 5.38% 220.623us 220.623us 1.227ms 33.59% 1.227ms 1.227ms 1
cudaDeviceGetAttribute 0.09% 3.601us 0.09% 3.601us 0.240us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.18% 7.230us 0.58% 23.840us 7.947us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.40% 16.610us 0.40% 16.610us 5.537us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.51% 20.851us 0.51% 20.851us 2.317us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.09% 3.688us 0.09% 3.688us 1.229us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 4.62% 189.661us 4.62% 189.661us 63.220us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 85.35% 3.502ms 85.35% 3.502ms 3.502ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.103ms
Self CUDA time total: 3.654ms
impl wl p50(ms) ok
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.98 True
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.02 True
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.07 True
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.23 True
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
▶ UV Install Logs
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
Fetching 20 files: 10%|█ | 2/20 [00:01<00:12, 1.43it/s]
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 14.34it/s]