# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Resolve the Flash Attention 3 kernel module once at import time so the
# benchmarked wrapper below only measures the attention call itself.
# NOTE(review): get_kernel presumably downloads the kernel files on first use
# (the run log shows files being fetched) — confirm against the kernels docs.
hf_kernels_flash_attn3 = get_kernel(
    "kernels-community/flash-attn3",
)
def hf_flash_attention3(query, key, value):
    """Run non-causal attention through the loaded Flash Attention 3 kernel.

    Forwards ``query``, ``key`` and ``value`` unchanged to the kernel
    module's ``flash_attn_func`` with ``causal=False`` and returns element 0
    of whatever that call produces.
    """
    # NOTE(review): element 0 is presumably the attention output of a
    # tuple-like return value — confirm against the kernel's flash_attn_func.
    kernel_result = hf_kernels_flash_attn3.flash_attn_func(
        query,
        key,
        value,
        causal=False,
    )
    return kernel_result[0]
# Benchmark the wrapper as an attention kernel; the name and tags below only
# label this implementation in the reported results.
run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_name="hf_kernels_flash_attn3",
    impl_tags={
        "family": "hf-kernels",
        "backend": "flash-attn3",
        "compile": "none",
    },
    impl_func=hf_flash_attention3,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 3.71% 164.893us 43.76% 1.944ms 1.944ms 0.000us 0.00% 3.688ms 3.688ms 1
FlashAttnFunc 2.67% 118.403us 40.05% 1.779ms 593.141us 0.000us 0.00% 3.688ms 1.229ms 3
_flash_attn3_48fe103_dirty::fwd 1.75% 77.922us 37.39% 1.661ms 553.673us 2.790ms 100.00% 3.688ms 1.229ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.791ms 100.05% 2.791ms 2.791ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.790ms 100.00% 2.790ms 929.856us 3
Activity Buffer Request 33.30% 1.480ms 33.30% 1.480ms 1.480ms 898.016us 32.19% 898.016us 898.016us 1
aten::empty 1.01% 44.942us 1.01% 44.942us 7.490us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.31% 13.870us 0.31% 13.870us 4.623us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.01% 44.741us 1.01% 44.741us 14.914us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 56.24% 2.499ms 56.24% 2.499ms 2.499ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.443ms
Self CUDA time total: 2.790ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.31% 100.671us 40.75% 1.773ms 1.773ms 0.000us 0.00% 3.735ms 3.735ms 1
FlashAttnFunc 2.09% 91.144us 38.44% 1.673ms 557.547us 0.000us 0.00% 3.735ms 1.245ms 3
_flash_attn3_48fe103_dirty::fwd 1.16% 50.371us 36.34% 1.581ms 527.165us 2.796ms 100.00% 3.735ms 1.245ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.798ms 100.06% 2.798ms 2.798ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.796ms 100.00% 2.796ms 932.000us 3
Activity Buffer Request 33.75% 1.469ms 33.75% 1.469ms 1.469ms 939.487us 33.60% 939.487us 939.487us 1
aten::empty 0.64% 27.720us 0.64% 27.720us 4.620us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 4.991us 0.11% 4.991us 1.664us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.68% 29.510us 0.68% 29.510us 9.837us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 59.25% 2.578ms 59.25% 2.578ms 2.578ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.352ms
Self CUDA time total: 2.796ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.10% 95.451us 39.98% 1.817ms 1.817ms 0.000us 0.00% 3.967ms 3.967ms 1
FlashAttnFunc 2.52% 114.605us 37.88% 1.721ms 573.824us 0.000us 0.00% 3.967ms 1.322ms 3
_flash_attn3_48fe103_dirty::fwd 1.12% 50.981us 35.36% 1.607ms 535.622us 2.964ms 100.00% 3.967ms 1.322ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.966ms 100.05% 2.966ms 2.966ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.964ms 100.00% 2.964ms 988.118us 3
Activity Buffer Request 32.83% 1.492ms 32.83% 1.492ms 1.492ms 1.002ms 33.81% 1.002ms 1.002ms 1
aten::empty 0.60% 27.089us 0.60% 27.089us 4.515us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 5.480us 0.12% 5.480us 1.827us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.69% 31.551us 0.69% 31.551us 10.517us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 60.02% 2.727ms 60.02% 2.727ms 2.727ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.544ms
Self CUDA time total: 2.964ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.35% 113.792us 41.57% 2.016ms 2.016ms 0.000us 0.00% 4.078ms 4.078ms 1
FlashAttnFunc 1.91% 92.684us 39.22% 1.902ms 634.112us 0.000us 0.00% 4.078ms 1.359ms 3
_flash_attn3_48fe103_dirty::fwd 0.98% 47.600us 37.31% 1.810ms 603.217us 3.050ms 100.00% 4.078ms 1.359ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.052ms 100.05% 3.052ms 3.052ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.050ms 100.00% 3.050ms 1.017ms 3
Activity Buffer Request 30.19% 1.464ms 30.19% 1.464ms 1.464ms 1.028ms 33.70% 1.028ms 1.028ms 1
aten::empty 0.58% 28.221us 0.58% 28.221us 4.703us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.430us 0.11% 5.430us 1.810us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 5.44% 264.046us 5.44% 264.046us 88.015us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 58.43% 2.834ms 58.43% 2.834ms 2.834ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.851ms
Self CUDA time total: 3.050ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.29% 116.152us 37.60% 1.908ms 1.908ms 0.000us 0.00% 4.514ms 4.514ms 1
FlashAttnFunc 1.78% 90.384us 35.31% 1.792ms 597.414us 0.000us 0.00% 4.514ms 1.505ms 3
_flash_attn3_48fe103_dirty::fwd 0.91% 46.231us 33.53% 1.702ms 567.286us 3.379ms 100.00% 4.514ms 1.505ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.380ms 100.05% 3.380ms 3.380ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.379ms 100.00% 3.379ms 1.126ms 3
Activity Buffer Request 28.41% 1.442ms 28.41% 1.442ms 1.442ms 1.136ms 33.61% 1.136ms 1.136ms 1
aten::empty 0.54% 27.250us 0.54% 27.250us 4.542us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.250us 0.10% 5.250us 1.750us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.57% 181.204us 3.57% 181.204us 60.401us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 62.40% 3.167ms 62.40% 3.167ms 3.167ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.075ms
Self CUDA time total: 3.379ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.24% 115.243us 39.36% 2.021ms 2.021ms 0.000us 0.00% 4.438ms 4.438ms 1
FlashAttnFunc 1.78% 91.262us 37.12% 1.906ms 635.278us 0.000us 0.00% 4.438ms 1.479ms 3
_flash_attn3_48fe103_dirty::fwd 0.90% 46.212us 35.34% 1.815ms 604.857us 3.325ms 100.00% 4.438ms 1.479ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.327ms 100.04% 3.327ms 3.327ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.325ms 100.00% 3.325ms 1.108ms 3
Activity Buffer Request 30.40% 1.561ms 30.40% 1.561ms 1.561ms 1.113ms 33.46% 1.113ms 1.113ms 1
aten::empty 0.54% 27.780us 0.54% 27.780us 4.630us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.330us 0.10% 5.330us 1.777us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.40% 174.454us 3.40% 174.454us 58.151us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 60.64% 3.113ms 60.64% 3.113ms 3.113ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.134ms
Self CUDA time total: 3.325ms
impl wl p50(ms) ok
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 1.00 True
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.99 True
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.19 True
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.15 True
▶ UV Install Logs
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.44it/s]
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.88it/s]