# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Load the flash attention 3 kernel
hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
def hf_flash_attention3(query, key, value):
return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
run_benchmark(
kernel_type=KernelTypeEnum.ATTENTION,
impl_name="hf_kernels_flash_attn3",
impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
impl_func=hf_flash_attention3,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 3.90% 171.143us 44.22% 1.941ms 1.941ms 0.000us 0.00% 3.653ms 3.653ms 1
FlashAttnFunc 2.92% 128.011us 40.32% 1.769ms 589.788us 0.000us 0.00% 3.653ms 1.218ms 3
_flash_attn3_48fe103_dirty::fwd 1.90% 83.422us 37.41% 1.641ms 547.118us 2.755ms 100.00% 3.653ms 1.218ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.756ms 100.05% 2.756ms 2.756ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.755ms 100.00% 2.755ms 918.306us 3
Activity Buffer Request 33.13% 1.454ms 33.13% 1.454ms 1.454ms 898.082us 32.60% 898.082us 898.082us 1
aten::empty 1.02% 44.762us 1.02% 44.762us 7.460us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.33% 14.660us 0.33% 14.660us 4.887us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.02% 44.660us 1.02% 44.660us 14.887us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 55.78% 2.447ms 55.78% 2.447ms 2.447ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.388ms
Self CUDA time total: 2.755ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.42% 105.470us 40.03% 1.743ms 1.743ms 0.000us 0.00% 3.784ms 3.784ms 1
FlashAttnFunc 2.12% 92.121us 37.61% 1.638ms 546.005us 0.000us 0.00% 3.784ms 1.261ms 3
_flash_attn3_48fe103_dirty::fwd 1.23% 53.460us 35.49% 1.546ms 515.298us 2.836ms 100.00% 3.784ms 1.261ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.838ms 100.05% 2.838ms 2.838ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.836ms 100.00% 2.836ms 945.359us 3
Activity Buffer Request 32.85% 1.431ms 32.85% 1.431ms 1.431ms 947.652us 33.41% 947.652us 947.652us 1
aten::empty 0.62% 27.052us 0.62% 27.052us 4.509us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 4.721us 0.11% 4.721us 1.574us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.68% 29.730us 0.68% 29.730us 9.910us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 59.97% 2.612ms 59.97% 2.612ms 2.612ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.355ms
Self CUDA time total: 2.836ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.34% 104.112us 39.68% 1.767ms 1.767ms 0.000us 0.00% 3.931ms 3.931ms 1
FlashAttnFunc 2.59% 115.143us 37.35% 1.662ms 554.155us 0.000us 0.00% 3.931ms 1.310ms 3
_flash_attn3_48fe103_dirty::fwd 1.23% 54.772us 34.76% 1.547ms 515.774us 2.932ms 100.00% 3.931ms 1.310ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.934ms 100.05% 2.934ms 2.934ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.00% 2.932ms 977.432us 3
Activity Buffer Request 32.05% 1.427ms 32.05% 1.427ms 1.427ms 998.487us 34.05% 998.487us 998.487us 1
aten::empty 0.66% 29.309us 0.66% 29.309us 4.885us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 4.840us 0.11% 4.840us 1.613us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.71% 31.520us 0.71% 31.520us 10.507us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 60.32% 2.685ms 60.32% 2.685ms 2.685ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.452ms
Self CUDA time total: 2.932ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.48% 118.391us 41.58% 1.983ms 1.983ms 0.000us 0.00% 4.029ms 4.029ms 1
FlashAttnFunc 2.00% 95.232us 39.09% 1.865ms 621.579us 0.000us 0.00% 4.029ms 1.343ms 3
_flash_attn3_48fe103_dirty::fwd 1.18% 56.301us 37.10% 1.770ms 589.835us 3.014ms 100.00% 4.029ms 1.343ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.016ms 100.06% 3.016ms 3.016ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.014ms 100.00% 3.014ms 1.005ms 3
Activity Buffer Request 30.19% 1.440ms 30.19% 1.440ms 1.440ms 1.015ms 33.67% 1.015ms 1.015ms 1
aten::empty 0.58% 27.710us 0.58% 27.710us 4.618us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 4.771us 0.10% 4.771us 1.590us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 5.05% 240.873us 5.05% 240.873us 80.291us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 58.42% 2.787ms 58.42% 2.787ms 2.787ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.770ms
Self CUDA time total: 3.014ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.45% 127.821us 37.14% 1.937ms 1.937ms 0.000us 0.00% 4.669ms 4.669ms 1
FlashAttnFunc 1.78% 92.961us 34.69% 1.809ms 603.079us 0.000us 0.00% 4.669ms 1.556ms 3
_flash_attn3_48fe103_dirty::fwd 0.98% 50.990us 32.91% 1.716ms 572.092us 3.496ms 100.00% 4.669ms 1.556ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.498ms 100.05% 3.498ms 3.498ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.496ms 100.00% 3.496ms 1.165ms 3
Activity Buffer Request 27.66% 1.443ms 27.66% 1.443ms 1.443ms 1.173ms 33.56% 1.173ms 1.173ms 1
aten::empty 0.56% 28.951us 0.56% 28.951us 4.825us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.09% 4.870us 0.09% 4.870us 1.623us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.62% 188.673us 3.62% 188.673us 62.891us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 62.86% 3.279ms 62.86% 3.279ms 3.279ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.216ms
Self CUDA time total: 3.496ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.26% 115.651us 36.11% 1.844ms 1.844ms 0.000us 0.00% 4.648ms 4.648ms 1
FlashAttnFunc 1.78% 91.130us 33.84% 1.728ms 576.085us 0.000us 0.00% 4.648ms 1.549ms 3
_flash_attn3_48fe103_dirty::fwd 1.06% 54.250us 32.06% 1.637ms 545.708us 3.480ms 100.00% 4.648ms 1.549ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.481ms 100.04% 3.481ms 3.481ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.480ms 100.00% 3.480ms 1.160ms 3
Activity Buffer Request 27.00% 1.379ms 27.00% 1.379ms 1.379ms 1.168ms 33.58% 1.168ms 1.168ms 1
aten::empty 0.55% 28.142us 0.55% 28.142us 4.690us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.261us 0.10% 5.261us 1.754us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.35% 170.883us 3.35% 170.883us 56.961us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 63.89% 3.263ms 63.89% 3.263ms 3.263ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.107ms
Self CUDA time total: 3.480ms
impl wl p50(ms) ok
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.95 True
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.04 True
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.21 True
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.33it/s]
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.66it/s]