# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "xformers",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
import xformers.ops as xops
def xformers_attention(q, k, v):
    """Compute attention via xFormers' memory-efficient attention op.

    Forwards ``q``, ``k`` and ``v`` unchanged to
    ``xops.memory_efficient_attention``, passing no further arguments, and
    returns whatever that call returns.
    """
    # Layout note: xFormers expects [batch, seq_len, heads, head_dim], so the
    # inputs are handed over as-is with no reshaping in this wrapper.
    attn_out = xops.memory_efficient_attention(q, k, v)
    return attn_out
# Hand the xFormers wrapper to the benchmark runner imported from
# kernels_benchmark_tools, labelled as the "xformers_meff" implementation.
run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_name="xformers_meff",
    impl_tags={
        "family": "xformers",
        "backend": "memory_efficient",
        "compile": "none",
    },
    impl_func=xformers_attention,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 9.89% 457.200us 48.78% 2.255ms 2.255ms 0.000us 0.00% 3.820ms 3.820ms 1
xformers_flash3::flash_fwd 3.84% 177.424us 38.10% 1.761ms 587.077us 0.000us 0.00% 3.820ms 1.273ms 3
flash_attn_3::fwd 1.55% 71.862us 34.26% 1.584ms 527.935us 2.885ms 100.00% 3.820ms 1.273ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.886ms 100.04% 2.886ms 2.886ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.885ms 100.00% 2.885ms 961.658us 3
Activity Buffer Request 30.73% 1.420ms 30.73% 1.420ms 1.420ms 934.553us 32.39% 934.553us 934.553us 1
aten::empty 0.74% 34.201us 0.74% 34.201us 5.700us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.22% 10.110us 0.22% 10.110us 3.370us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.02% 47.230us 1.02% 47.230us 15.743us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.34% 15.510us 0.79% 36.581us 6.097us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.46% 21.071us 0.46% 21.071us 3.512us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 51.22% 2.368ms 51.22% 2.368ms 2.368ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.623ms
Self CUDA time total: 2.885ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.56% 301.335us 45.12% 2.073ms 2.073ms 0.000us 0.00% 3.862ms 3.862ms 1
xformers_flash3::flash_fwd 3.02% 138.865us 38.04% 1.748ms 582.607us 0.000us 0.00% 3.862ms 1.287ms 3
flash_attn_3::fwd 1.15% 53.013us 35.02% 1.609ms 536.319us 2.932ms 100.00% 3.862ms 1.287ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.933ms 100.04% 2.933ms 2.933ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.00% 2.932ms 977.308us 3
Activity Buffer Request 32.36% 1.487ms 32.36% 1.487ms 1.487ms 930.332us 31.73% 930.332us 930.332us 1
aten::empty 0.65% 29.679us 0.65% 29.679us 4.946us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 5.591us 0.12% 5.591us 1.864us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.74% 34.170us 0.74% 34.170us 11.390us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.22% 9.881us 0.51% 23.631us 3.938us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.30% 13.750us 0.30% 13.750us 2.292us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 54.88% 2.521ms 54.88% 2.521ms 2.521ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.594ms
Self CUDA time total: 2.932ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.47% 295.057us 44.36% 2.024ms 2.024ms 0.000us 0.00% 3.906ms 3.906ms 1
xformers_flash3::flash_fwd 3.08% 140.693us 37.39% 1.706ms 568.676us 0.000us 0.00% 3.906ms 1.302ms 3
flash_attn_3::fwd 1.15% 52.641us 34.31% 1.565ms 521.779us 2.948ms 100.00% 3.906ms 1.302ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.949ms 100.05% 2.949ms 2.949ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.948ms 100.00% 2.948ms 982.658us 3
Activity Buffer Request 31.65% 1.444ms 31.65% 1.444ms 1.444ms 958.263us 32.51% 958.263us 958.263us 1
aten::empty 0.65% 29.440us 0.65% 29.440us 4.907us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 5.511us 0.12% 5.511us 1.837us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.74% 33.911us 0.74% 33.911us 11.304us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.18% 8.109us 0.50% 22.850us 3.808us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.32% 14.741us 0.32% 14.741us 2.457us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 55.64% 2.539ms 55.64% 2.539ms 2.539ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.562ms
Self CUDA time total: 2.948ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.44% 300.857us 47.49% 2.217ms 2.217ms 0.000us 0.00% 3.827ms 3.827ms 1
xformers_flash3::flash_fwd 3.16% 147.703us 40.53% 1.892ms 630.694us 0.000us 0.00% 3.827ms 1.276ms 3
flash_attn_3::fwd 1.13% 52.820us 37.36% 1.744ms 581.460us 2.874ms 100.00% 3.827ms 1.276ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.876ms 100.05% 2.876ms 2.876ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.874ms 100.00% 2.874ms 958.161us 3
Activity Buffer Request 30.85% 1.440ms 30.85% 1.440ms 1.440ms 952.124us 33.12% 952.124us 952.124us 1
aten::empty 0.63% 29.391us 0.63% 29.391us 4.899us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.13% 5.930us 0.13% 5.930us 1.977us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 4.63% 215.955us 4.63% 215.955us 71.985us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.22% 10.380us 0.51% 23.940us 3.990us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.29% 13.560us 0.29% 13.560us 2.260us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 52.51% 2.452ms 52.51% 2.452ms 2.452ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.669ms
Self CUDA time total: 2.874ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.75% 298.955us 42.23% 2.194ms 2.194ms 0.000us 0.00% 4.560ms 4.560ms 1
xformers_flash3::flash_fwd 2.73% 142.094us 36.04% 1.872ms 624.074us 0.000us 0.00% 4.560ms 1.520ms 3
flash_attn_3::fwd 1.06% 54.881us 33.30% 1.730ms 576.710us 3.413ms 100.00% 4.560ms 1.520ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.415ms 100.04% 3.415ms 3.415ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.413ms 100.00% 3.413ms 1.138ms 3
Activity Buffer Request 27.56% 1.432ms 27.56% 1.432ms 1.432ms 1.147ms 33.59% 1.147ms 1.147ms 1
aten::empty 0.56% 28.860us 0.56% 28.860us 4.810us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.420us 0.10% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 4.02% 208.865us 4.02% 208.865us 69.622us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.18% 9.222us 0.44% 22.901us 3.817us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.26% 13.679us 0.26% 13.679us 2.280us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 57.77% 3.001ms 57.77% 3.001ms 3.001ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.196ms
Self CUDA time total: 3.413ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.27% 272.556us 42.19% 2.184ms 2.184ms 0.000us 0.00% 4.536ms 4.536ms 1
xformers_flash3::flash_fwd 2.70% 139.942us 36.49% 1.889ms 629.618us 0.000us 0.00% 4.536ms 1.512ms 3
flash_attn_3::fwd 1.02% 52.981us 33.79% 1.749ms 582.970us 3.398ms 100.00% 4.536ms 1.512ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.400ms 100.05% 3.400ms 3.400ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.398ms 100.00% 3.398ms 1.133ms 3
Activity Buffer Request 28.10% 1.454ms 28.10% 1.454ms 1.454ms 1.138ms 33.49% 1.138ms 1.138ms 1
aten::empty 0.56% 28.991us 0.56% 28.991us 4.832us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.511us 0.11% 5.511us 1.837us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 4.00% 207.225us 4.00% 207.225us 69.075us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.17% 8.891us 0.44% 22.532us 3.755us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.26% 13.641us 0.26% 13.641us 2.274us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 57.81% 2.992ms 57.81% 2.992ms 2.992ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.176ms
Self CUDA time total: 3.398ms
impl wl p50(ms) ok
xformers_meff cuda_attn_L128_bfloat16 0.99 True
xformers_meff cuda_attn_L256_bfloat16 1.05 True
xformers_meff cuda_attn_L320_bfloat16 1.06 True
xformers_meff cuda_attn_L384_bfloat16 1.06 True
xformers_meff cuda_attn_L448_bfloat16 1.23 True
xformers_meff cuda_attn_L512_bfloat16 1.23 True
▶ UV Install Logs