# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "xformers",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
import xformers.ops as xops
def xformers_attention(q, k, v):
    """Compute attention with xFormers' ``memory_efficient_attention``.

    The query, key and value tensors are forwarded unchanged (no reshape or
    transpose happens here), so callers must already supply them in the
    layout xFormers expects: [batch, seq_len, heads, head_dim].

    Returns whatever ``xops.memory_efficient_attention`` returns for the
    given inputs.
    """
    attn_out = xops.memory_efficient_attention(query=q, key=k, value=v)
    return attn_out
# Hand the xFormers wrapper to the shared benchmark harness as an attention
# kernel; the name and tags identify this implementation in the results.
run_benchmark(
    impl_func=xformers_attention,
    impl_name="xformers_meff",
    impl_tags={
        "family": "xformers",
        "backend": "memory_efficient",
        "compile": "none",
    },
    kernel_type=KernelTypeEnum.ATTENTION,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 9.51% 455.697us 55.56% 2.663ms 2.663ms 0.000us 0.00% 3.558ms 3.558ms 1
xformers_flash3::flash_fwd 4.08% 195.443us 45.35% 2.174ms 724.544us 0.000us 0.00% 3.558ms 1.186ms 3
flash_attn_3::fwd 1.49% 71.640us 41.28% 1.978ms 659.396us 2.651ms 100.00% 3.558ms 1.186ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.653ms 100.06% 2.653ms 2.653ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.651ms 100.00% 2.651ms 883.711us 3
Activity Buffer Request 37.88% 1.816ms 37.88% 1.816ms 1.816ms 906.719us 34.20% 906.719us 906.719us 1
aten::empty 0.75% 35.911us 0.75% 35.911us 5.985us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.26% 12.331us 0.26% 12.331us 4.110us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.89% 42.730us 0.89% 42.730us 14.243us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.24% 11.531us 0.69% 33.171us 5.529us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.45% 21.640us 0.45% 21.640us 3.607us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 44.44% 2.130ms 44.44% 2.130ms 2.130ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.793ms
Self CUDA time total: 2.651ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.26% 307.825us 49.96% 2.457ms 2.457ms 0.000us 0.00% 3.857ms 3.857ms 1
xformers_flash3::flash_fwd 2.96% 145.722us 43.25% 2.127ms 708.950us 0.000us 0.00% 3.857ms 1.286ms 3
flash_attn_3::fwd 1.03% 50.571us 40.29% 1.981ms 660.376us 2.878ms 100.00% 3.857ms 1.286ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.879ms 100.06% 2.879ms 2.879ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.878ms 100.00% 2.878ms 959.213us 3
Activity Buffer Request 37.86% 1.862ms 37.86% 1.862ms 1.862ms 979.202us 34.03% 979.202us 979.202us 1
aten::empty 0.61% 29.881us 0.61% 29.881us 4.980us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.570us 0.11% 5.570us 1.857us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.67% 33.080us 0.67% 33.080us 11.027us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.18% 8.899us 0.46% 22.400us 3.733us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.27% 13.501us 0.27% 13.501us 2.250us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 50.04% 2.461ms 50.04% 2.461ms 2.461ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.918ms
Self CUDA time total: 2.878ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.21% 306.054us 48.92% 2.410ms 2.410ms 0.000us 0.00% 3.933ms 3.933ms 1
xformers_flash3::flash_fwd 2.99% 147.392us 42.27% 2.082ms 693.957us 0.000us 0.00% 3.933ms 1.311ms 3
flash_attn_3::fwd 1.07% 52.480us 39.27% 1.934ms 644.826us 2.941ms 100.00% 3.933ms 1.311ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.942ms 100.05% 2.942ms 2.942ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.941ms 100.00% 2.941ms 980.234us 3
Activity Buffer Request 36.76% 1.811ms 36.76% 1.811ms 1.811ms 991.807us 33.73% 991.807us 991.807us 1
aten::empty 0.60% 29.531us 0.60% 29.531us 4.922us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.13% 6.550us 0.13% 6.550us 2.183us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.71% 35.120us 0.71% 35.120us 11.707us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.17% 8.281us 0.44% 21.831us 3.638us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.28% 13.550us 0.28% 13.550us 2.258us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 51.08% 2.516ms 51.08% 2.516ms 2.516ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.926ms
Self CUDA time total: 2.941ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.17% 315.944us 50.13% 2.567ms 2.567ms 0.000us 0.00% 4.004ms 4.004ms 1
xformers_flash3::flash_fwd 2.87% 146.993us 43.50% 2.228ms 742.605us 0.000us 0.00% 4.004ms 1.335ms 3
flash_attn_3::fwd 0.96% 49.370us 40.63% 2.081ms 693.607us 2.988ms 100.00% 4.004ms 1.335ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.990ms 100.05% 2.990ms 2.990ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.988ms 100.00% 2.988ms 996.112us 3
Activity Buffer Request 35.27% 1.806ms 35.27% 1.806ms 1.806ms 1.016ms 34.00% 1.016ms 1.016ms 1
aten::empty 0.59% 30.371us 0.59% 30.371us 5.062us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.580us 0.11% 5.580us 1.860us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.69% 189.213us 3.69% 189.213us 63.071us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.19% 9.850us 0.46% 23.640us 3.940us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.27% 13.790us 0.27% 13.790us 2.298us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 49.87% 2.554ms 49.87% 2.554ms 2.554ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.122ms
Self CUDA time total: 2.988ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.38% 306.205us 45.35% 2.581ms 2.581ms 0.000us 0.00% 4.704ms 4.704ms 1
xformers_flash3::flash_fwd 2.54% 144.312us 39.58% 2.253ms 750.894us 0.000us 0.00% 4.704ms 1.568ms 3
flash_attn_3::fwd 0.92% 52.341us 37.04% 2.108ms 702.790us 3.526ms 100.00% 4.704ms 1.568ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.528ms 100.05% 3.528ms 3.528ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.526ms 100.00% 3.526ms 1.175ms 3
Activity Buffer Request 32.26% 1.836ms 32.26% 1.836ms 1.836ms 1.177ms 33.39% 1.177ms 1.177ms 1
aten::empty 0.52% 29.660us 0.52% 29.660us 4.943us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.499us 0.10% 5.499us 1.833us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.24% 184.684us 3.24% 184.684us 61.561us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.15% 8.640us 0.39% 22.430us 3.738us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.24% 13.790us 0.24% 13.790us 2.298us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 54.65% 3.111ms 54.65% 3.111ms 3.111ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.692ms
Self CUDA time total: 3.526ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.52% 307.264us 44.82% 2.494ms 2.494ms 0.000us 0.00% 4.662ms 4.662ms 1
xformers_flash3::flash_fwd 2.63% 146.303us 38.91% 2.164ms 721.461us 0.000us 0.00% 4.662ms 1.554ms 3
flash_attn_3::fwd 0.91% 50.371us 36.28% 2.018ms 672.693us 3.490ms 100.00% 4.662ms 1.554ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.491ms 100.04% 3.491ms 3.491ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.490ms 100.00% 3.490ms 1.163ms 3
Activity Buffer Request 31.37% 1.745ms 31.37% 1.745ms 1.745ms 1.172ms 33.59% 1.172ms 1.172ms 1
aten::empty 0.54% 29.920us 0.54% 29.920us 4.987us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.750us 0.10% 5.750us 1.917us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.36% 187.102us 3.36% 187.102us 62.367us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.15% 8.539us 0.39% 21.890us 3.648us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.24% 13.351us 0.24% 13.351us 2.225us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 55.18% 3.069ms 55.18% 3.069ms 3.069ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.563ms
Self CUDA time total: 3.490ms
impl wl p50(ms) ok
xformers_meff cuda_attn_L128_bfloat16 0.99 True
xformers_meff cuda_attn_L256_bfloat16 1.05 True
xformers_meff cuda_attn_L320_bfloat16 1.09 True
xformers_meff cuda_attn_L384_bfloat16 1.09 True
xformers_meff cuda_attn_L448_bfloat16 1.27 True
xformers_meff cuda_attn_L512_bfloat16 1.28 True
▶ UV Install Logs