# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "xformers",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
import xformers.ops as xops
def xformers_attention(q, k, v):
    """Compute attention with xFormers' ``memory_efficient_attention`` op.

    The query, key and value tensors are forwarded unchanged; no attention
    bias, dropout or explicit operator is supplied, so xFormers falls back to
    its own defaults for those.

    Args:
        q: Query tensor.
        k: Key tensor.
        v: Value tensor.

    Returns:
        The result of ``xops.memory_efficient_attention(q, k, v)``.
    """
    # xFormers expects inputs laid out as [batch, seq_len, heads, head_dim].
    # NOTE(review): assumes the caller already provides that layout, since
    # nothing is transposed or reshaped here -- confirm against the harness.
    attn_out = xops.memory_efficient_attention(q, k, v)
    return attn_out
# Script entry point: hand the xFormers implementation to the shared
# benchmark harness as an attention kernel. The tags are metadata that
# label this implementation (library family, backend, compile mode).
run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_name="xformers_meff",
    impl_tags={
        "family": "xformers",
        "backend": "memory_efficient",
        "compile": "none",
    },
    impl_func=xformers_attention,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 10.73% 481.606us 51.24% 2.299ms 2.299ms 0.000us 0.00% 3.630ms 3.630ms 1
xformers_flash3::flash_fwd 4.33% 194.084us 39.70% 1.781ms 593.782us 0.000us 0.00% 3.630ms 1.210ms 3
flash_attn_3::fwd 1.76% 78.961us 35.37% 1.587ms 529.087us 2.729ms 100.00% 3.630ms 1.210ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.730ms 100.05% 2.730ms 2.730ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.729ms 100.00% 2.729ms 909.588us 3
Activity Buffer Request 31.70% 1.423ms 31.70% 1.423ms 1.423ms 901.535us 33.04% 901.535us 901.535us 1
aten::empty 0.75% 33.761us 0.75% 33.761us 5.627us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.28% 12.380us 0.28% 12.380us 4.127us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.88% 39.570us 0.88% 39.570us 13.190us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.30% 13.520us 0.80% 36.080us 6.013us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.50% 22.560us 0.50% 22.560us 3.760us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 48.76% 2.188ms 48.76% 2.188ms 2.188ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.487ms
Self CUDA time total: 2.729ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 7.10% 312.113us 46.81% 2.059ms 2.059ms 0.000us 0.00% 3.744ms 3.744ms 1
xformers_flash3::flash_fwd 3.88% 170.673us 39.17% 1.723ms 574.405us 0.000us 0.00% 3.744ms 1.248ms 3
flash_attn_3::fwd 1.28% 56.171us 35.29% 1.553ms 517.514us 2.795ms 100.00% 3.744ms 1.248ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.796ms 100.05% 2.796ms 2.796ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.00% 2.795ms 931.630us 3
Activity Buffer Request 32.47% 1.428ms 32.47% 1.428ms 1.428ms 948.729us 33.95% 948.729us 948.729us 1
aten::empty 0.66% 29.091us 0.66% 29.091us 4.848us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.13% 5.590us 0.13% 5.590us 1.863us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.76% 33.440us 0.76% 33.440us 11.147us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.20% 8.951us 0.54% 23.831us 3.972us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.34% 14.880us 0.34% 14.880us 2.480us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 53.19% 2.340ms 53.19% 2.340ms 2.340ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.399ms
Self CUDA time total: 2.795ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.52% 299.466us 45.41% 2.085ms 2.085ms 0.000us 0.00% 3.907ms 3.907ms 1
xformers_flash3::flash_fwd 3.09% 142.061us 38.39% 1.763ms 587.558us 0.000us 0.00% 3.907ms 1.302ms 3
flash_attn_3::fwd 1.15% 53.012us 35.30% 1.621ms 540.204us 2.913ms 100.00% 3.907ms 1.302ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.915ms 100.06% 2.915ms 2.915ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.913ms 100.00% 2.913ms 971.158us 3
Activity Buffer Request 32.68% 1.500ms 32.68% 1.500ms 1.500ms 993.281us 34.09% 993.281us 993.281us 1
aten::empty 0.62% 28.380us 0.62% 28.380us 4.730us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.270us 0.11% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.73% 33.640us 0.73% 33.640us 11.213us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.18% 8.421us 0.49% 22.660us 3.777us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.31% 14.239us 0.31% 14.239us 2.373us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 54.59% 2.507ms 54.59% 2.507ms 2.507ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.591ms
Self CUDA time total: 2.913ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.26% 300.335us 46.54% 2.234ms 2.234ms 0.000us 0.00% 3.980ms 3.980ms 1
xformers_flash3::flash_fwd 3.08% 147.673us 39.81% 1.911ms 637.009us 0.000us 0.00% 3.980ms 1.327ms 3
flash_attn_3::fwd 1.12% 53.571us 36.74% 1.763ms 587.785us 2.981ms 100.00% 3.980ms 1.327ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.982ms 100.05% 2.982ms 2.982ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.981ms 100.00% 2.981ms 993.631us 3
Activity Buffer Request 29.81% 1.431ms 29.81% 1.431ms 1.431ms 999.263us 33.52% 999.263us 999.263us 1
aten::empty 0.60% 28.930us 0.60% 28.930us 4.822us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 5.610us 0.12% 5.610us 1.870us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 5.09% 244.533us 5.09% 244.533us 81.511us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.18% 8.489us 0.47% 22.530us 3.755us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.29% 14.041us 0.29% 14.041us 2.340us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 53.46% 2.566ms 53.46% 2.566ms 2.566ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.800ms
Self CUDA time total: 2.981ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.98% 313.865us 42.05% 2.207ms 2.207ms 0.000us 0.00% 4.635ms 4.635ms 1
xformers_flash3::flash_fwd 2.80% 146.723us 35.63% 1.870ms 623.176us 0.000us 0.00% 4.635ms 1.545ms 3
flash_attn_3::fwd 0.99% 51.861us 32.83% 1.723ms 574.268us 3.467ms 100.00% 4.635ms 1.545ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.05% 3.469ms 3.469ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.467ms 100.00% 3.467ms 1.156ms 3
Activity Buffer Request 27.82% 1.460ms 27.82% 1.460ms 1.460ms 1.168ms 33.68% 1.168ms 1.168ms 1
aten::empty 0.56% 29.260us 0.56% 29.260us 4.877us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 6.040us 0.12% 6.040us 2.013us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.35% 175.903us 3.35% 175.903us 58.634us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.16% 8.638us 0.44% 23.169us 3.862us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.28% 14.531us 0.28% 14.531us 2.422us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 57.95% 3.041ms 57.95% 3.041ms 3.041ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.247ms
Self CUDA time total: 3.467ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 5.97% 309.094us 41.86% 2.166ms 2.166ms 0.000us 0.00% 4.567ms 4.567ms 1
xformers_flash3::flash_fwd 2.75% 142.242us 35.45% 1.834ms 611.405us 0.000us 0.00% 4.567ms 1.522ms 3
flash_attn_3::fwd 1.04% 53.951us 32.70% 1.692ms 563.991us 3.419ms 100.00% 4.567ms 1.522ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.421ms 100.05% 3.421ms 3.421ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.419ms 100.00% 3.419ms 1.140ms 3
Activity Buffer Request 27.74% 1.436ms 27.74% 1.436ms 1.436ms 1.148ms 33.59% 1.148ms 1.148ms 1
aten::empty 0.58% 29.770us 0.58% 29.770us 4.962us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.591us 0.11% 5.591us 1.864us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.23% 167.152us 3.23% 167.152us 55.717us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.16% 8.371us 0.44% 22.751us 3.792us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.28% 14.380us 0.28% 14.380us 2.397us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 58.14% 3.008ms 58.14% 3.008ms 3.008ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.174ms
Self CUDA time total: 3.419ms
impl wl p50(ms) ok
xformers_meff cuda_attn_L128_bfloat16 1.00 True
xformers_meff cuda_attn_L256_bfloat16 1.04 True
xformers_meff cuda_attn_L320_bfloat16 1.09 True
xformers_meff cuda_attn_L384_bfloat16 1.11 True
xformers_meff cuda_attn_L448_bfloat16 1.26 True
xformers_meff cuda_attn_L512_bfloat16 1.25 True
▶ UV Install Logs