# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "xformers",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
import xformers.ops as xops
def xformers_attention(q, k, v):
    """Compute attention with xFormers' memory-efficient kernel.

    The query, key, and value tensors are handed to
    ``xops.memory_efficient_attention`` exactly as received; no other
    arguments are supplied, so the library's defaults are used for
    everything else.

    Args:
        q: Query tensor.
        k: Key tensor.
        v: Value tensor.

    Returns:
        The result of ``xops.memory_efficient_attention(q, k, v)``.
    """
    # xFormers works on the [batch, seq_len, heads, head_dim] layout.
    # Nothing is transposed here, so inputs are assumed to arrive in
    # that layout already -- confirm against the benchmark harness.
    attn_out = xops.memory_efficient_attention(q, k, v)
    return attn_out
# Hand the xFormers implementation to the shared benchmark harness.
# The name and tags identify this implementation in the harness output.
run_benchmark(
    impl_func=xformers_attention,
    impl_name="xformers_meff",
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_tags=dict(
        family="xformers",
        backend="memory_efficient",
        compile="none",
    ),
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 10.73% 480.812us 51.38% 2.302ms 2.302ms 0.000us 0.00% 3.631ms 3.631ms 1
xformers_flash3::flash_fwd 4.61% 206.363us 39.81% 1.783ms 594.453us 0.000us 0.00% 3.631ms 1.210ms 3
flash_attn_3::fwd 1.72% 77.043us 35.21% 1.577ms 525.665us 2.730ms 100.00% 3.631ms 1.210ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.731ms 100.06% 2.731ms 2.731ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.730ms 100.00% 2.730ms 909.864us 3
Activity Buffer Request 31.52% 1.412ms 31.52% 1.412ms 1.412ms 901.213us 33.02% 901.213us 901.213us 1
aten::empty 0.77% 34.510us 0.77% 34.510us 5.752us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.24% 10.880us 0.24% 10.880us 3.627us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.96% 42.842us 0.96% 42.842us 14.281us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.26% 11.610us 0.84% 37.430us 6.238us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.58% 25.820us 0.58% 25.820us 4.303us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 48.62% 2.178ms 48.62% 2.178ms 2.178ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.479ms
Self CUDA time total: 2.730ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 7.14% 318.116us 45.64% 2.033ms 2.033ms 0.000us 0.00% 3.819ms 3.819ms 1
xformers_flash3::flash_fwd 3.43% 153.034us 38.00% 1.693ms 564.339us 0.000us 0.00% 3.819ms 1.273ms 3
flash_attn_3::fwd 1.25% 55.902us 34.56% 1.540ms 513.328us 2.852ms 100.00% 3.819ms 1.273ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.853ms 100.05% 2.853ms 2.853ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.852ms 100.00% 2.852ms 950.587us 3
Activity Buffer Request 31.72% 1.413ms 31.72% 1.413ms 1.413ms 967.259us 33.92% 967.259us 967.259us 1
aten::empty 0.68% 30.270us 0.68% 30.270us 5.045us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.13% 5.700us 0.13% 5.700us 1.900us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.78% 34.811us 0.78% 34.811us 11.604us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.19% 8.522us 0.50% 22.121us 3.687us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.31% 13.599us 0.31% 13.599us 2.266us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 54.36% 2.422ms 54.36% 2.422ms 2.422ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.455ms
Self CUDA time total: 2.852ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.88% 312.747us 44.90% 2.040ms 2.040ms 0.000us 0.00% 3.937ms 3.937ms 1
xformers_flash3::flash_fwd 3.35% 152.284us 37.52% 1.705ms 568.205us 0.000us 0.00% 3.937ms 1.312ms 3
flash_attn_3::fwd 1.19% 54.281us 34.17% 1.552ms 517.444us 2.934ms 100.00% 3.937ms 1.312ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.936ms 100.05% 2.936ms 2.936ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.934ms 100.00% 2.934ms 977.979us 3
Activity Buffer Request 31.39% 1.426ms 31.39% 1.426ms 1.426ms 1.003ms 34.19% 1.003ms 1.003ms 1
aten::empty 0.67% 30.639us 0.67% 30.639us 5.106us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.14% 6.530us 0.14% 6.530us 2.177us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.77% 34.781us 0.77% 34.781us 11.594us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.19% 8.650us 0.49% 22.320us 3.720us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.30% 13.670us 0.30% 13.670us 2.278us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 55.10% 2.503ms 55.10% 2.503ms 2.503ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.543ms
Self CUDA time total: 2.934ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.56% 308.746us 47.29% 2.227ms 2.227ms 0.000us 0.00% 3.897ms 3.897ms 1
xformers_flash3::flash_fwd 3.22% 151.743us 40.27% 1.897ms 632.183us 0.000us 0.00% 3.897ms 1.299ms 3
flash_attn_3::fwd 1.19% 56.081us 37.05% 1.745ms 581.602us 2.911ms 100.00% 3.897ms 1.299ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.913ms 100.05% 2.913ms 2.913ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.911ms 100.00% 2.911ms 970.491us 3
Activity Buffer Request 30.05% 1.415ms 30.05% 1.415ms 1.415ms 985.179us 33.84% 985.179us 985.179us 1
aten::empty 0.65% 30.820us 0.65% 30.820us 5.137us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.13% 6.030us 0.13% 6.030us 2.010us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 5.02% 236.645us 5.02% 236.645us 78.882us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.18% 8.502us 0.47% 22.111us 3.685us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.29% 13.609us 0.29% 13.609us 2.268us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 52.71% 2.482ms 52.71% 2.482ms 2.482ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.710ms
Self CUDA time total: 2.911ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.33% 326.758us 43.32% 2.236ms 2.236ms 0.000us 0.00% 4.559ms 4.559ms 1
xformers_flash3::flash_fwd 3.59% 185.275us 36.53% 1.885ms 628.414us 0.000us 0.00% 4.559ms 1.520ms 3
flash_attn_3::fwd 1.12% 57.990us 32.94% 1.700ms 566.655us 3.412ms 100.00% 4.559ms 1.520ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.413ms 100.05% 3.413ms 3.413ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.412ms 100.00% 3.412ms 1.137ms 3
Activity Buffer Request 27.43% 1.416ms 27.43% 1.416ms 1.416ms 1.147ms 33.63% 1.147ms 1.147ms 1
aten::empty 0.66% 34.131us 0.66% 34.131us 5.688us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 6.360us 0.12% 6.360us 2.120us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.60% 185.845us 3.60% 185.845us 61.948us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.17% 8.790us 0.46% 23.539us 3.923us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.29% 14.749us 0.29% 14.749us 2.458us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 56.68% 2.925ms 56.68% 2.925ms 2.925ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.161ms
Self CUDA time total: 3.412ms
======================================================================
PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
xformers_meff 6.07% 310.905us 43.25% 2.215ms 2.215ms 0.000us 0.00% 4.499ms 4.499ms 1
xformers_flash3::flash_fwd 3.55% 181.844us 36.73% 1.881ms 626.964us 0.000us 0.00% 4.499ms 1.500ms 3
flash_attn_3::fwd 1.14% 58.453us 33.18% 1.699ms 566.349us 3.369ms 100.00% 4.499ms 1.500ms 3
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.371ms 100.06% 3.371ms 3.371ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.369ms 100.00% 3.369ms 1.123ms 3
Activity Buffer Request 27.78% 1.423ms 27.78% 1.423ms 1.423ms 1.130ms 33.54% 1.130ms 1.130ms 1
aten::empty 0.65% 33.340us 0.65% 33.340us 5.557us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.670us 0.11% 5.670us 1.890us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.50% 178.983us 3.50% 178.983us 59.661us 0.000us 0.00% 0.000us 0.000us 3
aten::reshape 0.17% 8.671us 0.45% 22.942us 3.824us 0.000us 0.00% 0.000us 0.000us 6
aten::view 0.28% 14.271us 0.28% 14.271us 2.378us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 56.75% 2.906ms 56.75% 2.906ms 2.906ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.120ms
Self CUDA time total: 3.369ms
impl wl p50(ms) ok
xformers_meff cuda_attn_L128_bfloat16 1.01 True
xformers_meff cuda_attn_L256_bfloat16 1.04 True
xformers_meff cuda_attn_L320_bfloat16 1.10 True
xformers_meff cuda_attn_L384_bfloat16 1.10 True
xformers_meff cuda_attn_L448_bfloat16 1.24 True
xformers_meff cuda_attn_L512_bfloat16 1.24 True
▶ UV Install Logs