# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels",
# "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Load the layer norm kernel.
# This runs once at module import; `hf_kernels_layer_norm` below reuses the
# resulting module-level object on every call instead of reloading it.
# NOTE(review): presumably `get_kernel` fetches the compiled kernel for the
# "kernels-community/layer-norm" repo id from the Hugging Face Hub — confirm.
layer_norm_kernel = get_kernel("kernels-community/layer-norm")
def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
    """Apply layer normalization to ``x`` using the loaded layer-norm kernel.

    ``x`` is unpacked as a 3-D tensor ``(batch, seq_len, hidden)`` and
    flattened to 2-D before being handed to the kernel's
    ``dropout_add_ln_fwd`` entry point. That entry point is called with
    dropout disabled (``dropout_p=0.0``), no row/column scaling, no subsets,
    and ``is_rms_norm=False``.

    Args:
        x: Input tensor; must have exactly three dimensions.
        weight: Forwarded to the kernel as ``gamma``.
        bias: Forwarded to the kernel as ``beta``.
        eps: Forwarded to the kernel as ``epsilon``.

    Returns:
        The first element of the kernel's result, viewed back to the
        three-dimensional shape of ``x``.
    """
    batch, seq_len, hidden = x.shape

    fused_ln_fwd = layer_norm_kernel.dropout_add_ln_fwd

    # The kernel expects [N, D] input, so batch and sequence are merged into
    # a single row dimension.
    flat_input = x.view(-1, hidden)

    kernel_outputs = fused_ln_fwd(
        input=flat_input,
        gamma=weight,
        beta=bias,
        rowscale=None,
        colscale=None,
        x0_subset=None,
        z_subset=None,
        dropout_p=0.0,
        epsilon=eps,
        rowscale_const=1.0,
        z_numrows=seq_len,
        gen=None,
        residual_in_fp32=False,
        is_rms_norm=False,
    )

    # NOTE(review): only element 0 of the result is used; presumably it is
    # the normalized output and the rest are auxiliary tensors — confirm.
    normalized = kernel_outputs[0]
    return normalized.view(batch, seq_len, hidden)
# Hand this implementation to the shared benchmark harness under the
# LAYER_NORM kernel type; the tags identify the implementation in results.
run_benchmark(
    kernel_type=KernelTypeEnum.LAYER_NORM,
    impl_name="hf_kernels_layer_norm",
    impl_tags={
        "family": "hf-kernels",
        "repo": "kernels-community/layer-norm",
        "op": "layer_norm",
    },
    impl_func=hf_kernels_layer_norm,
)
Running layer_norm benchmark on cuda with 4 workloads.
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 5.01% 203.177us 46.78% 1.895ms 1.895ms 0.000us 0.00% 3.141ms 3.141ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 1.71% 69.312us 41.16% 1.668ms 555.914us 2.399ms 100.00% 3.141ms 1.047ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.401ms 100.06% 2.401ms 2.401ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.399ms 100.00% 2.399ms 799.825us 3
Activity Buffer Request 36.95% 1.497ms 36.95% 1.497ms 1.497ms 742.012us 30.92% 742.012us 742.012us 1
aten::view 0.61% 24.559us 0.61% 24.559us 4.093us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.20% 48.622us 1.20% 48.622us 5.402us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.23% 9.170us 0.23% 9.170us 3.057us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.07% 43.390us 1.07% 43.390us 14.463us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 53.22% 2.156ms 53.22% 2.156ms 2.156ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.052ms
Self CUDA time total: 2.399ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 1.88% 119.443us 26.75% 1.701ms 1.701ms 0.000us 0.00% 6.407ms 6.407ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.71% 45.121us 24.67% 1.568ms 522.677us 4.827ms 100.00% 6.407ms 2.136ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.829ms 100.03% 4.829ms 4.829ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.827ms 100.00% 4.827ms 1.609ms 3
Activity Buffer Request 22.91% 1.456ms 22.91% 1.456ms 1.456ms 1.580ms 32.72% 1.580ms 1.580ms 1
aten::view 0.21% 13.200us 0.21% 13.200us 2.200us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.51% 32.711us 0.51% 32.711us 3.635us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.289us 0.08% 5.289us 1.763us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.45% 28.522us 0.45% 28.522us 9.507us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 73.25% 4.656ms 73.25% 4.656ms 4.656ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.357ms
Self CUDA time total: 4.827ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 1.89% 118.801us 26.85% 1.686ms 1.686ms 0.000us 0.00% 6.309ms 6.309ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.78% 49.183us 24.77% 1.555ms 518.493us 4.763ms 100.00% 6.309ms 2.103ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.765ms 100.03% 4.765ms 4.765ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.763ms 100.00% 4.763ms 1.588ms 3
Activity Buffer Request 22.96% 1.442ms 22.96% 1.442ms 1.442ms 1.546ms 32.46% 1.546ms 1.546ms 1
aten::view 0.19% 11.741us 0.19% 11.741us 1.957us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.49% 30.460us 0.49% 30.460us 3.384us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.920us 0.08% 4.920us 1.640us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.46% 29.050us 0.46% 29.050us 9.683us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 73.15% 4.593ms 73.15% 4.593ms 4.593ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.279ms
Self CUDA time total: 4.763ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 1.11% 112.814us 7.31% 743.908us 743.908us 0.000us 0.00% 12.737ms 12.737ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.47% 47.722us 6.09% 619.105us 206.368us 9.594ms 100.00% 12.737ms 4.246ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.595ms 100.02% 9.595ms 9.595ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.594ms 100.00% 9.594ms 3.198ms 3
Activity Buffer Request 2.50% 254.176us 2.50% 254.176us 254.176us 3.143ms 32.76% 3.143ms 3.143ms 1
aten::view 0.12% 11.989us 0.12% 11.989us 1.998us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.30% 30.280us 0.30% 30.280us 3.364us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.000us 0.05% 5.000us 1.667us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 2.77% 281.927us 2.77% 281.927us 93.976us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 92.69% 9.430ms 92.69% 9.430ms 9.430ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 10.174ms
Self CUDA time total: 9.594ms
impl wl p50(ms) ok
hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
hf_kernels_layer_norm LN_B16_S4096_D4096 1.66 True
hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True
▶ UV Install Logs
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.22it/s]
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.45it/s]