▼ code
▼ output
▶ uv-logs
|
Cell: benchmark | 6.33s
|
Raw
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels",
# "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Load the layer norm kernel once at import time so every benchmark call below
# reuses the same module object. The repo id is "kernels-community/layer-norm"
# (presumably resolved by `kernels.get_kernel` from the Hugging Face Hub — confirm).
layer_norm_kernel = get_kernel("kernels-community/layer-norm")
def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
    """Run layer norm on a 3-D tensor through the loaded ``layer_norm_kernel``.

    ``x.shape`` is unpacked into exactly three dimensions (B, S, D), so any
    other rank raises on unpacking. The tensor is viewed as 2-D ``[B*S, D]``
    because the kernel expects ``[N, D]`` input, and the first element of the
    kernel's result is viewed back to ``(B, S, D)``.

    Args:
        x: Input tensor with three dimensions; must be viewable as ``[-1, D]``.
        weight: Passed to the kernel as ``gamma``.
        bias: Passed to the kernel as ``beta``.
        eps: Passed to the kernel as ``epsilon``.

    Returns:
        Element 0 of the kernel's result, viewed with the input's
        ``(B, S, D)`` shape.
    """
    batch, seq_len, hidden = x.shape

    # The kernel expects [N, D] input, so fold the two leading dims together.
    flat_input = x.view(-1, hidden)

    # Only the plain layer-norm path is exercised: no dropout, no row/column
    # scaling, no subsetting, and is_rms_norm is off.
    kernel_results = layer_norm_kernel.dropout_add_ln_fwd(
        input=flat_input,
        gamma=weight,
        beta=bias,
        rowscale=None,
        colscale=None,
        x0_subset=None,
        z_subset=None,
        dropout_p=0.0,
        epsilon=eps,
        rowscale_const=1.0,
        # NOTE(review): S is passed here although the flattened input has B*S
        # rows; presumably ignored when z_subset is None — confirm.
        z_numrows=seq_len,
        gen=None,
        residual_in_fp32=False,
        is_rms_norm=False,
    )

    normalized = kernel_results[0]
    return normalized.view(batch, seq_len, hidden)
# Metadata attached to this implementation's benchmark results.
benchmark_tags = {
    "family": "hf-kernels",
    "repo": "kernels-community/layer-norm",
    "op": "layer_norm",
}

# Hand the wrapper to the shared benchmark harness for the layer-norm workloads.
run_benchmark(
    kernel_type=KernelTypeEnum.LAYER_NORM,
    impl_name="hf_kernels_layer_norm",
    impl_tags=benchmark_tags,
    impl_func=hf_kernels_layer_norm,
)
Running layer_norm benchmark on cuda with 48 workloads.
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S128_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 126.624us 1327.85% 126.624us 126.624us 1
hf_kernels_layer_norm 10.50% 192.054us 99.63% 1.822ms 1.822ms 0.000us 0.00% 12.800us 12.800us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 3.73% 68.149us 87.79% 1.605ms 535.007us 9.536us 100.00% 12.800us 4.267us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.536us 100.00% 9.536us 3.179us 3
Activity Buffer Request 78.93% 1.443ms 78.93% 1.443ms 1.443ms 3.264us 34.23% 3.264us 3.264us 1
aten::view 1.34% 24.540us 1.34% 24.540us 4.090us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 2.50% 45.632us 2.50% 45.632us 5.070us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.52% 9.500us 0.52% 9.500us 3.167us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 2.11% 38.660us 2.11% 38.660us 12.887us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.37% 6.690us 0.37% 6.690us 6.690us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.828ms
Self CUDA time total: 9.536us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S128_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 118.975us 960.72% 118.975us 118.975us 1
hf_kernels_layer_norm 8.90% 155.923us 99.67% 1.747ms 1.747ms 0.000us 0.00% 16.576us 16.576us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.71% 47.470us 90.07% 1.579ms 526.204us 12.384us 100.00% 16.576us 5.525us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 12.384us 100.00% 12.384us 4.128us 3
Activity Buffer Request 83.60% 1.465ms 83.60% 1.465ms 1.465ms 4.192us 33.85% 4.192us 4.192us 1
aten::view 0.71% 12.400us 0.71% 12.400us 2.067us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.73% 30.340us 1.73% 30.340us 3.371us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.28% 4.970us 0.28% 4.970us 1.657us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.74% 30.551us 1.74% 30.551us 10.184us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.33% 5.780us 0.33% 5.780us 5.780us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.753ms
Self CUDA time total: 12.384us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S128_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 109.887us 1003.99% 109.887us 109.887us 1
hf_kernels_layer_norm 7.66% 143.860us 99.71% 1.872ms 1.872ms 0.000us 0.00% 14.626us 14.626us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.49% 46.702us 91.41% 1.716ms 571.882us 10.945us 100.00% 14.626us 4.875us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 10.945us 100.00% 10.945us 3.648us 3
Activity Buffer Request 85.70% 1.609ms 85.70% 1.609ms 1.609ms 3.681us 33.63% 3.681us 3.681us 1
aten::view 0.64% 12.051us 0.64% 12.051us 2.008us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.56% 29.239us 1.56% 29.239us 3.249us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.26% 4.870us 0.26% 4.870us 1.623us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.40% 26.311us 1.40% 26.311us 8.770us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.29% 5.350us 0.29% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.877ms
Self CUDA time total: 10.945us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S128_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 120.287us 916.82% 120.287us 120.287us 1
hf_kernels_layer_norm 7.38% 148.710us 99.70% 2.008ms 2.008ms 0.000us 0.00% 17.504us 17.504us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.28% 45.984us 91.73% 1.848ms 615.912us 13.120us 100.00% 17.504us 5.835us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 13.120us 100.00% 13.120us 4.373us 3
Activity Buffer Request 71.87% 1.448ms 71.87% 1.448ms 1.448ms 4.384us 33.41% 4.384us 4.384us 1
aten::view 0.60% 12.011us 0.60% 12.011us 2.002us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.48% 29.740us 1.48% 29.740us 3.304us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.26% 5.319us 0.26% 5.319us 1.773us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 15.83% 318.904us 15.83% 318.904us 106.301us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.30% 5.970us 0.30% 5.970us 5.970us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.014ms
Self CUDA time total: 13.120us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S512_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 114.559us 1189.48% 114.559us 114.559us 1
hf_kernels_layer_norm 7.21% 135.832us 99.75% 1.879ms 1.879ms 0.000us 0.00% 12.767us 12.767us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.53% 47.731us 91.89% 1.731ms 576.915us 9.631us 100.00% 12.767us 4.256us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.631us 100.00% 9.631us 3.210us 3
Activity Buffer Request 78.55% 1.480ms 78.55% 1.480ms 1.480ms 3.136us 32.56% 3.136us 3.136us 1
aten::view 0.65% 12.210us 0.65% 12.210us 2.035us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.55% 29.201us 1.55% 29.201us 3.245us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.26% 4.830us 0.26% 4.830us 1.610us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 9.00% 169.482us 9.00% 169.482us 56.494us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.25% 4.770us 0.25% 4.770us 4.770us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.884ms
Self CUDA time total: 9.631us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S512_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 117.151us 841.66% 117.151us 117.151us 1
hf_kernels_layer_norm 7.38% 134.703us 99.74% 1.819ms 1.819ms 0.000us 0.00% 18.495us 18.495us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.52% 45.930us 91.68% 1.673ms 557.511us 13.919us 100.00% 18.495us 6.165us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 13.919us 100.00% 13.919us 4.640us 3
Activity Buffer Request 78.70% 1.436ms 78.70% 1.436ms 1.436ms 4.576us 32.88% 4.576us 4.576us 1
aten::view 0.67% 12.200us 0.67% 12.200us 2.033us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.63% 29.679us 1.63% 29.679us 3.298us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.30% 5.450us 0.30% 5.450us 1.817us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 8.54% 155.763us 8.54% 155.763us 51.921us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.26% 4.800us 0.26% 4.800us 4.800us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.824ms
Self CUDA time total: 13.919us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S512_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 121.982us 816.32% 121.982us 121.982us 1
hf_kernels_layer_norm 7.42% 137.921us 99.71% 1.853ms 1.853ms 0.000us 0.00% 19.934us 19.934us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.51% 46.641us 91.61% 1.702ms 567.498us 14.943us 100.00% 19.934us 6.645us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 14.943us 100.00% 14.943us 4.981us 3
Activity Buffer Request 78.68% 1.462ms 78.68% 1.462ms 1.462ms 4.991us 33.40% 4.991us 4.991us 1
aten::view 0.68% 12.581us 0.68% 12.581us 2.097us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.61% 30.011us 1.61% 30.011us 3.335us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.26% 4.880us 0.26% 4.880us 1.627us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 8.55% 158.912us 8.55% 158.912us 52.971us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.29% 5.320us 0.29% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.858ms
Self CUDA time total: 14.943us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S512_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 122.336us 491.39% 122.336us 122.336us 1
hf_kernels_layer_norm 7.27% 134.311us 99.73% 1.842ms 1.842ms 0.000us 0.00% 33.152us 33.152us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.48% 45.720us 91.77% 1.695ms 564.845us 24.896us 100.00% 33.152us 11.051us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 24.896us 100.00% 24.896us 8.299us 3
Activity Buffer Request 78.89% 1.457ms 78.89% 1.457ms 1.457ms 8.256us 33.16% 8.256us 8.256us 1
aten::view 0.69% 12.770us 0.69% 12.770us 2.128us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.64% 30.291us 1.64% 30.291us 3.366us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.28% 5.131us 0.28% 5.131us 1.710us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 8.48% 156.672us 8.48% 156.672us 52.224us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.27% 4.950us 0.27% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.847ms
Self CUDA time total: 24.896us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S1024_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 112.508us 1085.25% 112.508us 112.508us 1
hf_kernels_layer_norm 20.69% 103.551us 99.03% 495.767us 495.767us 0.000us 0.00% 13.759us 13.759us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 9.55% 47.810us 76.09% 380.926us 126.975us 10.367us 100.00% 13.759us 4.586us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 10.367us 100.00% 10.367us 3.456us 3
Activity Buffer Request 28.93% 144.803us 28.93% 144.803us 144.803us 3.392us 32.72% 3.392us 3.392us 1
aten::view 2.26% 11.290us 2.26% 11.290us 1.882us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 5.78% 28.941us 5.78% 28.941us 3.216us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.18% 5.889us 1.18% 5.889us 1.963us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 30.66% 153.483us 30.66% 153.483us 51.161us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.97% 4.840us 0.97% 4.840us 4.840us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 500.607us
Self CUDA time total: 10.367us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S1024_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 114.622us 709.29% 114.622us 114.622us 1
hf_kernels_layer_norm 17.15% 104.082us 99.15% 601.769us 601.769us 0.000us 0.00% 21.536us 21.536us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 7.36% 44.690us 80.00% 485.537us 161.846us 16.160us 100.00% 21.536us 7.179us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 16.160us 100.00% 16.160us 5.387us 3
Activity Buffer Request 41.13% 249.624us 41.13% 249.624us 249.624us 5.376us 33.27% 5.376us 5.376us 1
aten::view 2.00% 12.150us 2.00% 12.150us 2.025us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 4.85% 29.441us 4.85% 29.441us 3.271us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.88% 5.329us 0.88% 5.329us 1.776us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 25.78% 156.453us 25.78% 156.453us 52.151us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.85% 5.140us 0.85% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 606.909us
Self CUDA time total: 16.160us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S1024_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 116.126us 544.07% 116.126us 116.126us 1
hf_kernels_layer_norm 21.73% 103.750us 98.94% 472.437us 472.437us 0.000us 0.00% 28.448us 28.448us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 9.81% 46.840us 74.86% 357.435us 119.145us 21.344us 100.00% 28.448us 9.483us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 21.344us 100.00% 21.344us 7.115us 3
Activity Buffer Request 25.46% 121.562us 25.46% 121.562us 121.562us 7.104us 33.28% 7.104us 7.104us 1
aten::view 2.36% 11.252us 2.36% 11.252us 1.875us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 6.20% 29.622us 6.20% 29.622us 3.291us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.03% 4.929us 1.03% 4.929us 1.643us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 32.35% 154.482us 32.35% 154.482us 51.494us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 1.06% 5.060us 1.06% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 477.497us
Self CUDA time total: 21.344us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S1024_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 123.005us 198.35% 123.005us 123.005us 1
hf_kernels_layer_norm 17.67% 104.362us 99.18% 585.739us 585.739us 0.000us 0.00% 97.950us 97.950us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 7.69% 45.431us 79.53% 469.697us 156.566us 62.015us 100.00% 97.950us 32.650us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 62.015us 100.00% 62.015us 20.672us 3
Activity Buffer Request 38.94% 229.994us 38.94% 229.994us 229.994us 35.935us 57.95% 35.935us 35.935us 1
aten::view 1.98% 11.680us 1.98% 11.680us 1.947us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 4.96% 29.301us 4.96% 29.301us 3.256us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.94% 5.530us 0.94% 5.530us 1.843us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 27.00% 159.441us 27.00% 159.441us 53.147us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.82% 4.870us 0.82% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 590.609us
Self CUDA time total: 62.015us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S2048_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 112.925us 880.03% 112.925us 112.925us 1
hf_kernels_layer_norm 21.36% 101.251us 98.99% 469.286us 469.286us 0.000us 0.00% 17.152us 17.152us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 9.95% 47.161us 75.23% 356.625us 118.875us 12.832us 100.00% 17.152us 5.717us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 12.832us 100.00% 12.832us 4.277us 3
Activity Buffer Request 24.52% 116.222us 24.52% 116.222us 116.222us 4.320us 33.67% 4.320us 4.320us 1
aten::view 2.41% 11.410us 2.41% 11.410us 1.902us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 6.33% 30.000us 6.33% 30.000us 3.333us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.99% 4.690us 0.99% 4.690us 1.563us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 33.44% 158.552us 33.44% 158.552us 52.851us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 1.01% 4.791us 1.01% 4.791us 4.791us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 474.077us
Self CUDA time total: 12.832us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S2048_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 116.894us 456.05% 116.894us 116.894us 1
hf_kernels_layer_norm 16.78% 104.390us 99.21% 617.040us 617.040us 0.000us 0.00% 34.336us 34.336us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 7.67% 47.682us 80.57% 501.128us 167.043us 25.632us 100.00% 34.336us 11.445us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 25.632us 100.00% 25.632us 8.544us 3
Activity Buffer Request 42.51% 264.394us 42.51% 264.394us 264.394us 8.704us 33.96% 8.704us 8.704us 1
aten::view 1.85% 11.522us 1.85% 11.522us 1.920us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 4.71% 29.300us 4.71% 29.300us 3.256us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.84% 5.220us 0.84% 5.220us 1.740us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 24.85% 154.532us 24.85% 154.532us 51.511us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.79% 4.910us 0.79% 4.910us 4.910us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 621.950us
Self CUDA time total: 25.632us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S2048_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 123.901us 207.17% 123.901us 123.901us 1
hf_kernels_layer_norm 17.03% 105.700us 99.25% 616.179us 616.179us 0.000us 0.00% 95.452us 95.452us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 7.57% 46.994us 80.35% 498.838us 166.279us 59.805us 100.00% 95.452us 31.817us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 59.805us 100.00% 59.805us 19.935us 3
Activity Buffer Request 42.09% 261.283us 42.09% 261.283us 261.283us 35.647us 59.61% 35.647us 35.647us 1
aten::view 1.88% 11.641us 1.88% 11.641us 1.940us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 4.84% 30.020us 4.84% 30.020us 3.336us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.92% 5.739us 0.92% 5.739us 1.913us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 24.93% 154.802us 24.93% 154.802us 51.601us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.75% 4.650us 0.75% 4.650us 4.650us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 620.829us
Self CUDA time total: 59.805us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B1_S2048_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 20.93% 115.170us 99.06% 545.227us 545.227us 0.000us 0.00% 194.686us 194.686us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 8.82% 48.552us 75.83% 417.326us 139.109us 120.767us 100.00% 194.686us 64.895us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 137.247us 113.65% 137.247us 137.247us 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 120.767us 100.00% 120.767us 40.256us 3
Activity Buffer Request 31.56% 173.672us 31.56% 173.672us 173.672us 73.919us 61.21% 73.919us 73.919us 1
aten::view 2.31% 12.731us 2.31% 12.731us 2.122us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 5.79% 31.840us 5.79% 31.840us 3.538us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.15% 6.350us 1.15% 6.350us 2.117us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 28.51% 156.912us 28.51% 156.912us 52.304us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.94% 5.151us 0.94% 5.151us 5.151us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 550.378us
Self CUDA time total: 120.767us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S128_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 120.958us 1277.01% 120.958us 120.958us 1
hf_kernels_layer_norm 13.96% 126.333us 99.48% 900.293us 900.293us 0.000us 0.00% 12.480us 12.480us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 5.25% 47.490us 84.03% 760.450us 253.483us 9.472us 100.00% 12.480us 4.160us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.472us 100.00% 9.472us 3.157us 3
Activity Buffer Request 56.99% 515.778us 56.99% 515.778us 515.778us 3.008us 31.76% 3.008us 3.008us 1
aten::view 1.49% 13.510us 1.49% 13.510us 2.252us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 3.30% 29.900us 3.30% 29.900us 3.322us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.61% 5.520us 0.61% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 17.87% 161.762us 17.87% 161.762us 53.921us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.52% 4.731us 0.52% 4.731us 4.731us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 905.024us
Self CUDA time total: 9.472us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S128_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 119.647us 905.32% 119.647us 119.647us 1
hf_kernels_layer_norm 7.02% 129.983us 99.72% 1.846ms 1.846ms 0.000us 0.00% 17.632us 17.632us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.48% 45.879us 92.05% 1.704ms 568.058us 13.216us 100.00% 17.632us 5.877us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 13.216us 100.00% 13.216us 4.405us 3
Activity Buffer Request 79.30% 1.468ms 79.30% 1.468ms 1.468ms 4.416us 33.41% 4.416us 4.416us 1
aten::view 0.65% 12.030us 0.65% 12.030us 2.005us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.60% 29.701us 1.60% 29.701us 3.300us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.33% 6.090us 0.33% 6.090us 2.030us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 8.34% 154.332us 8.34% 154.332us 51.444us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.28% 5.130us 0.28% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.851ms
Self CUDA time total: 13.216us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S128_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 119.904us 814.57% 119.904us 119.904us 1
hf_kernels_layer_norm 6.96% 128.481us 99.73% 1.842ms 1.842ms 0.000us 0.00% 19.648us 19.648us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.56% 47.250us 92.11% 1.701ms 566.981us 14.720us 100.00% 19.648us 6.549us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 14.720us 100.00% 14.720us 4.907us 3
Activity Buffer Request 79.23% 1.463ms 79.23% 1.463ms 1.463ms 4.928us 33.48% 4.928us 4.928us 1
aten::view 0.66% 12.121us 0.66% 12.121us 2.020us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.62% 29.881us 1.62% 29.881us 3.320us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.34% 6.300us 0.34% 6.300us 2.100us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 8.36% 154.452us 8.36% 154.452us 51.484us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.27% 5.031us 0.27% 5.031us 5.031us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.847ms
Self CUDA time total: 14.720us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S128_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 123.838us 511.90% 123.838us 123.838us 1
hf_kernels_layer_norm 6.93% 126.950us 99.73% 1.827ms 1.827ms 0.000us 0.00% 32.224us 32.224us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.51% 46.080us 92.13% 1.688ms 562.698us 24.192us 100.00% 32.224us 10.741us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 24.192us 100.00% 24.192us 8.064us 3
Activity Buffer Request 79.12% 1.450ms 79.12% 1.450ms 1.450ms 8.032us 33.20% 8.032us 8.032us 1
aten::view 0.67% 12.241us 0.67% 12.241us 2.040us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.67% 30.641us 1.67% 30.641us 3.405us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.33% 5.980us 0.33% 5.980us 1.993us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 8.50% 155.772us 8.50% 155.772us 51.924us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.27% 4.990us 0.27% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.832ms
Self CUDA time total: 24.192us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S512_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 114.752us 903.27% 114.752us 114.752us 1
hf_kernels_layer_norm 6.98% 127.002us 99.74% 1.816ms 1.816ms 0.000us 0.00% 16.896us 16.896us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.46% 44.721us 92.11% 1.677ms 559.031us 12.704us 100.00% 16.896us 5.632us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 12.704us 100.00% 12.704us 4.235us 3
Activity Buffer Request 79.42% 1.446ms 79.42% 1.446ms 1.446ms 4.192us 33.00% 4.192us 4.192us 1
aten::view 0.65% 11.810us 0.65% 11.810us 1.968us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.61% 29.350us 1.61% 29.350us 3.261us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.30% 5.480us 0.30% 5.480us 1.827us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 8.33% 151.582us 8.33% 151.582us 50.527us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.26% 4.810us 0.26% 4.810us 4.810us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.821ms
Self CUDA time total: 12.704us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S512_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 114.171us 434.06% 114.171us 114.171us 1
hf_kernels_layer_norm 21.27% 106.031us 98.93% 493.167us 493.167us 0.000us 0.00% 35.134us 35.134us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 8.94% 44.581us 75.39% 375.835us 125.278us 26.303us 100.00% 35.134us 11.711us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 26.303us 100.00% 26.303us 8.768us 3
Activity Buffer Request 28.70% 143.052us 28.70% 143.052us 143.052us 8.831us 33.57% 8.831us 8.831us 1
aten::view 2.27% 11.301us 2.27% 11.301us 1.883us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 5.86% 29.220us 5.86% 29.220us 3.247us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.95% 4.720us 0.95% 4.720us 1.573us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 30.95% 154.262us 30.95% 154.262us 51.421us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 1.07% 5.331us 1.07% 5.331us 5.331us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 498.498us
Self CUDA time total: 26.303us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S512_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 126.302us 214.16% 126.302us 126.302us 1
hf_kernels_layer_norm 6.77% 126.701us 99.74% 1.866ms 1.866ms 0.000us 0.00% 94.496us 94.496us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.55% 47.732us 92.27% 1.726ms 575.432us 58.976us 100.00% 94.496us 31.499us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 58.976us 100.00% 58.976us 19.659us 3
Activity Buffer Request 79.36% 1.485ms 79.36% 1.485ms 1.485ms 35.520us 60.23% 35.520us 35.520us 1
aten::view 0.70% 13.010us 0.70% 13.010us 2.168us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.62% 30.339us 1.62% 30.339us 3.371us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.26% 4.881us 0.26% 4.881us 1.627us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 8.48% 158.562us 8.48% 158.562us 52.854us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.26% 4.860us 0.26% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.871ms
Self CUDA time total: 58.976us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S512_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 7.35% 135.313us 99.73% 1.836ms 1.836ms 0.000us 0.00% 200.830us 200.830us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.50% 46.052us 91.69% 1.688ms 562.585us 126.431us 100.00% 200.830us 66.943us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 142.015us 112.33% 142.015us 142.015us 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 126.431us 100.00% 126.431us 42.144us 3
Activity Buffer Request 77.83% 1.433ms 77.83% 1.433ms 1.433ms 74.399us 58.85% 74.399us 74.399us 1
aten::view 0.68% 12.599us 0.68% 12.599us 2.100us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.73% 31.929us 1.73% 31.929us 3.548us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.30% 5.440us 0.30% 5.440us 1.813us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 9.33% 171.692us 9.33% 171.692us 57.231us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.27% 4.980us 0.27% 4.980us 4.980us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.841ms
Self CUDA time total: 126.431us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S1024_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 114.877us 559.23% 114.877us 114.877us 1
hf_kernels_layer_norm 18.77% 104.472us 99.13% 551.627us 551.627us 0.000us 0.00% 27.357us 27.357us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 8.09% 45.039us 78.27% 435.585us 145.195us 20.542us 100.00% 27.357us 9.119us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 20.542us 100.00% 20.542us 6.847us 3
Activity Buffer Request 36.72% 204.352us 36.72% 204.352us 204.352us 6.815us 33.18% 6.815us 6.815us 1
aten::view 2.08% 11.570us 2.08% 11.570us 1.928us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 5.24% 29.142us 5.24% 29.142us 3.238us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.93% 5.150us 0.93% 5.150us 1.717us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 27.30% 151.902us 27.30% 151.902us 50.634us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.87% 4.869us 0.87% 4.869us 4.869us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 556.496us
Self CUDA time total: 20.542us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S1024_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 128.543us 194.15% 128.543us 128.543us 1
hf_kernels_layer_norm 6.47% 121.263us 99.74% 1.870ms 1.870ms 0.000us 0.00% 103.680us 103.680us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.50% 46.880us 92.61% 1.737ms 578.834us 66.208us 100.00% 103.680us 34.560us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 66.208us 100.00% 66.208us 22.069us 3
Activity Buffer Request 80.04% 1.501ms 80.04% 1.501ms 1.501ms 37.472us 56.60% 37.472us 37.472us 1
aten::view 0.67% 12.550us 0.67% 12.550us 2.092us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.61% 30.111us 1.61% 30.111us 3.346us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.29% 5.429us 0.29% 5.429us 1.810us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 8.17% 153.262us 8.17% 153.262us 51.087us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.26% 4.790us 0.26% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.875ms
Self CUDA time total: 66.208us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S1024_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 21.88% 101.912us 98.91% 460.726us 460.726us 0.000us 0.00% 193.786us 193.786us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 10.30% 47.997us 74.62% 347.614us 115.871us 120.124us 100.00% 193.786us 64.595us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 129.116us 107.49% 129.116us 129.116us 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 120.124us 100.00% 120.124us 40.041us 3
Activity Buffer Request 23.66% 110.222us 23.66% 110.222us 110.222us 73.662us 61.32% 73.662us 73.662us 1
aten::view 2.40% 11.200us 2.40% 11.200us 1.867us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 6.29% 29.283us 6.29% 29.283us 3.254us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.07% 4.970us 1.07% 4.970us 1.657us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 33.30% 155.142us 33.30% 155.142us 51.714us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 1.09% 5.100us 1.09% 5.100us 5.100us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 465.826us
Self CUDA time total: 120.124us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S1024_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 10.47% 108.133us 61.96% 639.990us 639.990us 0.000us 0.00% 741.038us 741.038us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 4.66% 48.171us 50.27% 519.257us 173.086us 556.019us 100.00% 741.038us 247.013us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 557.395us 100.25% 557.395us 557.395us 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 556.019us 100.00% 556.019us 185.340us 3
Activity Buffer Request 26.52% 273.914us 26.52% 273.914us 273.914us 185.019us 33.28% 185.019us 185.019us 1
aten::view 1.22% 12.600us 1.22% 12.600us 2.100us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 2.91% 30.100us 2.91% 30.100us 3.344us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.47% 4.869us 0.47% 4.869us 1.623us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 15.70% 162.203us 15.70% 162.203us 54.068us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 38.04% 392.946us 38.04% 392.946us 392.946us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.033ms
Self CUDA time total: 556.019us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S2048_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 117.533us 202.70% 117.533us 117.533us 1
hf_kernels_layer_norm 16.63% 101.441us 99.21% 605.228us 605.228us 0.000us 0.00% 93.950us 93.950us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 7.68% 46.841us 80.72% 492.428us 164.143us 57.983us 100.00% 93.950us 31.317us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 57.983us 100.00% 57.983us 19.328us 3
Activity Buffer Request 41.81% 255.054us 41.81% 255.054us 255.054us 35.967us 62.03% 35.967us 35.967us 1
aten::view 1.86% 11.359us 1.86% 11.359us 1.893us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 4.84% 29.531us 4.84% 29.531us 3.281us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.88% 5.399us 0.88% 5.399us 1.800us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 25.51% 155.603us 25.51% 155.603us 51.868us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.79% 4.850us 0.79% 4.850us 4.850us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 610.078us
Self CUDA time total: 57.983us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S2048_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 16.67% 104.061us 99.23% 619.539us 619.539us 0.000us 0.00% 218.617us 218.617us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 7.57% 47.260us 80.66% 503.568us 167.856us 138.780us 100.00% 218.617us 72.872us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 140.188us 101.01% 140.188us 140.188us 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 138.780us 100.00% 138.780us 46.260us 3
Activity Buffer Request 42.90% 267.854us 42.90% 267.854us 267.854us 79.837us 57.53% 79.837us 79.837us 1
aten::view 1.91% 11.910us 1.91% 11.910us 1.985us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 4.81% 30.001us 4.81% 30.001us 3.333us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.76% 4.720us 0.76% 4.720us 1.573us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 24.62% 153.733us 24.62% 153.733us 51.244us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.77% 4.780us 0.77% 4.780us 4.780us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 624.319us
Self CUDA time total: 138.780us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S2048_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 11.56% 103.222us 56.17% 501.697us 501.697us 0.000us 0.00% 729.744us 729.744us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 5.35% 47.791us 43.31% 386.845us 128.948us 547.924us 100.00% 729.744us 243.248us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 549.427us 100.27% 549.427us 549.427us 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 547.924us 100.00% 547.924us 182.641us 3
Activity Buffer Request 16.56% 147.902us 16.56% 147.902us 147.902us 181.820us 33.18% 181.820us 181.820us 1
aten::view 1.30% 11.630us 1.30% 11.630us 1.938us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 3.31% 29.600us 3.31% 29.600us 3.289us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.55% 4.940us 0.55% 4.940us 1.647us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 17.53% 156.612us 17.53% 156.612us 52.204us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 43.83% 391.555us 43.83% 391.555us 391.555us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 893.252us
Self CUDA time total: 547.924us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B4_S2048_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 6.34% 102.532us 36.35% 588.198us 588.198us 0.000us 0.00% 1.536ms 1.536ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.97% 48.143us 29.27% 473.696us 157.899us 1.186ms 100.00% 1.536ms 511.906us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 1.188ms 100.13% 1.188ms 1.188ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 1.186ms 100.00% 1.186ms 395.396us 3
Activity Buffer Request 14.38% 232.673us 14.38% 232.673us 232.673us 349.530us 29.47% 349.530us 349.530us 1
aten::view 0.74% 11.970us 0.74% 11.970us 1.995us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.86% 30.039us 1.86% 30.039us 3.338us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.30% 4.850us 0.30% 4.850us 1.617us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 9.76% 157.991us 9.76% 157.991us 52.664us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 63.65% 1.030ms 63.65% 1.030ms 1.030ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.618ms
Self CUDA time total: 1.186ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S128_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 112.413us 848.59% 112.413us 112.413us 1
hf_kernels_layer_norm 21.62% 101.733us 99.00% 465.906us 465.906us 0.000us 0.00% 17.726us 17.726us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 10.03% 47.199us 74.95% 352.704us 117.568us 13.247us 100.00% 17.726us 5.909us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 13.247us 100.00% 13.247us 4.416us 3
Activity Buffer Request 24.84% 116.882us 24.84% 116.882us 116.882us 4.479us 33.81% 4.479us 4.479us 1
aten::view 2.44% 11.469us 2.44% 11.469us 1.912us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 6.31% 29.701us 6.31% 29.701us 3.300us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.17% 5.520us 1.17% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 32.60% 153.402us 32.60% 153.402us 51.134us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 1.00% 4.700us 1.00% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 470.606us
Self CUDA time total: 13.247us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S128_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 116.766us 456.71% 116.766us 116.766us 1
hf_kernels_layer_norm 17.51% 102.502us 99.17% 580.409us 580.409us 0.000us 0.00% 34.239us 34.239us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 7.99% 46.742us 79.55% 465.587us 155.196us 25.567us 100.00% 34.239us 11.413us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 25.567us 100.00% 25.567us 8.522us 3
Activity Buffer Request 39.32% 230.104us 39.32% 230.104us 230.104us 8.672us 33.92% 8.672us 8.672us 1
aten::view 2.11% 12.320us 2.11% 12.320us 2.053us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 5.04% 29.500us 5.04% 29.500us 3.278us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.02% 5.979us 1.02% 5.979us 1.993us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 26.19% 153.262us 26.19% 153.262us 51.087us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.83% 4.860us 0.83% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 585.269us
Self CUDA time total: 25.567us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S128_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 120.223us 201.23% 120.223us 120.223us 1
hf_kernels_layer_norm 16.35% 102.201us 99.23% 620.398us 620.398us 0.000us 0.00% 95.200us 95.200us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 7.44% 46.527us 81.07% 506.887us 168.962us 59.744us 100.00% 95.200us 31.733us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 59.744us 100.00% 59.744us 19.915us 3
Activity Buffer Request 43.52% 272.134us 43.52% 272.134us 272.134us 35.456us 59.35% 35.456us 35.456us 1
aten::view 1.81% 11.310us 1.81% 11.310us 1.885us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 4.69% 29.332us 4.69% 29.332us 3.259us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.86% 5.391us 0.86% 5.391us 1.797us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 24.55% 153.503us 24.55% 153.503us 51.168us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.77% 4.841us 0.77% 4.841us 4.841us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 625.239us
Self CUDA time total: 59.744us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S128_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 20.57% 103.320us 99.00% 497.196us 497.196us 0.000us 0.00% 197.814us 197.814us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 9.11% 45.760us 76.10% 382.195us 127.398us 124.346us 100.00% 197.814us 65.938us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 132.857us 106.84% 132.857us 132.857us 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 124.346us 100.00% 124.346us 41.449us 3
Activity Buffer Request 28.52% 143.222us 28.52% 143.222us 143.222us 73.468us 59.08% 73.468us 73.468us 1
aten::view 2.33% 11.681us 2.33% 11.681us 1.947us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 6.37% 31.970us 6.37% 31.970us 3.552us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.95% 4.761us 0.95% 4.761us 1.587us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 31.16% 156.482us 31.16% 156.482us 52.161us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 1.00% 5.020us 1.00% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 502.216us
Self CUDA time total: 124.346us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S512_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 124.255us 213.59% 124.255us 124.255us 1
hf_kernels_layer_norm 13.39% 104.902us 99.38% 778.360us 778.360us 0.000us 0.00% 94.430us 94.430us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 6.10% 47.738us 84.51% 661.878us 220.626us 58.175us 100.00% 94.430us 31.477us 3
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 58.175us 100.00% 58.175us 19.392us 3
Activity Buffer Request 54.12% 423.885us 54.12% 423.885us 423.885us 36.255us 62.32% 36.255us 36.255us 1
aten::view 1.48% 11.580us 1.48% 11.580us 1.930us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 3.89% 30.461us 3.89% 30.461us 3.385us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.64% 5.001us 0.64% 5.001us 1.667us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 19.76% 154.793us 19.76% 154.793us 51.598us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.62% 4.840us 0.62% 4.840us 4.840us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 783.200us
Self CUDA time total: 58.175us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S512_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 21.79% 100.002us 98.87% 453.846us 453.846us 0.000us 0.00% 220.923us 220.923us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 9.94% 45.651us 74.52% 342.064us 114.021us 139.741us 100.00% 220.923us 73.641us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 141.149us 101.01% 141.149us 141.149us 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 139.741us 100.00% 139.741us 46.580us 3
Activity Buffer Request 23.19% 106.461us 23.19% 106.461us 106.461us 81.182us 58.09% 81.182us 81.182us 1
aten::view 2.57% 11.780us 2.57% 11.780us 1.963us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 6.95% 31.900us 6.95% 31.900us 3.544us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.20% 5.510us 1.20% 5.510us 1.837us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 33.23% 152.542us 33.23% 152.542us 50.847us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 1.13% 5.191us 1.13% 5.191us 5.191us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 459.037us
Self CUDA time total: 139.741us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S512_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 8.50% 106.103us 68.87% 859.212us 859.212us 0.000us 0.00% 730.264us 730.264us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 3.84% 47.858us 59.45% 741.700us 247.233us 547.642us 100.00% 730.264us 243.421us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 549.114us 100.27% 549.114us 549.114us 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 547.642us 100.00% 547.642us 182.547us 3
Activity Buffer Request 40.36% 503.557us 40.36% 503.557us 503.557us 182.622us 33.35% 182.622us 182.622us 1
aten::view 0.91% 11.409us 0.91% 11.409us 1.901us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 2.41% 30.103us 2.41% 30.103us 3.345us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.44% 5.510us 0.44% 5.510us 1.837us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 12.40% 154.672us 12.40% 154.672us 51.557us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 31.13% 388.435us 31.13% 388.435us 388.435us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.248ms
Self CUDA time total: 547.642us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S512_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 6.20% 117.401us 45.70% 865.822us 865.822us 0.000us 0.00% 1.533ms 1.533ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.53% 47.909us 38.86% 736.290us 245.430us 1.191ms 100.00% 1.533ms 511.056us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 1.192ms 100.13% 1.192ms 1.192ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 1.191ms 100.00% 1.191ms 396.977us 3
Activity Buffer Request 26.13% 495.047us 26.13% 495.047us 495.047us 342.236us 28.74% 342.236us 342.236us 1
aten::view 0.64% 12.131us 0.64% 12.131us 2.022us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.61% 30.562us 1.61% 30.562us 3.396us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.31% 5.930us 0.31% 5.930us 1.977us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 8.28% 156.842us 8.28% 156.842us 52.281us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 54.30% 1.029ms 54.30% 1.029ms 1.029ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.895ms
Self CUDA time total: 1.191ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S1024_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 18.76% 102.890us 99.05% 543.128us 543.128us 0.000us 0.00% 191.549us 191.549us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 9.08% 49.784us 78.18% 428.658us 142.886us 117.790us 100.00% 191.549us 63.850us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 127.934us 108.61% 127.934us 127.934us 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 117.790us 100.00% 117.790us 39.263us 3
Activity Buffer Request 33.02% 181.032us 33.02% 181.032us 181.032us 73.759us 62.62% 73.759us 73.759us 1
aten::view 2.11% 11.580us 2.11% 11.580us 1.930us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 5.47% 30.020us 5.47% 30.020us 3.336us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 1.00% 5.460us 1.00% 5.460us 1.820us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 29.61% 162.362us 29.61% 162.362us 54.121us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.95% 5.190us 0.95% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 548.318us
Self CUDA time total: 117.790us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S1024_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 9.87% 125.762us 69.07% 879.903us 879.903us 0.000us 0.00% 766.838us 766.838us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 3.87% 49.332us 58.21% 741.561us 247.187us 575.481us 100.00% 766.838us 255.613us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 576.857us 100.24% 576.857us 576.857us 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 575.481us 100.00% 575.481us 191.827us 3
Activity Buffer Request 39.29% 500.518us 39.29% 500.518us 500.518us 191.357us 33.25% 191.357us 191.357us 1
aten::view 0.99% 12.580us 0.99% 12.580us 2.097us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 2.41% 30.689us 2.41% 30.689us 3.410us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.43% 5.420us 0.43% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 12.21% 155.602us 12.21% 155.602us 51.867us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 30.93% 394.045us 30.93% 394.045us 394.045us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.274ms
Self CUDA time total: 575.481us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S1024_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 6.87% 103.651us 31.62% 476.976us 476.976us 0.000us 0.00% 1.531ms 1.531ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 3.16% 47.619us 23.98% 361.844us 120.615us 1.187ms 100.00% 1.531ms 510.298us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 1.188ms 100.13% 1.188ms 1.188ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 1.187ms 100.00% 1.187ms 395.515us 3
Activity Buffer Request 8.20% 123.752us 8.20% 123.752us 123.752us 344.347us 29.02% 344.347us 344.347us 1
aten::view 0.76% 11.481us 0.76% 11.481us 1.913us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.98% 29.821us 1.98% 29.821us 3.313us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.39% 5.930us 0.39% 5.930us 1.977us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 10.26% 154.722us 10.26% 154.722us 51.574us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 68.38% 1.032ms 68.38% 1.032ms 1.032ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.509ms
Self CUDA time total: 1.187ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S1024_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 4.11% 127.961us 28.50% 887.612us 887.612us 0.000us 0.00% 3.104ms 3.104ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 1.47% 45.722us 24.01% 747.701us 249.234us 2.375ms 100.00% 3.104ms 1.035ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.376ms 100.06% 2.376ms 2.376ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.375ms 100.00% 2.375ms 791.601us 3
Activity Buffer Request 16.22% 505.157us 16.22% 505.157us 505.157us 729.500us 30.72% 729.500us 729.500us 1
aten::view 0.38% 11.950us 0.38% 11.950us 1.992us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.97% 30.190us 0.97% 30.190us 3.354us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.16% 4.890us 0.16% 4.890us 1.630us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 5.19% 161.742us 5.19% 161.742us 53.914us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 71.50% 2.226ms 71.50% 2.226ms 2.226ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.114ms
Self CUDA time total: 2.375ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 5.82% 128.863us 81.59% 1.808ms 1.808ms 0.000us 0.00% 756.792us 756.792us 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.11% 46.800us 75.21% 1.666ms 555.488us 566.586us 100.00% 756.792us 252.264us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 567.994us 100.25% 567.994us 567.994us 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 566.586us 100.00% 566.586us 188.862us 3
Activity Buffer Request 64.48% 1.429ms 64.48% 1.429ms 1.429ms 190.206us 33.57% 190.206us 190.206us 1
aten::view 0.56% 12.380us 0.56% 12.380us 2.063us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.35% 29.990us 1.35% 29.990us 3.332us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.24% 5.300us 0.24% 5.300us 1.767us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 7.03% 155.802us 7.03% 155.802us 51.934us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 18.41% 407.946us 18.41% 407.946us 407.946us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.216ms
Self CUDA time total: 566.586us
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 6.78% 107.581us 32.18% 510.957us 510.957us 0.000us 0.00% 1.590ms 1.590ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 2.95% 46.851us 24.67% 391.616us 130.539us 1.234ms 100.00% 1.590ms 529.905us 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 1.235ms 100.12% 1.235ms 1.235ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 1.234ms 100.00% 1.234ms 411.346us 3
Activity Buffer Request 9.78% 155.342us 9.78% 155.342us 155.342us 355.677us 28.82% 355.677us 355.677us 1
aten::view 0.74% 11.760us 0.74% 11.760us 1.960us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.88% 29.861us 1.88% 29.861us 3.318us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.38% 5.960us 0.38% 5.960us 1.987us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 9.67% 153.602us 9.67% 153.602us 51.201us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 67.82% 1.077ms 67.82% 1.077ms 1.077ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.588ms
Self CUDA time total: 1.234ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 4.29% 122.511us 22.27% 635.379us 635.379us 0.000us 0.00% 3.116ms 3.116ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 1.67% 47.772us 17.54% 500.568us 166.856us 2.375ms 100.00% 3.116ms 1.039ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.377ms 100.06% 2.377ms 2.377ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.375ms 100.00% 2.375ms 791.801us 3
Activity Buffer Request 8.85% 252.513us 8.85% 252.513us 252.513us 740.986us 31.19% 740.986us 740.986us 1
aten::view 0.43% 12.300us 0.43% 12.300us 2.050us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.05% 29.891us 1.05% 29.891us 3.321us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.21% 6.001us 0.21% 6.001us 2.000us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 5.76% 164.391us 5.76% 164.391us 54.797us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 77.73% 2.218ms 77.73% 2.218ms 2.218ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.853ms
Self CUDA time total: 2.375ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 2.07% 109.351us 12.73% 673.809us 673.809us 0.000us 0.00% 6.337ms 6.337ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.93% 49.100us 10.45% 553.127us 184.376us 4.781ms 100.00% 6.337ms 2.112ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.782ms 100.03% 4.782ms 4.782ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.781ms 100.00% 4.781ms 1.594ms 3
Activity Buffer Request 5.38% 284.544us 5.38% 284.544us 284.544us 1.556ms 32.54% 1.556ms 1.556ms 1
aten::view 0.21% 11.331us 0.21% 11.331us 1.889us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.57% 29.971us 0.57% 29.971us 3.330us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.11% 5.990us 0.11% 5.990us 1.997us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.47% 183.522us 3.47% 183.522us 61.174us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 87.27% 4.620ms 87.27% 4.620ms 4.620ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.294ms
Self CUDA time total: 4.781ms
impl wl p50(ms) ok
hf_kernels_layer_norm LN_B16_S1024_D1024 0.05 False
hf_kernels_layer_norm LN_B16_S1024_D2048 0.22 False
hf_kernels_layer_norm LN_B16_S1024_D4096 0.44 False
hf_kernels_layer_norm LN_B16_S1024_D8192 0.84 False
hf_kernels_layer_norm LN_B16_S128_D1024 0.05 False
hf_kernels_layer_norm LN_B16_S128_D2048 0.05 False
hf_kernels_layer_norm LN_B16_S128_D4096 0.05 False
hf_kernels_layer_norm LN_B16_S128_D8192 0.05 False
hf_kernels_layer_norm LN_B16_S2048_D1024 0.21 False
hf_kernels_layer_norm LN_B16_S2048_D2048 0.46 False
hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 False
hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 False
hf_kernels_layer_norm LN_B16_S512_D1024 0.05 False
hf_kernels_layer_norm LN_B16_S512_D2048 0.05 False
hf_kernels_layer_norm LN_B16_S512_D4096 0.21 False
hf_kernels_layer_norm LN_B16_S512_D8192 0.43 False
hf_kernels_layer_norm LN_B1_S1024_D1024 0.05 False
hf_kernels_layer_norm LN_B1_S1024_D2048 0.05 False
hf_kernels_layer_norm LN_B1_S1024_D4096 0.05 False
hf_kernels_layer_norm LN_B1_S1024_D8192 0.05 False
hf_kernels_layer_norm LN_B1_S128_D1024 0.04 False
hf_kernels_layer_norm LN_B1_S128_D2048 0.05 False
hf_kernels_layer_norm LN_B1_S128_D4096 0.05 False
hf_kernels_layer_norm LN_B1_S128_D8192 0.05 False
hf_kernels_layer_norm LN_B1_S2048_D1024 0.05 False
hf_kernels_layer_norm LN_B1_S2048_D2048 0.05 False
hf_kernels_layer_norm LN_B1_S2048_D4096 0.05 False
hf_kernels_layer_norm LN_B1_S2048_D8192 0.05 False
hf_kernels_layer_norm LN_B1_S512_D1024 0.05 False
hf_kernels_layer_norm LN_B1_S512_D2048 0.05 False
hf_kernels_layer_norm LN_B1_S512_D4096 0.05 False
hf_kernels_layer_norm LN_B1_S512_D8192 0.05 False
hf_kernels_layer_norm LN_B4_S1024_D1024 0.05 False
hf_kernels_layer_norm LN_B4_S1024_D2048 0.05 False
hf_kernels_layer_norm LN_B4_S1024_D4096 0.05 False
hf_kernels_layer_norm LN_B4_S1024_D8192 0.21 False
hf_kernels_layer_norm LN_B4_S128_D1024 0.05 False
hf_kernels_layer_norm LN_B4_S128_D2048 0.05 False
hf_kernels_layer_norm LN_B4_S128_D4096 0.05 False
hf_kernels_layer_norm LN_B4_S128_D8192 0.05 False
hf_kernels_layer_norm LN_B4_S2048_D1024 0.05 False
hf_kernels_layer_norm LN_B4_S2048_D2048 0.06 False
hf_kernels_layer_norm LN_B4_S2048_D4096 0.21 False
hf_kernels_layer_norm LN_B4_S2048_D8192 0.44 False
hf_kernels_layer_norm LN_B4_S512_D1024 0.05 False
hf_kernels_layer_norm LN_B4_S512_D2048 0.05 False
hf_kernels_layer_norm LN_B4_S512_D4096 0.05 False
hf_kernels_layer_norm LN_B4_S512_D8192 0.05 False
▶ UV Install Logs
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 7.10it/s]
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.13it/s]
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.59it/s]