Running activation benchmark on cuda with 9 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 172.065us 1357.73% 172.065us 172.065us 1
torch_eager 8.84% 192.611us 99.34% 2.164ms 2.164ms 0.000us 0.00% 14.977us 14.977us 1
aten::silu 2.51% 54.611us 85.85% 1.870ms 623.473us 6.496us 51.26% 8.800us 2.933us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.496us 51.26% 6.496us 2.165us 3
aten::mul 1.45% 31.541us 2.42% 52.781us 17.594us 6.177us 48.74% 6.177us 2.059us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.177us 48.74% 6.177us 2.059us 3
Activity Buffer Request 81.30% 1.771ms 81.30% 1.771ms 1.771ms 2.304us 18.18% 2.304us 2.304us 1
aten::slice 1.79% 39.021us 2.23% 48.532us 8.089us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.44% 9.511us 0.44% 9.511us 1.585us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.01% 65.621us 3.01% 65.621us 10.937us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.66% 14.470us 0.66% 14.470us 14.470us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.179ms
Self CUDA time total: 12.673us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 147.935us 1197.66% 147.935us 147.935us 1
torch_eager 6.19% 128.671us 99.72% 2.072ms 2.072ms 0.000us 0.00% 14.528us 14.528us 1
aten::silu 1.99% 41.241us 90.00% 1.870ms 623.253us 6.432us 52.07% 8.608us 2.869us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 52.07% 6.432us 2.144us 3
aten::mul 1.21% 25.191us 2.13% 44.341us 14.780us 5.920us 47.93% 5.920us 1.973us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 47.93% 5.920us 1.973us 3
Activity Buffer Request 86.71% 1.801ms 86.71% 1.801ms 1.801ms 2.176us 17.62% 2.176us 2.176us 1
aten::slice 1.12% 23.301us 1.40% 28.981us 4.830us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.27% 5.680us 0.27% 5.680us 0.947us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.23% 46.310us 2.23% 46.310us 7.718us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.28% 5.721us 0.28% 5.721us 5.721us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.077ms
Self CUDA time total: 12.352us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 147.135us 1116.10% 147.135us 147.135us 1
torch_eager 6.76% 134.342us 99.73% 1.980ms 1.980ms 0.000us 0.00% 15.455us 15.455us 1
aten::silu 1.89% 37.461us 89.35% 1.774ms 591.479us 6.784us 51.46% 9.056us 3.019us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.46% 6.784us 2.261us 3
aten::mul 1.28% 25.422us 2.19% 43.411us 14.470us 6.399us 48.54% 6.399us 2.133us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 48.54% 6.399us 2.133us 3
Activity Buffer Request 86.14% 1.711ms 86.14% 1.711ms 1.711ms 2.272us 17.23% 2.272us 2.272us 1
aten::slice 1.16% 23.079us 1.42% 28.280us 4.713us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.26% 5.201us 0.26% 5.201us 0.867us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.23% 44.359us 2.23% 44.359us 7.393us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.27% 5.441us 0.27% 5.441us 5.441us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.986ms
Self CUDA time total: 13.183us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.034us 1193.64% 152.034us 152.034us 1
torch_eager 5.57% 123.804us 99.78% 2.219ms 2.219ms 0.000us 0.00% 14.945us 14.945us 1
aten::silu 1.71% 38.060us 90.80% 2.019ms 672.957us 6.561us 51.51% 8.769us 2.923us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.561us 51.51% 6.561us 2.187us 3
aten::mul 1.26% 28.020us 2.11% 46.890us 15.630us 6.176us 48.49% 6.176us 2.059us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
Activity Buffer Request 81.46% 1.811ms 81.46% 1.811ms 1.811ms 2.208us 17.34% 2.208us 2.208us 1
aten::slice 1.06% 23.629us 1.31% 29.120us 4.853us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.25% 5.491us 0.25% 5.491us 0.915us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.48% 188.472us 8.48% 188.472us 31.412us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.22% 4.841us 0.22% 4.841us 4.841us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.224ms
Self CUDA time total: 12.737us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.353us 1152.70% 152.353us 152.353us 1
torch_eager 6.19% 135.991us 99.76% 2.192ms 2.192ms 0.000us 0.00% 15.489us 15.489us 1
aten::silu 1.77% 38.889us 90.16% 1.981ms 660.320us 6.752us 51.09% 9.024us 3.008us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 51.09% 6.752us 2.251us 3
aten::mul 1.20% 26.341us 2.10% 46.211us 15.404us 6.465us 48.91% 6.465us 2.155us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.465us 48.91% 6.465us 2.155us 3
Activity Buffer Request 80.60% 1.771ms 80.60% 1.771ms 1.771ms 2.272us 17.19% 2.272us 2.272us 1
aten::slice 1.06% 23.362us 1.31% 28.762us 4.794us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.25% 5.400us 0.25% 5.400us 0.900us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.70% 191.103us 8.70% 191.103us 31.851us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.24% 5.211us 0.24% 5.211us 5.211us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.197ms
Self CUDA time total: 13.217us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.216us 991.30% 153.216us 153.216us 1
torch_eager 5.88% 135.461us 99.78% 2.300ms 2.300ms 0.000us 0.00% 18.144us 18.144us 1
aten::silu 1.72% 39.670us 90.62% 2.089ms 696.338us 7.936us 51.35% 10.624us 3.541us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.35% 7.936us 2.645us 3
aten::mul 1.19% 27.391us 2.02% 46.461us 15.487us 7.520us 48.65% 7.520us 2.507us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.520us 48.65% 7.520us 2.507us 3
Activity Buffer Request 81.58% 1.881ms 81.58% 1.881ms 1.881ms 2.688us 17.39% 2.688us 2.688us 1
aten::slice 1.04% 24.071us 1.27% 29.261us 4.877us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.23% 5.190us 0.23% 5.190us 0.865us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.15% 187.833us 8.15% 187.833us 31.305us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.22% 5.060us 0.22% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.305ms
Self CUDA time total: 15.456us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 148.287us 1031.99% 148.287us 148.287us 1
torch_eager 4.89% 105.043us 99.76% 2.144ms 2.144ms 0.000us 0.00% 16.833us 16.833us 1
aten::silu 1.85% 39.730us 91.47% 1.966ms 655.253us 7.361us 51.23% 9.825us 3.275us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 51.23% 7.361us 2.454us 3
aten::mul 1.23% 26.350us 2.09% 44.980us 14.993us 7.008us 48.77% 7.008us 2.336us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.77% 7.008us 2.336us 3
Activity Buffer Request 81.83% 1.759ms 81.83% 1.759ms 1.759ms 2.464us 17.15% 2.464us 2.464us 1
aten::slice 1.07% 23.090us 1.31% 28.260us 4.710us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.24% 5.170us 0.24% 5.170us 0.862us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.65% 185.993us 8.65% 185.993us 30.999us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.24% 5.111us 0.24% 5.111us 5.111us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.149ms
Self CUDA time total: 14.369us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.095us 983.92% 152.095us 152.095us 1
torch_eager 10.87% 257.253us 99.76% 2.361ms 2.361ms 0.000us 0.00% 18.146us 18.146us 1
aten::silu 1.67% 39.540us 85.73% 2.029ms 676.344us 7.905us 51.14% 10.593us 3.531us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.905us 51.14% 7.905us 2.635us 3
aten::mul 1.20% 28.421us 1.97% 46.561us 15.520us 7.553us 48.86% 7.553us 2.518us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.553us 48.86% 7.553us 2.518us 3
Activity Buffer Request 76.39% 1.808ms 76.39% 1.808ms 1.808ms 2.688us 17.39% 2.688us 2.688us 1
aten::slice 0.98% 23.079us 1.19% 28.100us 4.683us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.21% 5.021us 0.21% 5.021us 0.837us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.43% 199.594us 8.43% 199.594us 33.266us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.24% 5.780us 0.24% 5.780us 5.780us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.367ms
Self CUDA time total: 15.458us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 145.313us 647.79% 145.313us 145.313us 1
torch_eager 16.48% 98.469us 99.14% 592.319us 592.319us 0.000us 0.00% 26.336us 26.336us 1
aten::silu 6.71% 40.110us 70.79% 422.906us 140.969us 11.520us 51.36% 15.424us 5.141us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.520us 51.36% 11.520us 3.840us 3
aten::mul 4.29% 25.642us 7.38% 44.092us 14.697us 10.912us 48.64% 10.912us 3.637us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.912us 48.64% 10.912us 3.637us 3
Activity Buffer Request 36.05% 215.374us 36.05% 215.374us 215.374us 3.904us 17.40% 3.904us 3.904us 1
aten::slice 3.67% 21.912us 4.49% 26.852us 4.475us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.83% 4.940us 0.83% 4.940us 0.823us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 31.11% 185.872us 31.11% 185.872us 30.979us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.86% 5.130us 0.86% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 597.449us
Self CUDA time total: 22.432us
impl wl p50(ms) ok
torch_eager cuda_T128_D1024 0.05 True
torch_eager cuda_T128_D2048 0.05 True
torch_eager cuda_T128_D768 0.04 True
torch_eager cuda_T256_D1024 0.05 True
torch_eager cuda_T256_D2048 0.05 True
torch_eager cuda_T256_D768 0.05 True
torch_eager cuda_T512_D1024 0.05 True
torch_eager cuda_T512_D2048 0.05 True
torch_eager cuda_T512_D768 0.05 True