Running activation benchmark on cuda with 9 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 198.560us 1555.14% 198.560us 198.560us 1
torch_eager 10.82% 202.394us 99.60% 1.864ms 1.864ms 0.000us 0.00% 15.104us 15.104us 1
aten::silu 3.05% 57.001us 82.79% 1.549ms 516.356us 6.560us 51.38% 8.896us 2.965us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.38% 6.560us 2.187us 3
aten::mul 1.85% 34.663us 3.11% 58.253us 19.418us 6.208us 48.62% 6.208us 2.069us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 48.62% 6.208us 2.069us 3
Activity Buffer Request 77.33% 1.447ms 77.33% 1.447ms 1.447ms 2.336us 18.30% 2.336us 2.336us 1
aten::slice 2.27% 42.481us 2.88% 53.841us 8.973us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.61% 11.360us 0.61% 11.360us 1.893us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.67% 68.681us 3.67% 68.681us 11.447us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.40% 7.560us 0.40% 7.560us 7.560us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.871ms
Self CUDA time total: 12.768us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.854us 1245.68% 153.854us 153.854us 1
torch_eager 7.83% 135.935us 99.65% 1.729ms 1.729ms 0.000us 0.00% 14.495us 14.495us 1
aten::silu 2.47% 42.821us 87.44% 1.517ms 505.699us 6.399us 51.81% 8.543us 2.848us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.81% 6.399us 2.133us 3
aten::mul 1.58% 27.360us 2.69% 46.680us 15.560us 5.952us 48.19% 5.952us 1.984us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 48.19% 5.952us 1.984us 3
Activity Buffer Request 83.34% 1.446ms 83.34% 1.446ms 1.446ms 2.144us 17.36% 2.144us 2.144us 1
aten::slice 1.38% 23.991us 1.69% 29.361us 4.893us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.31% 5.370us 0.31% 5.370us 0.895us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.74% 47.550us 2.74% 47.550us 7.925us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.35% 6.041us 0.35% 6.041us 6.041us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.735ms
Self CUDA time total: 12.351us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.990us 1157.70% 152.990us 152.990us 1
torch_eager 7.93% 136.944us 99.69% 1.722ms 1.722ms 0.000us 0.00% 15.487us 15.487us 1
aten::silu 2.43% 41.922us 87.32% 1.508ms 502.829us 6.752us 51.09% 9.024us 3.008us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 51.09% 6.752us 2.251us 3
aten::mul 1.55% 26.841us 2.71% 46.791us 15.597us 6.463us 48.91% 6.463us 2.154us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.463us 48.91% 6.463us 2.154us 3
Activity Buffer Request 83.33% 1.439ms 83.33% 1.439ms 1.439ms 2.272us 17.19% 2.272us 2.272us 1
aten::slice 1.41% 24.420us 1.74% 29.990us 4.998us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.32% 5.570us 0.32% 5.570us 0.928us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.72% 47.030us 2.72% 47.030us 7.838us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.31% 5.290us 0.31% 5.290us 5.290us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.728ms
Self CUDA time total: 13.215us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.287us 1195.72% 152.287us 152.287us 1
torch_eager 6.75% 128.682us 99.76% 1.902ms 1.902ms 0.000us 0.00% 14.944us 14.944us 1
aten::silu 2.22% 42.301us 89.12% 1.699ms 566.261us 6.560us 51.51% 8.768us 2.923us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.51% 6.560us 2.187us 3
aten::mul 1.34% 25.502us 2.28% 43.392us 14.464us 6.176us 48.49% 6.176us 2.059us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
Activity Buffer Request 74.83% 1.427ms 74.83% 1.427ms 1.427ms 2.208us 17.34% 2.208us 2.208us 1
aten::slice 1.32% 25.141us 1.61% 30.781us 5.130us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.30% 5.640us 0.30% 5.640us 0.940us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 13.00% 247.856us 13.00% 247.856us 41.309us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.24% 4.611us 0.24% 4.611us 4.611us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.906ms
Self CUDA time total: 12.736us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.054us 1155.39% 153.054us 153.054us 1
torch_eager 6.42% 122.793us 99.75% 1.907ms 1.907ms 0.000us 0.00% 15.518us 15.518us 1
aten::silu 2.19% 41.952us 89.33% 1.708ms 569.191us 6.751us 50.96% 9.022us 3.007us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.751us 50.96% 6.751us 2.250us 3
aten::mul 1.27% 24.330us 2.36% 45.101us 15.034us 6.496us 49.04% 6.496us 2.165us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.496us 49.04% 6.496us 2.165us 3
Activity Buffer Request 76.06% 1.454ms 76.06% 1.454ms 1.454ms 2.271us 17.14% 2.271us 2.271us 1
aten::slice 1.34% 25.570us 1.64% 31.330us 5.222us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.30% 5.760us 0.30% 5.760us 0.960us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 12.16% 232.387us 12.16% 232.387us 38.731us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.25% 4.840us 0.25% 4.840us 4.840us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.912ms
Self CUDA time total: 13.247us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.743us 1029.27% 159.743us 159.743us 1
torch_eager 7.04% 135.613us 99.74% 1.921ms 1.921ms 0.000us 0.00% 18.208us 18.208us 1
aten::silu 2.22% 42.702us 88.66% 1.708ms 569.181us 7.936us 51.13% 10.624us 3.541us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.13% 7.936us 2.645us 3
aten::mul 1.46% 28.181us 2.39% 45.941us 15.314us 7.584us 48.87% 7.584us 2.528us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.87% 7.584us 2.528us 3
Activity Buffer Request 75.65% 1.457ms 75.65% 1.457ms 1.457ms 2.688us 17.32% 2.688us 2.688us 1
aten::slice 1.35% 26.081us 1.66% 31.951us 5.325us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.30% 5.870us 0.30% 5.870us 0.978us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.71% 225.495us 11.71% 225.495us 37.582us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.26% 4.960us 0.26% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.926ms
Self CUDA time total: 15.520us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.031us 1088.46% 156.031us 156.031us 1
torch_eager 6.78% 127.672us 99.74% 1.878ms 1.878ms 0.000us 0.00% 16.798us 16.798us 1
aten::silu 2.24% 42.252us 88.75% 1.671ms 556.944us 7.327us 51.11% 9.790us 3.263us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.327us 51.11% 7.327us 2.442us 3
aten::mul 1.40% 26.401us 2.46% 46.222us 15.407us 7.008us 48.89% 7.008us 2.336us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.89% 7.008us 2.336us 3
Activity Buffer Request 75.83% 1.428ms 75.83% 1.428ms 1.428ms 2.463us 17.18% 2.463us 2.463us 1
aten::slice 1.43% 26.941us 1.75% 32.941us 5.490us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.32% 6.000us 0.32% 6.000us 1.000us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.73% 220.885us 11.73% 220.885us 36.814us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.26% 4.871us 0.26% 4.871us 4.871us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.883ms
Self CUDA time total: 14.335us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.072us 971.40% 151.072us 151.072us 1
torch_eager 5.82% 108.433us 99.72% 1.859ms 1.859ms 0.000us 0.00% 18.240us 18.240us 1
aten::silu 2.20% 40.971us 89.83% 1.675ms 558.344us 7.968us 51.23% 10.656us 3.552us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.23% 7.968us 2.656us 3
aten::mul 1.42% 26.501us 2.46% 45.902us 15.301us 7.584us 48.77% 7.584us 2.528us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.77% 7.584us 2.528us 3
Activity Buffer Request 76.88% 1.433ms 76.88% 1.433ms 1.433ms 2.688us 17.28% 2.688us 2.688us 1
aten::slice 1.31% 24.441us 1.61% 29.960us 4.993us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.30% 5.519us 0.30% 5.519us 0.920us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.80% 219.996us 11.80% 219.996us 36.666us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.28% 5.300us 0.28% 5.300us 5.300us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.865ms
Self CUDA time total: 15.552us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.150us 692.69% 157.150us 157.150us 1
torch_eager 5.73% 107.203us 99.74% 1.865ms 1.865ms 0.000us 0.00% 26.622us 26.622us 1
aten::silu 2.21% 41.231us 89.87% 1.680ms 560.117us 11.647us 51.34% 15.582us 5.194us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.647us 51.34% 11.647us 3.882us 3
aten::mul 1.38% 25.882us 2.47% 46.192us 15.397us 11.040us 48.66% 11.040us 3.680us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.040us 48.66% 11.040us 3.680us 3
Activity Buffer Request 77.17% 1.443ms 77.17% 1.443ms 1.443ms 3.935us 17.34% 3.935us 3.935us 1
aten::slice 1.37% 25.600us 1.67% 31.160us 5.193us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.30% 5.560us 0.30% 5.560us 0.927us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.58% 216.535us 11.58% 216.535us 36.089us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.26% 4.830us 0.26% 4.830us 4.830us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.870ms
Self CUDA time total: 22.687us
impl wl p50(ms) ok
torch_eager cuda_T128_D1024 0.05 True
torch_eager cuda_T128_D2048 0.05 True
torch_eager cuda_T128_D768 0.04 True
torch_eager cuda_T256_D1024 0.05 True
torch_eager cuda_T256_D2048 0.05 True
torch_eager cuda_T256_D768 0.05 True
torch_eager cuda_T512_D1024 0.05 True
torch_eager cuda_T512_D2048 0.05 True
torch_eager cuda_T512_D768 0.05 True