Running activation benchmark on cuda with 9 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 183.359us 1436.08% 183.359us 183.359us 1
torch_eager 11.24% 212.694us 99.53% 1.883ms 1.883ms 0.000us 0.00% 15.072us 15.072us 1
aten::silu 3.31% 62.660us 82.30% 1.557ms 519.134us 6.527us 51.12% 8.831us 2.944us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.527us 51.12% 6.527us 2.176us 3
aten::mul 1.85% 35.100us 2.98% 56.340us 18.780us 6.241us 48.88% 6.241us 2.080us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.241us 48.88% 6.241us 2.080us 3
Activity Buffer Request 76.74% 1.452ms 76.74% 1.452ms 1.452ms 2.304us 18.05% 2.304us 2.304us 1
aten::slice 2.41% 45.561us 3.01% 56.902us 9.484us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.60% 11.341us 0.60% 11.341us 1.890us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.37% 63.741us 3.37% 63.741us 10.623us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.47% 8.969us 0.47% 8.969us 8.969us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.892ms
Self CUDA time total: 12.768us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.431us 1279.63% 158.431us 158.431us 1
torch_eager 6.85% 117.301us 99.69% 1.707ms 1.707ms 0.000us 0.00% 14.557us 14.557us 1
aten::silu 2.45% 41.990us 88.25% 1.511ms 503.680us 6.398us 51.68% 8.574us 2.858us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.398us 51.68% 6.398us 2.133us 3
aten::mul 1.63% 27.830us 2.78% 47.630us 15.877us 5.983us 48.32% 5.983us 1.994us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983us 48.32% 5.983us 1.994us 3
Activity Buffer Request 84.28% 1.443ms 84.28% 1.443ms 1.443ms 2.176us 17.58% 2.176us 2.176us 1
aten::slice 1.45% 24.820us 1.81% 30.931us 5.155us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.36% 6.111us 0.36% 6.111us 1.019us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.67% 45.711us 2.67% 45.711us 7.618us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.31% 5.320us 0.31% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.712ms
Self CUDA time total: 12.381us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 145.182us 1095.88% 145.182us 145.182us 1
torch_eager 6.28% 105.841us 99.65% 1.680ms 1.680ms 0.000us 0.00% 15.552us 15.552us 1
aten::silu 2.40% 40.400us 89.03% 1.501ms 500.258us 6.816us 51.45% 9.120us 3.040us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 51.45% 6.816us 2.272us 3
aten::mul 1.52% 25.690us 2.64% 44.480us 14.827us 6.432us 48.55% 6.432us 2.144us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.55% 6.432us 2.144us 3
Activity Buffer Request 85.10% 1.434ms 85.10% 1.434ms 1.434ms 2.304us 17.39% 2.304us 2.304us 1
aten::slice 1.37% 23.030us 1.70% 28.690us 4.782us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.34% 5.660us 0.34% 5.660us 0.943us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.66% 44.762us 2.66% 44.762us 7.460us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.35% 5.820us 0.35% 5.820us 5.820us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.686ms
Self CUDA time total: 13.248us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 145.025us 1135.85% 145.025us 145.025us 1
torch_eager 7.55% 116.292us 99.65% 1.535ms 1.535ms 0.000us 0.00% 14.976us 14.976us 1
aten::silu 2.67% 41.061us 87.34% 1.345ms 448.460us 6.592us 51.63% 8.800us 2.933us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 51.63% 6.592us 2.197us 3
aten::mul 1.71% 26.359us 2.88% 44.330us 14.777us 6.176us 48.37% 6.176us 2.059us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.37% 6.176us 2.059us 3
Activity Buffer Request 69.61% 1.072ms 69.61% 1.072ms 1.072ms 2.208us 17.29% 2.208us 2.208us 1
aten::slice 1.52% 23.350us 1.89% 29.050us 4.842us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.37% 5.700us 0.37% 5.700us 0.950us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 16.23% 250.045us 16.23% 250.045us 41.674us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.35% 5.360us 0.35% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.540ms
Self CUDA time total: 12.768us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 144.030us 1089.82% 144.030us 144.030us 1
torch_eager 5.82% 104.551us 99.68% 1.792ms 1.792ms 0.000us 0.00% 15.488us 15.488us 1
aten::silu 2.32% 41.682us 89.81% 1.614ms 538.151us 6.752us 51.09% 9.024us 3.008us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 51.09% 6.752us 2.251us 3
aten::mul 1.41% 25.409us 2.48% 44.550us 14.850us 6.464us 48.91% 6.464us 2.155us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.91% 6.464us 2.155us 3
Activity Buffer Request 78.50% 1.411ms 78.50% 1.411ms 1.411ms 2.272us 17.19% 2.272us 2.272us 1
aten::slice 1.27% 22.830us 1.58% 28.320us 4.720us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.31% 5.490us 0.31% 5.490us 0.915us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.06% 180.853us 10.06% 180.853us 30.142us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.32% 5.710us 0.32% 5.710us 5.710us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.798ms
Self CUDA time total: 13.216us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 140.382us 902.66% 140.382us 140.382us 1
torch_eager 21.39% 103.633us 98.99% 479.697us 479.697us 0.000us 0.00% 18.240us 18.240us 1
aten::silu 8.56% 41.460us 63.18% 306.154us 102.051us 7.936us 51.03% 10.624us 3.541us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.03% 7.936us 2.645us 3
aten::mul 4.90% 23.759us 8.63% 41.840us 13.947us 7.616us 48.97% 7.616us 2.539us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.97% 7.616us 2.539us 3
Activity Buffer Request 23.12% 112.032us 23.12% 112.032us 112.032us 2.688us 17.28% 2.688us 2.688us 1
aten::slice 4.68% 22.671us 5.79% 28.070us 4.678us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 1.11% 5.399us 1.11% 5.399us 0.900us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 35.23% 170.743us 35.23% 170.743us 28.457us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 1.01% 4.900us 1.01% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 484.597us
Self CUDA time total: 15.552us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 145.662us 1011.54% 145.662us 145.662us 1
torch_eager 5.99% 108.381us 99.73% 1.804ms 1.804ms 0.000us 0.00% 16.896us 16.896us 1
aten::silu 2.28% 41.342us 89.69% 1.623ms 540.945us 7.392us 51.33% 9.888us 3.296us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 51.33% 7.392us 2.464us 3
aten::mul 1.44% 26.049us 2.45% 44.420us 14.807us 7.008us 48.67% 7.008us 2.336us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.67% 7.008us 2.336us 3
Activity Buffer Request 78.99% 1.429ms 78.99% 1.429ms 1.429ms 2.496us 17.33% 2.496us 2.496us 1
aten::slice 1.28% 23.160us 1.59% 28.810us 4.802us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.31% 5.650us 0.31% 5.650us 0.942us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.43% 170.603us 9.43% 170.603us 28.434us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.27% 4.930us 0.27% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.809ms
Self CUDA time total: 14.400us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 142.206us 914.45% 142.206us 142.206us 1
torch_eager 21.70% 105.494us 98.87% 480.727us 480.727us 0.000us 0.00% 18.239us 18.239us 1
aten::silu 8.21% 39.900us 62.39% 303.354us 101.118us 7.966us 51.23% 10.654us 3.551us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.966us 51.23% 7.966us 2.655us 3
aten::mul 5.16% 25.070us 8.84% 42.990us 14.330us 7.585us 48.77% 7.585us 2.528us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.585us 48.77% 7.585us 2.528us 3
Activity Buffer Request 23.29% 113.242us 23.29% 113.242us 113.242us 2.688us 17.29% 2.688us 2.688us 1
aten::slice 4.75% 23.080us 5.94% 28.889us 4.815us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 1.19% 5.809us 1.19% 5.809us 0.968us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 34.58% 168.132us 34.58% 168.132us 28.022us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 1.13% 5.500us 1.13% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 486.227us
Self CUDA time total: 15.551us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.022us 661.50% 149.022us 149.022us 1
torch_eager 5.72% 105.900us 99.72% 1.847ms 1.847ms 0.000us 0.00% 26.431us 26.431us 1
aten::silu 2.24% 41.461us 90.05% 1.668ms 555.875us 11.552us 51.28% 15.455us 5.152us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 51.28% 11.552us 3.851us 3
aten::mul 1.41% 26.021us 2.40% 44.421us 14.807us 10.976us 48.72% 10.976us 3.659us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.976us 48.72% 10.976us 3.659us 3
Activity Buffer Request 79.50% 1.472ms 79.50% 1.472ms 1.472ms 3.903us 17.33% 3.903us 3.903us 1
aten::slice 1.25% 23.131us 1.56% 28.831us 4.805us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.31% 5.700us 0.31% 5.700us 0.950us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.31% 172.382us 9.31% 172.382us 28.730us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.28% 5.130us 0.28% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.852ms
Self CUDA time total: 22.528us
impl wl p50(ms) ok
torch_eager cuda_T128_D1024 0.05 True
torch_eager cuda_T128_D2048 0.05 True
torch_eager cuda_T128_D768 0.04 True
torch_eager cuda_T256_D1024 0.05 True
torch_eager cuda_T256_D2048 0.05 True
torch_eager cuda_T256_D768 0.05 True
torch_eager cuda_T512_D1024 0.05 True
torch_eager cuda_T512_D2048 0.05 True
torch_eager cuda_T512_D768 0.05 True