Running activation benchmark on cuda with 9 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 188.575us 1476.70% 188.575us 188.575us 1
torch_eager 11.13% 210.826us 99.56% 1.887ms 1.887ms 0.000us 0.00% 15.106us 15.106us 1
aten::silu 3.37% 63.781us 82.44% 1.562ms 520.736us 6.497us 50.88% 8.833us 2.944us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.497us 50.88% 6.497us 2.166us 3
aten::mul 1.86% 35.170us 2.95% 55.841us 18.614us 6.273us 49.12% 6.273us 2.091us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.273us 49.12% 6.273us 2.091us 3
Activity Buffer Request 76.78% 1.455ms 76.78% 1.455ms 1.455ms 2.336us 18.29% 2.336us 2.336us 1
aten::slice 2.45% 46.380us 3.05% 57.842us 9.640us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.60% 11.462us 0.60% 11.462us 1.910us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.38% 64.112us 3.38% 64.112us 10.685us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.44% 8.280us 0.44% 8.280us 8.280us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.895ms
Self CUDA time total: 12.770us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.926us 1234.87% 152.926us 152.926us 1
torch_eager 6.55% 113.093us 99.67% 1.721ms 1.721ms 0.000us 0.00% 14.560us 14.560us 1
aten::silu 2.40% 41.391us 88.69% 1.532ms 510.609us 6.400us 51.68% 8.576us 2.859us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.400us 51.68% 6.400us 2.133us 3
aten::mul 1.50% 25.830us 2.63% 45.361us 15.120us 5.984us 48.32% 5.984us 1.995us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.32% 5.984us 1.995us 3
Activity Buffer Request 84.72% 1.463ms 84.72% 1.463ms 1.463ms 2.176us 17.57% 2.176us 2.176us 1
aten::slice 1.43% 24.741us 1.80% 31.062us 5.177us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.37% 6.321us 0.37% 6.321us 1.054us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.71% 46.721us 2.71% 46.721us 7.787us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.33% 5.741us 0.33% 5.741us 5.741us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.727ms
Self CUDA time total: 12.384us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.413us 1147.86% 152.413us 152.413us 1
torch_eager 6.17% 105.134us 99.68% 1.699ms 1.699ms 0.000us 0.00% 15.581us 15.581us 1
aten::silu 2.58% 43.990us 88.96% 1.517ms 505.533us 6.814us 51.32% 9.117us 3.039us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.814us 51.32% 6.814us 2.271us 3
aten::mul 1.63% 27.711us 2.72% 46.371us 15.457us 6.464us 48.68% 6.464us 2.155us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.68% 6.464us 2.155us 3
Activity Buffer Request 84.84% 1.446ms 84.84% 1.446ms 1.446ms 2.303us 17.34% 2.303us 2.303us 1
aten::slice 1.47% 24.990us 1.83% 31.250us 5.208us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.37% 6.260us 0.37% 6.260us 1.043us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.63% 44.871us 2.63% 44.871us 7.478us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.32% 5.431us 0.32% 5.431us 5.431us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.705ms
Self CUDA time total: 13.278us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.359us 1219.84% 155.359us 155.359us 1
torch_eager 6.31% 109.593us 99.71% 1.733ms 1.733ms 0.000us 0.00% 14.944us 14.944us 1
aten::silu 2.48% 43.021us 88.93% 1.545ms 515.160us 6.560us 51.51% 8.768us 2.923us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.51% 6.560us 2.187us 3
aten::mul 1.62% 28.091us 2.66% 46.261us 15.420us 6.176us 48.49% 6.176us 2.059us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
Activity Buffer Request 74.70% 1.298ms 74.70% 1.298ms 1.298ms 2.208us 17.34% 2.208us 2.208us 1
aten::slice 1.46% 25.370us 1.82% 31.631us 5.272us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.36% 6.261us 0.36% 6.261us 1.043us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 12.80% 222.405us 12.80% 222.405us 37.068us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.29% 4.960us 0.29% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.738ms
Self CUDA time total: 12.736us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.122us 1152.94% 153.122us 153.122us 1
torch_eager 5.95% 108.905us 99.72% 1.827ms 1.827ms 0.000us 0.00% 15.585us 15.585us 1
aten::silu 2.26% 41.441us 89.57% 1.641ms 546.874us 6.816us 51.32% 9.120us 3.040us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 51.32% 6.816us 2.272us 3
aten::mul 1.45% 26.581us 2.47% 45.261us 15.087us 6.465us 48.68% 6.465us 2.155us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.465us 48.68% 6.465us 2.155us 3
Activity Buffer Request 78.54% 1.439ms 78.54% 1.439ms 1.439ms 2.304us 17.35% 2.304us 2.304us 1
aten::slice 1.41% 25.869us 1.74% 31.870us 5.312us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.33% 6.001us 0.33% 6.001us 1.000us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.78% 179.164us 9.78% 179.164us 29.861us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.28% 5.090us 0.28% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.832ms
Self CUDA time total: 13.281us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.877us 970.08% 150.877us 150.877us 1
torch_eager 20.61% 104.763us 99.03% 503.283us 503.283us 0.000us 0.00% 18.241us 18.241us 1
aten::silu 8.60% 43.701us 63.19% 321.148us 107.049us 7.969us 51.24% 10.657us 3.552us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.969us 51.24% 7.969us 2.656us 3
aten::mul 5.45% 27.720us 8.99% 45.690us 15.230us 7.584us 48.76% 7.584us 2.528us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.76% 7.584us 2.528us 3
Activity Buffer Request 24.24% 123.213us 24.24% 123.213us 123.213us 2.688us 17.28% 2.688us 2.688us 1
aten::slice 5.04% 25.603us 6.23% 31.682us 5.280us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 1.20% 6.079us 1.20% 6.079us 1.013us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 33.88% 172.204us 33.88% 172.204us 28.701us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.97% 4.940us 0.97% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 508.223us
Self CUDA time total: 15.553us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.541us 1089.44% 156.541us 156.541us 1
torch_eager 6.81% 125.673us 99.72% 1.840ms 1.840ms 0.000us 0.00% 16.866us 16.866us 1
aten::silu 2.28% 42.101us 88.57% 1.634ms 544.654us 7.361us 51.23% 9.858us 3.286us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 51.23% 7.361us 2.454us 3
aten::mul 1.53% 28.200us 2.53% 46.622us 15.541us 7.008us 48.77% 7.008us 2.336us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.77% 7.008us 2.336us 3
Activity Buffer Request 77.96% 1.438ms 77.96% 1.438ms 1.438ms 2.497us 17.38% 2.497us 2.497us 1
aten::slice 1.46% 26.979us 1.81% 33.310us 5.552us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.34% 6.331us 0.34% 6.331us 1.055us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.33% 172.076us 9.33% 172.076us 28.679us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.28% 5.210us 0.28% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.845ms
Self CUDA time total: 14.369us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.754us 962.92% 149.754us 149.754us 1
torch_eager 21.77% 106.163us 98.85% 481.952us 481.952us 0.000us 0.00% 18.240us 18.240us 1
aten::silu 8.65% 42.151us 61.90% 301.788us 100.596us 7.968us 51.23% 10.656us 3.552us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.23% 7.968us 2.656us 3
aten::mul 5.09% 24.801us 8.77% 42.752us 14.251us 7.584us 48.77% 7.584us 2.528us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.77% 7.584us 2.528us 3
Activity Buffer Request 21.73% 105.953us 21.73% 105.953us 105.953us 2.688us 17.28% 2.688us 2.688us 1
aten::slice 5.14% 25.050us 6.41% 31.249us 5.208us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 1.27% 6.199us 1.27% 6.199us 1.033us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 35.20% 171.635us 35.20% 171.635us 28.606us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 1.15% 5.600us 1.15% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 487.552us
Self CUDA time total: 15.552us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 187.357us 834.00% 187.357us 187.357us 1
torch_eager 6.93% 128.860us 99.74% 1.856ms 1.856ms 0.000us 0.00% 26.369us 26.369us 1
aten::silu 2.32% 43.123us 88.23% 1.642ms 547.175us 11.616us 51.71% 15.520us 5.173us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.616us 51.71% 11.616us 3.872us 3
aten::mul 1.63% 30.312us 2.74% 50.922us 16.974us 10.849us 48.29% 10.849us 3.616us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.849us 48.29% 10.849us 3.616us 3
Activity Buffer Request 77.79% 1.447ms 77.79% 1.447ms 1.447ms 3.904us 17.38% 3.904us 3.904us 1
aten::slice 1.49% 27.691us 1.84% 34.251us 5.708us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.35% 6.560us 0.35% 6.560us 1.093us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.23% 171.734us 9.23% 171.734us 28.622us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.26% 4.930us 0.26% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.860ms
Self CUDA time total: 22.465us
impl wl p50(ms) ok
torch_eager cuda_T128_D1024 0.05 True
torch_eager cuda_T128_D2048 0.05 True
torch_eager cuda_T128_D768 0.04 True
torch_eager cuda_T256_D1024 0.05 True
torch_eager cuda_T256_D2048 0.05 True
torch_eager cuda_T256_D768 0.05 True
torch_eager cuda_T512_D1024 0.05 True
torch_eager cuda_T512_D2048 0.05 True
torch_eager cuda_T512_D768 0.05 True