Running rotary benchmark on cuda with 24 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.037ms 1163.70% 1.037ms 1.037ms 1
torch_eager 14.48% 388.465us 99.71% 2.675ms 2.675ms 0.000us 0.00% 90.368us 90.368us 1
aten::mul 6.24% 167.371us 10.81% 289.974us 12.082us 46.850us 52.55% 46.850us 1.952us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.850us 52.55% 46.850us 1.952us 24
aten::copy_ 3.95% 106.042us 62.52% 1.677ms 93.189us 29.055us 32.59% 30.271us 1.682us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.431us 25.16% 22.431us 1.869us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.247us 14.86% 13.247us 1.104us 12
aten::clone 1.33% 35.811us 60.70% 1.628ms 271.409us 0.000us 0.00% 7.840us 1.307us 6
aten::sub 1.70% 45.710us 2.68% 71.932us 11.989us 6.688us 7.50% 6.688us 1.115us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 7.43% 6.624us 1.104us 6
aten::add 1.31% 35.129us 2.15% 57.710us 9.618us 6.559us 7.36% 6.559us 1.093us 6
Activity Buffer Request 53.28% 1.429ms 53.28% 1.429ms 1.429ms 1.216us 1.36% 1.216us 1.216us 1
aten::empty_strided 2.07% 55.651us 2.07% 55.651us 9.275us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 2.67% 71.682us 2.67% 71.682us 11.947us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.88% 77.398us 3.66% 98.099us 4.087us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.77% 20.701us 0.77% 20.701us 0.863us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 9.01% 241.667us 9.01% 241.667us 5.035us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.29% 7.810us 0.29% 7.810us 7.810us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.683ms
Self CUDA time total: 89.152us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 974.701us 1078.26% 974.701us 974.701us 1
torch_eager 13.04% 331.863us 99.80% 2.539ms 2.539ms 0.000us 0.00% 91.516us 91.516us 1
aten::mul 6.08% 154.764us 10.71% 272.436us 11.351us 47.740us 52.81% 47.740us 1.989us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.740us 52.81% 47.740us 1.989us 24
aten::copy_ 4.22% 107.278us 65.67% 1.671ms 92.831us 29.344us 32.46% 30.464us 1.692us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.528us 24.92% 22.528us 1.877us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.312us 14.73% 13.312us 1.109us 12
aten::clone 1.12% 28.494us 62.70% 1.595ms 265.883us 0.000us 0.00% 7.936us 1.323us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 7.54% 6.816us 1.136us 6
aten::sub 1.50% 38.242us 2.49% 63.402us 10.567us 6.688us 7.40% 6.688us 1.115us 6
aten::add 1.20% 30.490us 2.06% 52.342us 8.724us 6.624us 7.33% 6.624us 1.104us 6
Activity Buffer Request 56.69% 1.442ms 56.69% 1.442ms 1.442ms 1.120us 1.24% 1.120us 1.120us 1
aten::empty_strided 1.23% 31.410us 1.23% 31.410us 5.235us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 2.23% 56.711us 2.23% 56.711us 9.452us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.71% 68.925us 3.47% 88.365us 3.682us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.76% 19.440us 0.76% 19.440us 0.810us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 9.01% 229.327us 9.01% 229.327us 4.778us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.20% 5.130us 0.20% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.544ms
Self CUDA time total: 90.396us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 992.026us 1054.84% 992.026us 992.026us 1
torch_eager 13.38% 342.168us 99.79% 2.552ms 2.552ms 0.000us 0.00% 95.357us 95.357us 1
aten::mul 6.15% 157.234us 10.75% 274.750us 11.448us 48.894us 51.99% 48.894us 2.037us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.894us 51.99% 48.894us 2.037us 24
aten::copy_ 4.01% 102.532us 65.33% 1.670ms 92.800us 30.817us 32.77% 32.129us 1.785us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.977us 24.43% 22.977us 1.915us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.334us 15.24% 14.334us 1.194us 12
aten::clone 1.05% 26.950us 62.23% 1.591ms 265.191us 0.000us 0.00% 9.152us 1.525us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.840us 8.34% 7.840us 1.307us 6
aten::sub 1.50% 38.270us 2.44% 62.460us 10.410us 7.198us 7.65% 7.198us 1.200us 6
aten::add 1.23% 31.400us 2.10% 53.770us 8.962us 7.136us 7.59% 7.136us 1.189us 6
Activity Buffer Request 56.41% 1.442ms 56.41% 1.442ms 1.442ms 1.312us 1.40% 1.312us 1.312us 1
aten::empty_strided 1.23% 31.530us 1.23% 31.530us 5.255us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 2.22% 56.682us 2.22% 56.682us 9.447us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.75% 70.221us 3.50% 89.542us 3.731us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.76% 19.321us 0.76% 19.321us 0.805us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 9.11% 232.827us 9.11% 232.827us 4.851us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.21% 5.280us 0.21% 5.280us 5.280us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.557ms
Self CUDA time total: 94.045us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 963.417us 953.96% 963.417us 963.417us 1
torch_eager 11.52% 317.176us 99.82% 2.749ms 2.749ms 0.000us 0.00% 102.303us 102.303us 1
aten::mul 5.45% 150.206us 9.82% 270.557us 11.273us 52.736us 52.22% 52.736us 2.197us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.736us 52.22% 52.736us 2.197us 24
aten::copy_ 3.72% 102.545us 68.70% 1.892ms 105.120us 32.255us 31.94% 33.567us 1.865us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.511us 24.27% 24.511us 2.043us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.000us 15.84% 16.000us 1.333us 12
aten::clone 1.08% 29.720us 65.99% 1.817ms 302.902us 0.000us 0.00% 9.056us 1.509us 6
aten::sub 1.33% 36.580us 2.29% 63.082us 10.514us 8.000us 7.92% 8.000us 1.333us 6
aten::add 1.19% 32.640us 2.06% 56.790us 9.465us 8.000us 7.92% 8.000us 1.333us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.67% 7.744us 1.291us 6
Activity Buffer Request 52.30% 1.440ms 52.30% 1.440ms 1.440ms 1.312us 1.30% 1.312us 1.312us 1
aten::empty_strided 1.15% 31.721us 1.15% 31.721us 5.287us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 10.21% 281.246us 10.21% 281.246us 46.874us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.50% 68.838us 3.19% 87.951us 3.665us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.69% 19.113us 0.69% 19.113us 0.796us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.68% 239.024us 8.68% 239.024us 4.980us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.18% 4.960us 0.18% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.754ms
Self CUDA time total: 100.991us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 994.264us 1059.01% 994.264us 994.264us 1
torch_eager 12.10% 336.594us 99.83% 2.776ms 2.776ms 0.000us 0.00% 95.197us 95.197us 1
aten::mul 5.53% 153.843us 9.85% 273.965us 11.415us 48.927us 52.11% 48.927us 2.039us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.927us 52.11% 48.927us 2.039us 24
aten::copy_ 3.84% 106.831us 68.28% 1.899ms 105.502us 30.784us 32.79% 32.095us 1.783us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.911us 24.40% 22.911us 1.909us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.175us 15.10% 14.175us 1.181us 12
aten::clone 0.99% 27.653us 65.29% 1.816ms 302.643us 0.000us 0.00% 9.184us 1.531us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.873us 8.39% 7.873us 1.312us 6
aten::add 1.11% 30.890us 1.95% 54.150us 9.025us 7.103us 7.57% 7.103us 1.184us 6
aten::sub 1.31% 36.550us 2.21% 61.372us 10.229us 7.072us 7.53% 7.072us 1.179us 6
Activity Buffer Request 52.41% 1.458ms 52.41% 1.458ms 1.458ms 1.311us 1.40% 1.311us 1.311us 1
aten::empty_strided 1.15% 31.950us 1.15% 31.950us 5.325us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 9.52% 264.666us 9.52% 264.666us 44.111us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.56% 71.249us 3.30% 91.758us 3.823us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.74% 20.509us 0.74% 20.509us 0.855us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.56% 238.154us 8.56% 238.154us 4.962us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.17% 4.831us 0.17% 4.831us 4.831us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.781ms
Self CUDA time total: 93.886us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 947.484us 938.47% 947.484us 947.484us 1
torch_eager 10.88% 292.632us 99.82% 2.684ms 2.684ms 0.000us 0.00% 102.274us 102.274us 1
aten::mul 5.59% 150.412us 9.99% 268.638us 11.193us 52.575us 52.07% 52.575us 2.191us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.575us 52.07% 52.575us 2.191us 24
aten::copy_ 3.76% 101.124us 69.31% 1.864ms 103.538us 32.417us 32.11% 33.730us 1.874us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.640us 24.41% 24.640us 2.053us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.969us 15.82% 15.969us 1.331us 12
aten::clone 0.88% 23.678us 66.40% 1.785ms 297.581us 0.000us 0.00% 9.090us 1.515us 6
aten::add 1.17% 31.492us 2.09% 56.082us 9.347us 8.001us 7.92% 8.001us 1.333us 6
aten::sub 1.33% 35.751us 2.27% 61.172us 10.195us 7.968us 7.89% 7.968us 1.328us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.777us 7.70% 7.777us 1.296us 6
Activity Buffer Request 53.61% 1.442ms 53.61% 1.442ms 1.442ms 1.313us 1.30% 1.313us 1.313us 1
aten::empty_strided 1.16% 31.231us 1.16% 31.231us 5.205us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 9.49% 255.066us 9.49% 255.066us 42.511us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.55% 68.470us 3.23% 86.863us 3.619us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.68% 18.393us 0.68% 18.393us 0.766us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.71% 234.118us 8.71% 234.118us 4.877us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.18% 4.960us 0.18% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.689ms
Self CUDA time total: 100.961us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.015ms 841.96% 1.015ms 1.015ms 1
torch_eager 12.11% 330.713us 99.82% 2.726ms 2.726ms 0.000us 0.00% 122.270us 122.270us 1
aten::mul 5.81% 158.614us 10.12% 276.274us 11.511us 62.015us 51.46% 62.015us 2.584us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.015us 51.46% 62.015us 2.584us 24
aten::copy_ 3.83% 104.612us 67.54% 1.845ms 102.474us 39.328us 32.63% 41.088us 2.283us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.832us 23.92% 28.832us 2.403us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.167us 15.90% 19.167us 1.597us 12
aten::clone 0.82% 22.270us 64.60% 1.764ms 294.026us 0.000us 0.00% 12.256us 2.043us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.496us 8.71% 10.496us 1.749us 6
aten::add 1.30% 35.623us 2.23% 60.872us 10.145us 9.600us 7.97% 9.600us 1.600us 6
aten::sub 1.39% 37.930us 2.30% 62.752us 10.459us 9.567us 7.94% 9.567us 1.594us 6
Activity Buffer Request 51.93% 1.418ms 51.93% 1.418ms 1.418ms 1.760us 1.46% 1.760us 1.760us 1
aten::empty_strided 1.39% 37.931us 1.39% 37.931us 6.322us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 9.20% 251.364us 9.20% 251.364us 41.894us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.57% 70.176us 3.31% 90.509us 3.771us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.74% 20.333us 0.74% 20.333us 0.847us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.72% 238.202us 8.72% 238.202us 4.963us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.18% 4.991us 0.18% 4.991us 4.991us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.731ms
Self CUDA time total: 120.510us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 969.687us 565.36% 969.687us 969.687us 1
torch_eager 11.93% 323.252us 99.82% 2.704ms 2.704ms 0.000us 0.00% 174.431us 174.431us 1
aten::mul 5.73% 155.191us 10.09% 273.452us 11.394us 89.149us 51.98% 89.149us 3.715us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.149us 51.98% 89.149us 3.715us 24
aten::copy_ 3.81% 103.212us 67.97% 1.841ms 102.304us 57.504us 33.53% 60.417us 3.357us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.448us 23.58% 40.448us 3.371us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.865us 14.50% 24.865us 2.072us 12
aten::clone 1.01% 27.391us 65.07% 1.763ms 293.813us 0.000us 0.00% 19.969us 3.328us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 9.94% 17.056us 2.843us 6
aten::sub 1.36% 36.973us 2.33% 63.083us 10.514us 12.448us 7.26% 12.448us 2.075us 6
aten::add 1.19% 32.138us 2.00% 54.180us 9.030us 12.417us 7.24% 12.417us 2.069us 6
Activity Buffer Request 52.59% 1.425ms 52.59% 1.425ms 1.425ms 2.913us 1.70% 2.913us 2.913us 1
aten::empty_strided 1.13% 30.731us 1.13% 30.731us 5.122us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 9.09% 246.234us 9.09% 246.234us 41.039us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.62% 70.850us 3.34% 90.602us 3.775us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.73% 19.752us 0.73% 19.752us 0.823us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.62% 233.633us 8.62% 233.633us 4.867us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.18% 4.920us 0.18% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.709ms
Self CUDA time total: 171.518us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 954.052us 791.00% 954.052us 954.052us 1
torch_eager 19.97% 292.412us 99.63% 1.459ms 1.459ms 0.000us 0.00% 122.437us 122.437us 1
aten::mul 10.56% 154.645us 18.69% 273.576us 11.399us 62.020us 51.42% 62.020us 2.584us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.020us 51.42% 62.020us 2.584us 24
aten::copy_ 7.03% 103.000us 43.41% 635.575us 35.310us 39.424us 32.69% 41.248us 2.292us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.800us 23.88% 28.800us 2.400us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.169us 15.89% 19.169us 1.597us 12
aten::clone 1.44% 21.120us 37.54% 549.571us 91.595us 0.000us 0.00% 12.448us 2.075us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.624us 8.81% 10.624us 1.771us 6
aten::add 2.24% 32.821us 3.84% 56.284us 9.381us 9.600us 7.96% 9.600us 1.600us 6
aten::sub 2.67% 39.093us 4.44% 64.973us 10.829us 9.569us 7.93% 9.569us 1.595us 6
Activity Buffer Request 15.36% 224.935us 15.36% 224.935us 224.935us 1.824us 1.51% 1.824us 1.824us 1
aten::empty_strided 1.99% 29.111us 1.99% 29.111us 4.852us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 16.48% 241.265us 16.48% 241.265us 40.211us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.62% 67.580us 5.85% 85.721us 3.572us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.24% 18.141us 1.24% 18.141us 0.756us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.03% 234.649us 16.03% 234.649us 4.889us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.37% 5.351us 0.37% 5.351us 5.351us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.464ms
Self CUDA time total: 120.613us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 956.920us 558.23% 956.920us 956.920us 1
torch_eager 19.50% 289.238us 99.68% 1.478ms 1.478ms 0.000us 0.00% 174.235us 174.235us 1
aten::mul 10.48% 155.363us 18.66% 276.703us 11.529us 89.180us 52.02% 89.180us 3.716us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.180us 52.02% 89.180us 3.716us 24
aten::copy_ 6.89% 102.110us 44.09% 653.841us 36.324us 57.375us 33.47% 60.191us 3.344us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.415us 23.58% 40.415us 3.368us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.864us 14.50% 24.864us 2.072us 12
aten::clone 1.47% 21.742us 38.41% 569.623us 94.937us 0.000us 0.00% 19.776us 3.296us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 9.89% 16.960us 2.827us 6
aten::add 2.10% 31.093us 3.60% 53.332us 8.889us 12.512us 7.30% 12.512us 2.085us 6
aten::sub 2.55% 37.851us 4.17% 61.831us 10.305us 12.352us 7.21% 12.352us 2.059us 6
Activity Buffer Request 16.56% 245.575us 16.56% 245.575us 245.575us 2.816us 1.64% 2.816us 2.816us 1
aten::empty_strided 2.00% 29.651us 2.00% 29.651us 4.942us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 16.13% 239.165us 16.13% 239.165us 39.861us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.82% 71.554us 6.20% 91.934us 3.831us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.37% 20.380us 1.37% 20.380us 0.849us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 15.82% 234.550us 15.82% 234.550us 4.886us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.32% 4.730us 0.32% 4.730us 4.730us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.483ms
Self CUDA time total: 171.419us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 977.978us 346.82% 977.978us 977.978us 1
torch_eager 11.12% 340.956us 99.84% 3.061ms 3.061ms 0.000us 0.00% 300.126us 300.126us 1
aten::mul 4.97% 152.432us 8.78% 269.242us 11.218us 132.256us 46.90% 132.256us 5.511us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 132.256us 46.90% 132.256us 5.511us 24
aten::copy_ 3.32% 101.920us 71.12% 2.181ms 121.149us 108.702us 38.55% 126.846us 7.047us 18
aten::clone 0.96% 29.312us 68.68% 2.106ms 350.996us 0.000us 0.00% 69.855us 11.642us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 56.991us 20.21% 56.991us 4.749us 12
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.711us 18.34% 51.711us 8.618us 6
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.024us 14.55% 41.024us 3.419us 12
aten::sub 1.25% 38.245us 2.06% 63.315us 10.553us 20.608us 7.31% 20.608us 3.435us 6
aten::add 1.02% 31.345us 1.82% 55.786us 9.298us 20.416us 7.24% 20.416us 3.403us 6
Activity Buffer Request 57.69% 1.769ms 57.69% 1.769ms 1.769ms 18.144us 6.43% 18.144us 18.144us 1
aten::empty_strided 1.06% 32.360us 1.06% 32.360us 5.393us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 7.88% 241.465us 7.88% 241.465us 40.244us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.31% 70.749us 2.93% 89.730us 3.739us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.62% 18.981us 0.62% 18.981us 0.791us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.66% 234.833us 7.66% 234.833us 4.892us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.16% 4.769us 0.16% 4.769us 4.769us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.066ms
Self CUDA time total: 281.982us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 965.746us 171.28% 965.746us 965.746us 1
torch_eager 20.15% 293.418us 99.65% 1.451ms 1.451ms 0.000us 0.00% 587.545us 587.545us 1
aten::copy_ 6.98% 101.683us 42.57% 619.773us 34.432us 272.605us 48.35% 296.317us 16.462us 18
aten::mul 10.89% 158.509us 19.23% 280.051us 11.669us 225.082us 39.92% 225.082us 9.378us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 225.082us 39.92% 225.082us 9.378us 24
aten::clone 1.41% 20.520us 36.78% 535.511us 89.252us 0.000us 0.00% 206.046us 34.341us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 182.334us 32.34% 182.334us 30.389us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.271us 16.01% 90.271us 7.523us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 66.146us 11.73% 66.146us 5.512us 12
aten::sub 2.75% 40.021us 4.44% 64.623us 10.771us 33.857us 6.00% 33.857us 5.643us 6
aten::add 2.25% 32.703us 3.82% 55.604us 9.267us 32.289us 5.73% 32.289us 5.381us 6
Activity Buffer Request 15.08% 219.615us 15.08% 219.615us 219.615us 23.712us 4.21% 23.712us 23.712us 1
aten::empty_strided 2.09% 30.380us 2.09% 30.380us 5.063us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 16.00% 233.025us 16.00% 233.025us 38.838us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.65% 67.660us 5.95% 86.582us 3.608us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.30% 18.922us 1.30% 18.922us 0.788us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.11% 234.495us 16.11% 234.495us 4.885us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.35% 5.030us 0.35% 5.030us 5.030us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.456ms
Self CUDA time total: 563.833us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 970.000us 1049.25% 970.000us 970.000us 1
torch_eager 21.04% 308.715us 99.66% 1.462ms 1.462ms 0.000us 0.00% 93.567us 93.567us 1
aten::mul 10.47% 153.593us 18.60% 272.905us 11.371us 49.631us 53.69% 49.631us 2.068us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.631us 53.69% 49.631us 2.068us 24
aten::copy_ 7.22% 105.943us 42.60% 624.955us 34.720us 29.345us 31.74% 30.465us 1.693us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.592us 24.44% 22.592us 1.883us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.471us 14.57% 13.471us 1.123us 12
aten::clone 1.54% 22.631us 36.99% 542.672us 90.445us 0.000us 0.00% 7.873us 1.312us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.753us 7.30% 6.753us 1.126us 6
aten::sub 2.47% 36.281us 4.16% 61.001us 10.167us 6.751us 7.30% 6.751us 1.125us 6
aten::add 2.12% 31.122us 3.62% 53.173us 8.862us 6.720us 7.27% 6.720us 1.120us 6
Activity Buffer Request 15.54% 227.975us 15.54% 227.975us 227.975us 1.120us 1.21% 1.120us 1.120us 1
aten::empty_strided 2.05% 30.140us 2.05% 30.140us 5.023us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 15.29% 224.265us 15.29% 224.265us 37.378us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.74% 69.541us 6.04% 88.642us 3.693us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.30% 19.101us 1.30% 19.101us 0.796us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 15.87% 232.855us 15.87% 232.855us 4.851us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.34% 4.941us 0.34% 4.941us 4.941us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.467ms
Self CUDA time total: 92.447us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 961.400us 998.83% 961.400us 961.400us 1
torch_eager 11.86% 316.997us 99.82% 2.667ms 2.667ms 0.000us 0.00% 97.565us 97.565us 1
aten::mul 5.68% 151.840us 10.03% 267.904us 11.163us 51.071us 53.06% 51.071us 2.128us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.071us 53.06% 51.071us 2.128us 24
aten::copy_ 3.83% 102.366us 67.99% 1.817ms 100.926us 30.911us 32.11% 32.223us 1.790us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.040us 23.94% 23.040us 1.920us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.271us 14.83% 14.271us 1.189us 12
aten::clone 1.08% 28.789us 65.14% 1.741ms 290.113us 0.000us 0.00% 9.183us 1.530us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.871us 8.18% 7.871us 1.312us 6
aten::add 1.17% 31.182us 2.13% 57.023us 9.504us 7.136us 7.41% 7.136us 1.189us 6
aten::sub 1.39% 37.021us 2.28% 60.881us 10.147us 7.135us 7.41% 7.135us 1.189us 6
Activity Buffer Request 53.27% 1.423ms 53.27% 1.423ms 1.423ms 1.312us 1.36% 1.312us 1.312us 1
aten::empty_strided 1.17% 31.390us 1.17% 31.390us 5.232us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 8.40% 224.384us 8.40% 224.384us 37.397us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.55% 68.065us 3.28% 87.603us 3.650us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.73% 19.538us 0.73% 19.538us 0.814us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.69% 232.215us 8.69% 232.215us 4.838us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.18% 4.791us 0.18% 4.791us 4.791us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.672ms
Self CUDA time total: 96.253us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 938.779us 906.86% 938.779us 938.779us 1
torch_eager 20.95% 294.336us 99.65% 1.400ms 1.400ms 0.000us 0.00% 104.832us 104.832us 1
aten::mul 10.92% 153.493us 19.14% 268.855us 11.202us 55.265us 53.39% 55.265us 2.303us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.265us 53.39% 55.265us 2.303us 24
aten::copy_ 7.29% 102.391us 41.81% 587.481us 32.638us 32.287us 31.19% 33.599us 1.867us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.544us 23.71% 24.544us 2.045us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.968us 15.43% 15.968us 1.331us 12
aten::clone 1.48% 20.724us 35.96% 505.273us 84.212us 0.000us 0.00% 9.055us 1.509us 6
aten::sub 2.56% 35.921us 4.48% 63.011us 10.502us 8.000us 7.73% 8.000us 1.333us 6
aten::add 2.24% 31.440us 3.80% 53.431us 8.905us 7.968us 7.70% 7.968us 1.328us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.743us 7.48% 7.743us 1.290us 6
Activity Buffer Request 14.48% 203.474us 14.48% 203.474us 203.474us 1.312us 1.27% 1.312us 1.312us 1
aten::empty_strided 2.08% 29.281us 2.08% 29.281us 4.880us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 15.64% 219.755us 15.64% 219.755us 36.626us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.61% 64.735us 5.90% 82.941us 3.456us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.30% 18.206us 1.30% 18.206us 0.759us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.11% 226.304us 16.11% 226.304us 4.715us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.35% 4.920us 0.35% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.405ms
Self CUDA time total: 103.520us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 946.116us 766.76% 946.116us 946.116us 1
torch_eager 20.39% 290.555us 99.66% 1.420ms 1.420ms 0.000us 0.00% 125.184us 125.184us 1
aten::mul 10.89% 155.196us 19.03% 271.116us 11.296us 64.930us 52.62% 64.930us 2.705us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 64.930us 52.62% 64.930us 2.705us 24
aten::copy_ 7.20% 102.573us 42.57% 606.535us 33.696us 39.295us 31.85% 41.087us 2.283us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.928us 23.44% 28.928us 2.411us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.167us 15.53% 19.167us 1.597us 12
aten::clone 1.46% 20.780us 36.49% 519.930us 86.655us 0.000us 0.00% 12.159us 2.026us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.367us 8.40% 10.367us 1.728us 6
aten::add 2.22% 31.661us 3.85% 54.881us 9.147us 9.632us 7.81% 9.632us 1.605us 6
aten::sub 2.54% 36.222us 4.30% 61.232us 10.205us 9.535us 7.73% 9.535us 1.589us 6
Activity Buffer Request 15.30% 218.045us 15.30% 218.045us 218.045us 1.792us 1.45% 1.792us 1.792us 1
aten::empty_strided 2.05% 29.230us 2.05% 29.230us 4.872us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 15.35% 218.676us 15.35% 218.676us 36.446us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.69% 66.771us 6.02% 85.802us 3.575us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.34% 19.031us 1.34% 19.031us 0.793us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.24% 231.391us 16.24% 231.391us 4.821us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.34% 4.790us 0.34% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.425ms
Self CUDA time total: 123.392us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 984.379us 951.82% 984.379us 984.379us 1
torch_eager 12.07% 328.136us 99.82% 2.714ms 2.714ms 0.000us 0.00% 104.765us 104.765us 1
aten::mul 5.81% 158.021us 10.21% 277.512us 11.563us 55.167us 53.34% 55.167us 2.299us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.167us 53.34% 55.167us 2.299us 24
aten::copy_ 3.85% 104.771us 67.79% 1.843ms 102.400us 32.352us 31.28% 33.696us 1.872us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.576us 23.76% 24.576us 2.048us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.902us 15.38% 15.902us 1.325us 12
aten::clone 1.05% 28.482us 64.81% 1.762ms 293.686us 0.000us 0.00% 9.120us 1.520us 6
aten::add 1.18% 32.072us 2.05% 55.622us 9.270us 7.966us 7.70% 7.966us 1.328us 6
aten::sub 1.34% 36.429us 2.30% 62.454us 10.409us 7.936us 7.67% 7.936us 1.323us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 7.52% 7.776us 1.296us 6
Activity Buffer Request 53.37% 1.451ms 53.37% 1.451ms 1.451ms 1.344us 1.30% 1.344us 1.344us 1
aten::empty_strided 1.13% 30.791us 1.13% 30.791us 5.132us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 8.01% 217.895us 8.01% 217.895us 36.316us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.53% 68.900us 3.23% 87.945us 3.664us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.70% 19.045us 0.70% 19.045us 0.794us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.78% 238.656us 8.78% 238.656us 4.972us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.18% 4.760us 0.18% 4.760us 4.760us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.719ms
Self CUDA time total: 103.421us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 975.859us 788.60% 975.859us 975.859us 1
torch_eager 11.99% 325.892us 99.81% 2.713ms 2.713ms 0.000us 0.00% 125.537us 125.537us 1
aten::mul 5.63% 152.991us 10.00% 271.842us 11.327us 65.056us 52.57% 65.056us 2.711us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.056us 52.57% 65.056us 2.711us 24
aten::copy_ 3.75% 101.941us 67.93% 1.846ms 102.570us 39.393us 31.83% 41.185us 2.288us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.961us 23.40% 28.961us 2.413us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.296us 15.59% 19.296us 1.608us 12
aten::clone 1.11% 30.152us 65.06% 1.768ms 294.730us 0.000us 0.00% 12.224us 2.037us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 8.43% 10.432us 1.739us 6
aten::sub 1.31% 35.640us 2.30% 62.642us 10.440us 9.696us 7.84% 9.696us 1.616us 6
aten::add 1.19% 32.290us 2.11% 57.400us 9.567us 9.600us 7.76% 9.600us 1.600us 6
Activity Buffer Request 53.71% 1.460ms 53.71% 1.460ms 1.460ms 1.792us 1.45% 1.792us 1.792us 1
aten::empty_strided 1.14% 30.851us 1.14% 30.851us 5.142us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 7.91% 214.935us 7.91% 214.935us 35.822us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.54% 69.161us 3.23% 87.912us 3.663us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.69% 18.751us 0.69% 18.751us 0.781us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.85% 240.634us 8.85% 240.634us 5.013us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.19% 5.031us 0.19% 5.031us 5.031us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.718ms
Self CUDA time total: 123.745us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 974.260us 552.15% 974.260us 974.260us 1
torch_eager 19.78% 293.688us 99.66% 1.480ms 1.480ms 0.000us 0.00% 179.361us 179.361us 1
aten::mul 10.71% 158.995us 19.78% 293.648us 12.235us 94.434us 53.52% 94.434us 3.935us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.434us 53.52% 94.434us 3.935us 24
aten::copy_ 6.79% 100.834us 42.92% 637.126us 35.396us 57.375us 32.52% 60.287us 3.349us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.416us 22.91% 40.416us 3.368us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.640us 13.96% 24.640us 2.053us 12
aten::clone 1.76% 26.199us 37.53% 557.122us 92.854us 0.000us 0.00% 19.871us 3.312us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.959us 9.61% 16.959us 2.826us 6
aten::sub 2.42% 35.930us 4.08% 60.590us 10.098us 12.320us 6.98% 12.320us 2.053us 6
aten::add 2.11% 31.302us 3.66% 54.401us 9.067us 12.320us 6.98% 12.320us 2.053us 6
Activity Buffer Request 16.99% 252.166us 16.99% 252.166us 252.166us 2.912us 1.65% 2.912us 2.912us 1
aten::empty_strided 2.00% 29.691us 2.00% 29.691us 4.948us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 14.50% 215.285us 14.50% 215.285us 35.881us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.45% 66.098us 5.67% 84.159us 3.507us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.22% 18.061us 1.22% 18.061us 0.753us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.92% 251.253us 16.92% 251.253us 5.234us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.34% 5.020us 0.34% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.485ms
Self CUDA time total: 176.449us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 953.139us 322.89% 953.139us 953.139us 1
torch_eager 20.45% 288.223us 99.65% 1.404ms 1.404ms 0.000us 0.00% 312.341us 312.341us 1
aten::mul 10.90% 153.585us 19.25% 271.218us 11.301us 144.345us 48.90% 144.345us 6.014us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 144.345us 48.90% 144.345us 6.014us 24
aten::copy_ 7.18% 101.222us 41.60% 586.173us 32.565us 110.174us 37.32% 127.326us 7.074us 18
aten::clone 1.48% 20.790us 35.41% 498.991us 83.165us 0.000us 0.00% 70.207us 11.701us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.119us 19.35% 57.119us 4.760us 12
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.055us 17.97% 53.055us 8.843us 6
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.670us 13.78% 40.670us 3.389us 12
aten::sub 2.69% 37.950us 4.51% 63.611us 10.602us 20.448us 6.93% 20.448us 3.408us 6
aten::add 2.21% 31.201us 3.90% 54.891us 9.149us 20.222us 6.85% 20.222us 3.370us 6
Activity Buffer Request 14.48% 203.984us 14.48% 203.984us 203.984us 17.152us 5.81% 17.152us 17.152us 1
aten::empty_strided 2.20% 31.071us 2.20% 31.071us 5.179us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 15.00% 211.404us 15.00% 211.404us 35.234us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.92% 69.344us 6.26% 88.243us 3.677us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.34% 18.899us 1.34% 18.899us 0.787us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.79% 236.547us 16.79% 236.547us 4.928us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.35% 4.990us 0.35% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.409ms
Self CUDA time total: 295.189us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 955.061us 540.29% 955.061us 955.061us 1
torch_eager 20.13% 285.326us 99.66% 1.412ms 1.412ms 0.000us 0.00% 179.647us 179.647us 1
aten::mul 11.53% 163.362us 19.81% 280.694us 11.696us 94.558us 53.49% 94.558us 3.940us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.558us 53.49% 94.558us 3.940us 24
aten::copy_ 7.22% 102.272us 41.72% 591.162us 32.842us 57.633us 32.60% 60.513us 3.362us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.610us 22.97% 40.610us 3.384us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.576us 13.90% 24.576us 2.048us 12
aten::clone 1.52% 21.581us 35.73% 506.321us 84.387us 0.000us 0.00% 19.903us 3.317us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.023us 9.63% 17.023us 2.837us 6
aten::add 2.20% 31.170us 3.74% 52.991us 8.832us 12.352us 6.99% 12.352us 2.059us 6
aten::sub 2.66% 37.720us 4.39% 62.161us 10.360us 12.224us 6.92% 12.224us 2.037us 6
Activity Buffer Request 14.91% 211.305us 14.91% 211.305us 211.305us 2.880us 1.63% 2.880us 2.880us 1
aten::empty_strided 2.11% 29.970us 2.11% 29.970us 4.995us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 14.80% 209.714us 14.80% 209.714us 34.952us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.81% 68.154us 6.24% 88.396us 3.683us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.43% 20.242us 1.43% 20.242us 0.843us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.33% 231.465us 16.33% 231.465us 4.822us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.34% 4.860us 0.34% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.417ms
Self CUDA time total: 176.767us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 947.102us 319.90% 947.102us 947.102us 1
torch_eager 20.76% 285.746us 99.63% 1.371ms 1.371ms 0.000us 0.00% 313.885us 313.885us 1
aten::mul 11.26% 155.004us 19.90% 273.893us 11.412us 144.735us 48.89% 144.735us 6.031us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 144.735us 48.89% 144.735us 6.031us 24
aten::copy_ 7.73% 106.340us 40.54% 558.012us 31.001us 110.624us 37.37% 128.447us 7.136us 18
aten::clone 1.60% 22.060us 34.25% 471.499us 78.583us 0.000us 0.00% 71.454us 11.909us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 56.993us 19.25% 56.993us 4.749us 12
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.631us 18.11% 53.631us 8.939us 6
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.703us 13.75% 40.703us 3.392us 12
aten::sub 2.65% 36.432us 4.41% 60.743us 10.124us 20.447us 6.91% 20.447us 3.408us 6
aten::add 2.33% 32.010us 4.20% 57.842us 9.640us 20.256us 6.84% 20.256us 3.376us 6
Activity Buffer Request 13.03% 179.384us 13.03% 179.384us 179.384us 17.823us 6.02% 17.823us 17.823us 1
aten::empty_strided 2.15% 29.560us 2.15% 29.560us 4.927us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 14.91% 205.294us 14.91% 205.294us 34.216us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.76% 65.565us 6.07% 83.544us 3.481us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.31% 17.979us 1.31% 17.979us 0.749us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 17.15% 236.026us 17.15% 236.026us 4.917us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.37% 5.100us 0.37% 5.100us 5.100us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.377ms
Self CUDA time total: 296.062us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.189ms 203.30% 1.189ms 1.189ms 1
torch_eager 20.67% 348.290us 99.66% 1.679ms 1.679ms 0.000us 0.00% 608.543us 608.543us 1
aten::copy_ 7.05% 118.714us 39.22% 660.725us 36.707us 268.894us 45.98% 292.638us 16.258us 18
aten::mul 12.17% 204.984us 20.89% 351.955us 14.665us 249.922us 42.74% 249.922us 10.413us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 249.922us 42.74% 249.922us 10.413us 24
aten::clone 1.51% 25.362us 33.77% 568.912us 94.819us 0.000us 0.00% 201.823us 33.637us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 178.079us 30.45% 178.079us 29.680us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.815us 15.53% 90.815us 7.568us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.983us 11.28% 65.983us 5.499us 12
aten::sub 2.59% 43.649us 4.45% 75.001us 12.500us 33.056us 5.65% 33.056us 5.509us 6
aten::add 2.76% 46.482us 4.73% 79.603us 13.267us 32.927us 5.63% 32.927us 5.488us 6
Activity Buffer Request 13.27% 223.575us 13.27% 223.575us 223.575us 23.744us 4.06% 23.744us 23.744us 1
aten::empty_strided 2.16% 36.470us 2.16% 36.470us 6.078us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 14.48% 243.975us 14.48% 243.975us 40.662us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.76% 80.109us 6.03% 101.610us 4.234us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.28% 21.501us 1.28% 21.501us 0.896us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.97% 285.905us 16.97% 285.905us 5.956us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.34% 5.680us 0.34% 5.680us 5.680us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.685ms
Self CUDA time total: 584.799us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 9.32% 354.797us 80.27% 3.054ms 3.054ms 0.000us 0.00% 1.838ms 1.838ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.810ms 102.14% 1.810ms 1.810ms 1
aten::copy_ 2.75% 104.734us 56.42% 2.147ms 119.270us 795.642us 44.90% 861.818us 47.879us 18
aten::mul 4.14% 157.684us 7.25% 275.917us 11.497us 828.220us 46.73% 828.220us 34.509us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 828.220us 46.73% 828.220us 34.509us 24
aten::clone 0.75% 28.679us 54.47% 2.072ms 345.404us 0.000us 0.00% 628.732us 104.789us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 562.556us 31.74% 562.556us 93.759us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 233.086us 13.15% 233.086us 19.424us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 148.320us 8.37% 148.320us 12.360us 12
aten::sub 1.03% 39.321us 1.69% 64.121us 10.687us 89.920us 5.07% 89.920us 14.987us 6
Activity Buffer Request 46.12% 1.755ms 46.12% 1.755ms 1.755ms 66.176us 3.73% 66.176us 66.176us 1
aten::add 0.96% 36.600us 1.59% 60.490us 10.082us 58.400us 3.30% 58.400us 9.733us 6
aten::empty_strided 0.88% 33.672us 0.88% 33.672us 5.612us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 5.82% 221.424us 5.82% 221.424us 36.904us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 1.87% 71.004us 2.35% 89.583us 3.733us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.49% 18.579us 0.49% 18.579us 0.774us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 6.12% 232.984us 6.12% 232.984us 4.854us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 19.73% 750.696us 19.73% 750.696us 750.696us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.805ms
Self CUDA time total: 1.772ms
impl wl p50(ms) ok
torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
torch_eager cuda_B1_S128_H8_D128_R64 0.22 True
torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True
torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True
torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True
torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True
torch_eager cuda_B1_S512_H32_D128_R64 0.22 True
torch_eager cuda_B1_S512_H32_D64_R32 0.22 True
torch_eager cuda_B1_S512_H8_D128_R64 0.22 True
torch_eager cuda_B1_S512_H8_D64_R32 0.22 True
torch_eager cuda_B2_S128_H32_D128_R64 0.22 True
torch_eager cuda_B2_S128_H32_D64_R32 0.22 True
torch_eager cuda_B2_S128_H8_D128_R64 0.22 True
torch_eager cuda_B2_S128_H8_D64_R32 0.22 True
torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True
torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
torch_eager cuda_B2_S512_H32_D128_R64 0.22 True
torch_eager cuda_B2_S512_H32_D64_R32 0.22 True
torch_eager cuda_B2_S512_H8_D128_R64 0.22 True
torch_eager cuda_B2_S512_H8_D64_R32 0.22 True