Running rotary benchmark on cuda with 24 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.315ms 1474.39% 1.315ms 1.315ms 1
torch_eager 7.00% 401.548us 82.40% 4.729ms 4.729ms 0.000us 0.00% 90.432us 90.432us 1
aten::mul 3.25% 186.430us 5.35% 307.044us 12.793us 46.943us 52.62% 46.943us 1.956us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.943us 52.62% 46.943us 1.956us 24
aten::copy_ 2.48% 142.261us 48.48% 2.782ms 154.576us 29.122us 32.64% 30.338us 1.685us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.433us 25.14% 22.433us 1.869us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.151us 14.74% 13.151us 1.096us 12
aten::clone 0.88% 50.441us 59.65% 3.423ms 570.575us 0.000us 0.00% 7.905us 1.318us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.689us 7.50% 6.689us 1.115us 6
aten::sub 0.82% 47.350us 1.28% 73.411us 12.235us 6.591us 7.39% 6.591us 1.098us 6
aten::add 0.64% 36.811us 1.04% 59.601us 9.934us 6.560us 7.35% 6.560us 1.093us 6
Activity Buffer Request 39.92% 2.291ms 39.92% 2.291ms 2.291ms 1.216us 1.36% 1.216us 1.216us 1
aten::empty_strided 16.52% 948.386us 16.52% 948.386us 158.064us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 1.38% 78.980us 1.38% 78.980us 13.163us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 1.46% 83.925us 1.86% 106.703us 4.446us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.40% 22.778us 0.40% 22.778us 0.949us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.66% 439.430us 7.66% 439.430us 9.155us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 17.60% 1.010ms 17.60% 1.010ms 1.010ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.740ms
Self CUDA time total: 89.216us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 967.576us 1072.55% 967.576us 967.576us 1
torch_eager 10.80% 301.919us 99.80% 2.790ms 2.790ms 0.000us 0.00% 91.365us 91.365us 1
aten::mul 5.82% 162.824us 9.87% 275.997us 11.500us 47.523us 52.68% 47.523us 1.980us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.523us 52.68% 47.523us 1.980us 24
aten::copy_ 4.18% 116.751us 70.01% 1.957ms 108.723us 29.282us 32.46% 30.434us 1.691us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.497us 24.94% 22.497us 1.875us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.408us 14.86% 13.408us 1.117us 12
aten::clone 0.79% 22.172us 66.92% 1.871ms 311.782us 0.000us 0.00% 7.937us 1.323us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.785us 7.52% 6.785us 1.131us 6
aten::add 1.23% 34.361us 2.02% 56.562us 9.427us 6.720us 7.45% 6.720us 1.120us 6
aten::sub 1.36% 38.010us 2.19% 61.310us 10.218us 6.688us 7.41% 6.688us 1.115us 6
Activity Buffer Request 61.66% 1.724ms 61.66% 1.724ms 1.724ms 1.152us 1.28% 1.152us 1.152us 1
aten::empty_strided 1.16% 32.541us 1.16% 32.541us 5.424us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 2.01% 56.260us 2.01% 56.260us 9.377us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.34% 65.363us 2.94% 82.214us 3.426us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.60% 16.851us 0.60% 16.851us 0.702us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.84% 219.114us 7.84% 219.114us 4.565us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.20% 5.580us 0.20% 5.580us 5.580us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.795ms
Self CUDA time total: 90.213us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 927.639us 987.31% 927.639us 927.639us 1
torch_eager 10.07% 282.335us 99.80% 2.798ms 2.798ms 0.000us 0.00% 95.268us 95.268us 1
aten::mul 5.75% 161.290us 9.68% 271.373us 11.307us 48.769us 51.91% 48.769us 2.032us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.769us 51.91% 48.769us 2.032us 24
aten::copy_ 3.66% 102.626us 71.21% 1.996ms 110.912us 30.720us 32.70% 32.032us 1.780us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 24.39% 22.912us 1.909us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.467us 15.40% 14.467us 1.206us 12
aten::clone 0.79% 22.060us 68.41% 1.918ms 319.628us 0.000us 0.00% 9.120us 1.520us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 8.31% 7.808us 1.301us 6
aten::sub 1.36% 38.040us 2.18% 61.002us 10.167us 7.265us 7.73% 7.265us 1.211us 6
aten::add 1.15% 32.220us 1.90% 53.280us 8.880us 7.202us 7.67% 7.202us 1.200us 6
Activity Buffer Request 63.51% 1.780ms 63.51% 1.780ms 1.780ms 1.312us 1.40% 1.312us 1.312us 1
aten::empty_strided 1.12% 31.490us 1.12% 31.490us 5.248us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 1.87% 52.452us 1.87% 52.452us 8.742us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.25% 63.104us 2.86% 80.042us 3.335us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.60% 16.938us 0.60% 16.938us 0.706us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.67% 215.090us 7.67% 215.090us 4.481us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.20% 5.470us 0.20% 5.470us 5.470us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.803ms
Self CUDA time total: 93.956us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 918.847us 904.69% 918.847us 918.847us 1
torch_eager 11.08% 278.185us 99.79% 2.506ms 2.506ms 0.000us 0.00% 102.877us 102.877us 1
aten::mul 6.15% 154.372us 10.54% 264.762us 11.032us 52.638us 51.83% 52.638us 2.193us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.638us 51.83% 52.638us 2.193us 24
aten::copy_ 4.16% 104.580us 68.26% 1.714ms 95.219us 32.416us 31.92% 33.728us 1.874us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.641us 24.26% 24.641us 2.053us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.511us 16.26% 16.511us 1.376us 12
aten::clone 0.84% 21.090us 65.15% 1.636ms 272.671us 0.000us 0.00% 9.087us 1.514us 6
aten::sub 1.51% 38.031us 2.44% 61.190us 10.198us 8.288us 8.16% 8.288us 1.381us 6
aten::add 1.29% 32.470us 2.19% 54.880us 9.147us 8.223us 8.10% 8.223us 1.371us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 7.66% 7.775us 1.296us 6
Activity Buffer Request 52.27% 1.312ms 52.27% 1.312ms 1.312ms 1.312us 1.29% 1.312us 1.312us 1
aten::empty_strided 1.29% 32.302us 1.29% 32.302us 5.384us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 9.44% 236.943us 9.44% 236.943us 39.491us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.53% 63.496us 3.16% 79.393us 3.308us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.63% 15.897us 0.63% 15.897us 0.662us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.60% 215.892us 8.60% 215.892us 4.498us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.21% 5.340us 0.21% 5.340us 5.340us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.511ms
Self CUDA time total: 101.565us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 991.709us 1060.94% 991.709us 991.709us 1
torch_eager 10.56% 336.649us 99.82% 3.183ms 3.183ms 0.000us 0.00% 94.755us 94.755us 1
aten::mul 5.20% 165.794us 8.73% 278.295us 11.596us 48.674us 52.07% 48.674us 2.028us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.674us 52.07% 48.674us 2.028us 24
aten::copy_ 3.76% 119.863us 72.07% 2.298ms 127.674us 30.622us 32.76% 31.902us 1.772us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.878us 24.47% 22.878us 1.907us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.179us 15.17% 14.179us 1.182us 12
aten::clone 0.88% 28.161us 69.55% 2.218ms 369.616us 0.000us 0.00% 9.024us 1.504us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 8.28% 7.744us 1.291us 6
aten::sub 1.28% 40.920us 2.05% 65.511us 10.918us 7.138us 7.64% 7.138us 1.190us 6
aten::add 1.05% 33.330us 1.81% 57.620us 9.603us 7.041us 7.53% 7.041us 1.173us 6
Activity Buffer Request 55.60% 1.773ms 55.60% 1.773ms 1.773ms 1.280us 1.37% 1.280us 1.280us 1
aten::empty_strided 1.06% 33.640us 1.06% 33.640us 5.607us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 10.74% 342.585us 10.74% 342.585us 57.097us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.14% 68.349us 2.66% 84.959us 3.540us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.52% 16.610us 0.52% 16.610us 0.692us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.03% 224.072us 7.03% 224.072us 4.668us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.18% 5.590us 0.18% 5.590us 5.590us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.189ms
Self CUDA time total: 93.475us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 941.177us 926.36% 941.177us 941.177us 1
torch_eager 9.56% 295.804us 99.83% 3.088ms 3.088ms 0.000us 0.00% 102.911us 102.911us 1
aten::mul 5.03% 155.643us 8.60% 265.986us 11.083us 52.802us 51.97% 52.802us 2.200us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.802us 51.97% 52.802us 2.200us 24
aten::copy_ 3.66% 113.330us 73.34% 2.269ms 126.052us 32.447us 31.94% 33.759us 1.876us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.703us 24.31% 24.703us 2.059us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.350us 16.09% 16.350us 1.363us 12
aten::clone 0.71% 21.820us 70.53% 2.182ms 363.694us 0.000us 0.00% 9.056us 1.509us 6
aten::sub 1.30% 40.120us 2.07% 63.950us 10.658us 8.223us 8.09% 8.223us 1.370us 6
aten::add 1.17% 36.201us 1.90% 58.931us 9.822us 8.127us 8.00% 8.127us 1.355us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.62% 7.744us 1.291us 6
Activity Buffer Request 57.23% 1.771ms 57.23% 1.771ms 1.771ms 1.312us 1.29% 1.312us 1.312us 1
aten::empty_strided 0.98% 30.371us 0.98% 30.371us 5.062us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 10.40% 321.885us 10.40% 321.885us 53.647us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.12% 65.592us 2.67% 82.622us 3.443us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.55% 17.030us 0.55% 17.030us 0.710us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.11% 219.985us 7.11% 219.985us 4.583us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.17% 5.340us 0.17% 5.340us 5.340us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.094ms
Self CUDA time total: 101.599us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 943.963us 782.64% 943.963us 943.963us 1
torch_eager 9.85% 301.136us 99.82% 3.051ms 3.051ms 0.000us 0.00% 122.468us 122.468us 1
aten::mul 5.14% 157.189us 8.67% 264.988us 11.041us 61.985us 51.39% 61.985us 2.583us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.985us 51.39% 61.985us 2.583us 24
aten::copy_ 3.53% 107.981us 72.58% 2.218ms 123.247us 39.362us 32.64% 41.218us 2.290us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.802us 23.88% 28.802us 2.400us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.265us 15.97% 19.265us 1.605us 12
aten::clone 0.97% 29.629us 70.14% 2.144ms 357.356us 0.000us 0.00% 12.416us 2.069us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.560us 8.76% 10.560us 1.760us 6
aten::add 1.14% 34.930us 1.90% 58.161us 9.693us 9.633us 7.99% 9.633us 1.606us 6
aten::sub 1.25% 38.210us 2.05% 62.510us 10.418us 9.632us 7.99% 9.632us 1.605us 6
Activity Buffer Request 57.00% 1.742ms 57.00% 1.742ms 1.742ms 1.856us 1.54% 1.856us 1.856us 1
aten::empty_strided 1.01% 31.021us 1.01% 31.021us 5.170us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 10.03% 306.454us 10.03% 306.454us 51.076us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.23% 68.242us 2.79% 85.430us 3.560us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.56% 17.188us 0.56% 17.188us 0.716us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.10% 217.131us 7.10% 217.131us 4.524us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.18% 5.390us 0.18% 5.390us 5.390us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.057ms
Self CUDA time total: 120.612us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 928.245us 538.18% 928.245us 928.245us 1
torch_eager 19.14% 292.425us 99.66% 1.523ms 1.523ms 0.000us 0.00% 175.325us 175.325us 1
aten::mul 10.16% 155.270us 17.20% 262.742us 10.948us 89.630us 51.97% 89.630us 3.735us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.630us 51.97% 89.630us 3.735us 24
aten::copy_ 6.82% 104.170us 46.76% 714.441us 39.691us 57.920us 33.58% 60.768us 3.376us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.768us 23.64% 40.768us 3.397us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.927us 14.45% 24.927us 2.077us 12
aten::clone 1.34% 20.471us 41.24% 630.180us 105.030us 0.000us 0.00% 20.000us 3.333us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.152us 9.94% 17.152us 2.859us 6
aten::sub 2.56% 39.072us 4.07% 62.112us 10.352us 12.480us 7.24% 12.480us 2.080us 6
aten::add 2.20% 33.610us 3.65% 55.810us 9.302us 12.447us 7.22% 12.447us 2.075us 6
Activity Buffer Request 16.69% 254.944us 16.69% 254.944us 254.944us 2.848us 1.65% 2.848us 2.848us 1
aten::empty_strided 2.04% 31.181us 2.04% 31.181us 5.197us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 19.06% 291.294us 19.06% 291.294us 48.549us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.37% 66.700us 5.47% 83.522us 3.480us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.10% 16.822us 1.10% 16.822us 0.701us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 14.19% 216.745us 14.19% 216.745us 4.516us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.34% 5.240us 0.34% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.528ms
Self CUDA time total: 172.477us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 923.899us 767.46% 923.899us 923.899us 1
torch_eager 19.14% 287.798us 99.65% 1.499ms 1.499ms 0.000us 0.00% 122.144us 122.144us 1
aten::mul 10.49% 157.698us 17.70% 266.255us 11.094us 61.982us 51.49% 61.982us 2.583us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.982us 51.49% 61.982us 2.583us 24
aten::copy_ 6.99% 105.118us 46.36% 697.187us 38.733us 39.264us 32.62% 41.024us 2.279us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.832us 23.95% 28.832us 2.403us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.138us 15.90% 19.138us 1.595us 12
aten::clone 1.32% 19.822us 40.79% 613.519us 102.253us 0.000us 0.00% 12.192us 2.032us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 8.67% 10.432us 1.739us 6
aten::sub 2.51% 37.801us 4.08% 61.341us 10.224us 9.570us 7.95% 9.570us 1.595us 6
aten::add 2.16% 32.471us 3.63% 54.661us 9.110us 9.568us 7.95% 9.568us 1.595us 6
Activity Buffer Request 16.71% 251.314us 16.71% 251.314us 251.314us 1.760us 1.46% 1.760us 1.760us 1
aten::empty_strided 2.00% 30.060us 2.00% 30.060us 5.010us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 18.58% 279.394us 18.58% 279.394us 46.566us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.31% 64.750us 5.43% 81.609us 3.400us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.12% 16.859us 1.12% 16.859us 0.702us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 14.34% 215.648us 14.34% 215.648us 4.493us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.35% 5.220us 0.35% 5.220us 5.220us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.504ms
Self CUDA time total: 120.384us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 943.259us 547.68% 943.259us 943.259us 1
torch_eager 9.82% 293.988us 99.82% 2.988ms 2.988ms 0.000us 0.00% 175.075us 175.075us 1
aten::mul 5.17% 154.631us 8.81% 263.742us 10.989us 89.536us 51.99% 89.536us 3.731us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.536us 51.99% 89.536us 3.731us 24
aten::copy_ 3.66% 109.570us 72.53% 2.171ms 120.590us 57.795us 33.56% 60.643us 3.369us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.835us 23.71% 40.835us 3.403us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.896us 14.46% 24.896us 2.075us 12
aten::clone 0.74% 22.030us 69.74% 2.087ms 347.874us 0.000us 0.00% 19.808us 3.301us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 9.85% 16.960us 2.827us 6
aten::add 1.10% 32.890us 1.87% 55.840us 9.307us 12.481us 7.25% 12.481us 2.080us 6
aten::sub 1.28% 38.273us 2.11% 63.142us 10.524us 12.415us 7.21% 12.415us 2.069us 6
Activity Buffer Request 58.02% 1.736ms 58.02% 1.736ms 1.736ms 2.848us 1.65% 2.848us 2.848us 1
aten::empty_strided 1.00% 30.050us 1.00% 30.050us 5.008us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 8.83% 264.325us 8.83% 264.325us 44.054us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.36% 70.650us 2.95% 88.161us 3.673us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.59% 17.511us 0.59% 17.511us 0.730us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.26% 217.282us 7.26% 217.282us 4.527us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.18% 5.289us 0.18% 5.289us 5.289us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.993ms
Self CUDA time total: 172.227us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 922.006us 322.11% 922.006us 922.006us 1
torch_eager 19.42% 278.764us 99.64% 1.431ms 1.431ms 0.000us 0.00% 304.543us 304.543us 1
aten::mul 10.68% 153.400us 18.09% 259.803us 10.825us 134.112us 46.85% 134.112us 5.588us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.112us 46.85% 134.112us 5.588us 24
aten::copy_ 7.65% 109.831us 44.83% 643.670us 35.759us 111.232us 38.86% 129.536us 7.196us 18
aten::clone 1.43% 20.539us 38.82% 557.349us 92.892us 0.000us 0.00% 72.160us 12.027us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.376us 20.04% 57.376us 4.781us 12
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.856us 18.82% 53.856us 8.976us 6
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.895us 14.29% 40.895us 3.408us 12
aten::sub 2.68% 38.501us 4.30% 61.692us 10.282us 20.543us 7.18% 20.543us 3.424us 6
aten::add 2.29% 32.829us 3.81% 54.730us 9.122us 20.352us 7.11% 20.352us 3.392us 6
Activity Buffer Request 16.08% 230.904us 16.08% 230.904us 230.904us 18.304us 6.39% 18.304us 18.304us 1
aten::empty_strided 2.06% 29.601us 2.06% 29.601us 4.933us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 16.83% 241.674us 16.83% 241.674us 40.279us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.51% 64.754us 5.69% 81.743us 3.406us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.18% 16.989us 1.18% 16.989us 0.708us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 14.82% 212.756us 14.82% 212.756us 4.432us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.36% 5.240us 0.36% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.436ms
Self CUDA time total: 286.239us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 970.352us 169.72% 970.352us 970.352us 1
torch_eager 19.50% 289.365us 99.64% 1.478ms 1.478ms 0.000us 0.00% 595.480us 595.480us 1
aten::copy_ 7.05% 104.551us 43.31% 642.598us 35.700us 273.596us 47.85% 297.340us 16.519us 18
aten::mul 11.63% 172.532us 19.46% 288.666us 12.028us 232.863us 40.73% 232.863us 9.703us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 232.863us 40.73% 232.863us 9.703us 24
aten::clone 1.45% 21.521us 37.67% 558.878us 93.146us 0.000us 0.00% 205.949us 34.325us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 182.205us 31.87% 182.205us 30.367us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.391us 15.98% 91.391us 7.616us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.277us 11.42% 65.277us 5.440us 12
aten::sub 2.70% 40.111us 4.36% 64.701us 10.784us 32.768us 5.73% 32.768us 5.461us 6
aten::add 2.31% 34.320us 3.88% 57.510us 9.585us 32.509us 5.69% 32.509us 5.418us 6
Activity Buffer Request 17.48% 259.324us 17.48% 259.324us 259.324us 23.744us 4.15% 23.744us 23.744us 1
aten::empty_strided 2.00% 29.720us 2.00% 29.720us 4.953us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 14.68% 217.742us 14.68% 217.742us 36.290us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.50% 66.694us 5.68% 84.252us 3.511us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.18% 17.558us 1.18% 17.558us 0.732us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 15.16% 224.895us 15.16% 224.895us 4.685us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.36% 5.340us 0.36% 5.340us 5.340us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.484ms
Self CUDA time total: 571.736us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 936.155us 1011.59% 936.155us 936.155us 1
torch_eager 9.66% 281.404us 99.82% 2.908ms 2.908ms 0.000us 0.00% 93.663us 93.663us 1
aten::mul 5.48% 159.764us 9.36% 272.564us 11.357us 49.568us 53.56% 49.568us 2.065us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.568us 53.56% 49.568us 2.065us 24
aten::copy_ 3.70% 107.711us 72.25% 2.105ms 116.944us 29.407us 31.78% 30.527us 1.696us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.591us 24.41% 22.591us 1.883us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.568us 14.66% 13.568us 1.131us 12
aten::clone 0.74% 21.551us 69.34% 2.020ms 336.695us 0.000us 0.00% 7.936us 1.323us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 7.37% 6.816us 1.136us 6
aten::sub 1.31% 38.128us 2.13% 61.912us 10.319us 6.815us 7.36% 6.815us 1.136us 6
aten::add 1.08% 31.450us 1.84% 53.600us 8.933us 6.753us 7.30% 6.753us 1.126us 6
Activity Buffer Request 59.75% 1.741ms 59.75% 1.741ms 1.741ms 1.120us 1.21% 1.120us 1.120us 1
aten::empty_strided 1.04% 30.170us 1.04% 30.170us 5.028us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 6.73% 196.044us 6.73% 196.044us 32.674us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.24% 65.300us 2.82% 82.022us 3.418us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.57% 16.722us 0.57% 16.722us 0.697us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.53% 219.305us 7.53% 219.305us 4.569us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.18% 5.160us 0.18% 5.160us 5.160us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.913ms
Self CUDA time total: 92.543us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 918.262us 956.86% 918.262us 918.262us 1
torch_eager 20.02% 274.163us 99.62% 1.364ms 1.364ms 0.000us 0.00% 97.279us 97.279us 1
aten::mul 11.52% 157.766us 19.39% 265.646us 11.069us 51.167us 53.32% 51.167us 2.132us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.167us 53.32% 51.167us 2.132us 24
aten::copy_ 7.76% 106.268us 42.02% 575.576us 31.976us 30.720us 32.01% 32.033us 1.780us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 23.88% 22.912us 1.909us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.079us 14.67% 14.079us 1.173us 12
aten::clone 1.48% 20.322us 36.02% 493.298us 82.216us 0.000us 0.00% 9.121us 1.520us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 8.14% 7.808us 1.301us 6
aten::sub 2.81% 38.541us 4.49% 61.481us 10.247us 7.072us 7.37% 7.072us 1.179us 6
aten::add 2.42% 33.131us 4.04% 55.302us 9.217us 7.007us 7.30% 7.007us 1.168us 6
Activity Buffer Request 16.17% 221.544us 16.17% 221.544us 221.544us 1.313us 1.37% 1.313us 1.313us 1
aten::empty_strided 2.33% 31.950us 2.33% 31.950us 5.325us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.69% 187.513us 13.69% 187.513us 31.252us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.61% 63.101us 5.84% 79.961us 3.332us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.23% 16.860us 1.23% 16.860us 0.702us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 15.57% 213.242us 15.57% 213.242us 4.443us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.38% 5.270us 0.38% 5.270us 5.270us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.370ms
Self CUDA time total: 95.966us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 929.528us 892.96% 929.528us 929.528us 1
torch_eager 20.25% 278.528us 99.63% 1.370ms 1.370ms 0.000us 0.00% 105.439us 105.439us 1
aten::mul 11.59% 159.422us 19.60% 269.583us 11.233us 55.326us 53.15% 55.326us 2.305us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.326us 53.15% 55.326us 2.305us 24
aten::copy_ 7.64% 105.130us 41.59% 572.021us 31.779us 32.351us 31.08% 33.695us 1.872us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.607us 23.64% 24.607us 2.051us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.418us 15.77% 16.418us 1.368us 12
aten::clone 1.49% 20.431us 35.49% 488.057us 81.343us 0.000us 0.00% 9.088us 1.515us 6
aten::sub 2.60% 35.723us 4.36% 59.953us 9.992us 8.258us 7.93% 8.258us 1.376us 6
aten::add 2.46% 33.770us 4.07% 55.940us 9.323us 8.160us 7.84% 8.160us 1.360us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.44% 7.744us 1.291us 6
Activity Buffer Request 16.10% 221.454us 16.10% 221.454us 221.454us 1.344us 1.29% 1.344us 1.344us 1
aten::empty_strided 2.25% 30.990us 2.25% 30.990us 5.165us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.30% 182.863us 13.30% 182.863us 30.477us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.81% 66.212us 6.02% 82.825us 3.451us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.21% 16.613us 1.21% 16.613us 0.692us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 15.93% 219.135us 15.93% 219.135us 4.565us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.37% 5.090us 0.37% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.375ms
Self CUDA time total: 104.095us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 943.134us 762.57% 943.134us 943.134us 1
torch_eager 9.91% 288.756us 99.81% 2.907ms 2.907ms 0.000us 0.00% 125.503us 125.503us 1
aten::mul 5.47% 159.428us 9.14% 266.247us 11.094us 65.088us 52.63% 65.088us 2.712us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.088us 52.63% 65.088us 2.712us 24
aten::copy_ 3.82% 111.411us 72.08% 2.100ms 116.650us 39.391us 31.85% 41.215us 2.290us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.799us 23.29% 28.799us 2.400us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.200us 15.52% 19.200us 1.600us 12
aten::clone 0.71% 20.821us 69.14% 2.014ms 335.649us 0.000us 0.00% 12.416us 2.069us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.592us 8.56% 10.592us 1.765us 6
aten::sub 1.35% 39.440us 2.20% 63.980us 10.663us 9.632us 7.79% 9.632us 1.605us 6
aten::add 1.16% 33.802us 1.92% 55.961us 9.327us 9.568us 7.74% 9.568us 1.595us 6
Activity Buffer Request 59.81% 1.742ms 59.81% 1.742ms 1.742ms 1.824us 1.47% 1.824us 1.824us 1
aten::empty_strided 1.06% 30.871us 1.06% 30.871us 5.145us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 6.32% 184.202us 6.32% 184.202us 30.700us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.20% 64.120us 2.78% 80.888us 3.370us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.58% 16.768us 0.58% 16.768us 0.699us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.39% 215.298us 7.39% 215.298us 4.485us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.19% 5.660us 0.19% 5.660us 5.660us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.913ms
Self CUDA time total: 123.679us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 926.451us 888.37% 926.451us 926.451us 1
torch_eager 20.56% 277.090us 99.61% 1.342ms 1.342ms 0.000us 0.00% 105.599us 105.599us 1
aten::mul 11.75% 158.363us 19.88% 267.883us 11.162us 55.423us 53.14% 55.423us 2.309us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.423us 53.14% 55.423us 2.309us 24
aten::copy_ 7.94% 107.035us 40.62% 547.383us 30.410us 32.352us 31.02% 33.664us 1.870us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.640us 23.63% 24.640us 2.053us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.512us 15.83% 16.512us 1.376us 12
aten::clone 1.47% 19.840us 34.29% 462.099us 77.016us 0.000us 0.00% 9.024us 1.504us 6
aten::sub 2.93% 39.461us 4.68% 63.054us 10.509us 8.287us 7.95% 8.287us 1.381us 6
aten::add 2.50% 33.680us 4.16% 56.100us 9.350us 8.225us 7.89% 8.225us 1.371us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 7.39% 7.712us 1.285us 6
Activity Buffer Request 14.74% 198.654us 14.74% 198.654us 198.654us 1.312us 1.26% 1.312us 1.312us 1
aten::empty_strided 2.26% 30.481us 2.26% 30.481us 5.080us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.39% 180.523us 13.39% 180.523us 30.087us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.73% 63.708us 5.98% 80.630us 3.360us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.26% 16.922us 1.26% 16.922us 0.705us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.08% 216.704us 16.08% 216.704us 4.515us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.39% 5.231us 0.39% 5.231us 5.231us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.348ms
Self CUDA time total: 104.287us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 931.662us 754.64% 931.662us 931.662us 1
torch_eager 20.88% 278.302us 99.60% 1.328ms 1.328ms 0.000us 0.00% 125.281us 125.281us 1
aten::mul 11.71% 156.112us 20.55% 273.936us 11.414us 65.153us 52.77% 65.153us 2.715us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.153us 52.77% 65.153us 2.715us 24
aten::copy_ 7.95% 105.951us 39.52% 526.779us 29.265us 39.169us 31.73% 40.993us 2.277us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.737us 23.28% 28.737us 2.395us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.135us 15.50% 19.135us 1.595us 12
aten::clone 1.44% 19.200us 33.27% 443.406us 73.901us 0.000us 0.00% 12.256us 2.043us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 8.45% 10.432us 1.739us 6
aten::sub 2.81% 37.440us 4.58% 61.110us 10.185us 9.632us 7.80% 9.632us 1.605us 6
aten::add 2.52% 33.611us 4.17% 55.611us 9.268us 9.503us 7.70% 9.503us 1.584us 6
Activity Buffer Request 13.21% 176.083us 13.21% 176.083us 176.083us 1.824us 1.48% 1.824us 1.824us 1
aten::empty_strided 2.29% 30.570us 2.29% 30.570us 5.095us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.82% 184.192us 13.82% 184.192us 30.699us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.90% 65.274us 6.16% 82.123us 3.422us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.26% 16.849us 1.26% 16.849us 0.702us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.81% 224.047us 16.81% 224.047us 4.668us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.40% 5.310us 0.40% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.333ms
Self CUDA time total: 123.457us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 944.092us 532.26% 944.092us 944.092us 1
torch_eager 9.66% 282.874us 99.81% 2.921ms 2.921ms 0.000us 0.00% 180.253us 180.253us 1
aten::mul 5.51% 161.402us 9.28% 271.603us 11.317us 95.040us 53.58% 95.040us 3.960us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.040us 53.58% 95.040us 3.960us 24
aten::copy_ 3.62% 106.065us 72.07% 2.109ms 117.193us 57.663us 32.51% 60.543us 3.364us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.703us 22.95% 40.703us 3.392us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.670us 13.91% 24.670us 2.056us 12
aten::clone 0.77% 22.428us 69.22% 2.026ms 337.680us 0.000us 0.00% 19.840us 3.307us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 9.56% 16.960us 2.827us 6
aten::add 1.16% 34.010us 1.95% 57.150us 9.525us 12.383us 6.98% 12.383us 2.064us 6
aten::sub 1.32% 38.563us 2.15% 62.972us 10.495us 12.287us 6.93% 12.287us 2.048us 6
Activity Buffer Request 59.97% 1.755ms 59.97% 1.755ms 1.755ms 2.880us 1.62% 2.880us 2.880us 1
aten::empty_strided 1.05% 30.691us 1.05% 30.691us 5.115us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 6.31% 184.633us 6.31% 184.633us 30.772us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.32% 67.977us 2.88% 84.170us 3.507us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.55% 16.193us 0.55% 16.193us 0.675us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.56% 221.262us 7.56% 221.262us 4.610us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.19% 5.669us 0.19% 5.669us 5.669us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.927ms
Self CUDA time total: 177.373us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 956.029us 320.35% 956.029us 956.029us 1
torch_eager 10.28% 306.488us 99.82% 2.977ms 2.977ms 0.000us 0.00% 316.194us 316.194us 1
aten::mul 5.10% 152.001us 8.95% 266.845us 11.119us 146.560us 49.11% 146.560us 6.107us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 146.560us 49.11% 146.560us 6.107us 24
aten::copy_ 3.72% 110.901us 71.64% 2.137ms 118.718us 110.754us 37.11% 128.514us 7.140us 18
aten::clone 0.97% 28.901us 68.99% 2.058ms 342.957us 0.000us 0.00% 70.944us 11.824us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.570us 19.29% 57.570us 4.797us 12
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.184us 17.82% 53.184us 8.864us 6
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.120us 13.78% 41.120us 3.427us 12
aten::add 1.16% 34.740us 1.93% 57.500us 9.583us 20.641us 6.92% 20.641us 3.440us 6
aten::sub 1.34% 39.998us 2.18% 65.101us 10.850us 20.479us 6.86% 20.479us 3.413us 6
Activity Buffer Request 59.58% 1.777ms 59.58% 1.777ms 1.777ms 17.760us 5.95% 17.760us 17.760us 1
aten::empty_strided 1.05% 31.260us 1.05% 31.260us 5.210us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 6.26% 186.663us 6.26% 186.663us 31.111us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.24% 66.809us 2.82% 84.238us 3.510us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.58% 17.429us 0.58% 17.429us 0.726us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.54% 224.919us 7.54% 224.919us 4.686us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.18% 5.469us 0.18% 5.469us 5.469us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.983ms
Self CUDA time total: 298.434us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 916.392us 515.61% 916.392us 916.392us 1
torch_eager 19.58% 274.201us 99.60% 1.394ms 1.394ms 0.000us 0.00% 180.610us 180.610us 1
aten::mul 11.24% 157.371us 18.87% 264.183us 11.008us 95.074us 53.49% 95.074us 3.961us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.074us 53.49% 95.074us 3.961us 24
aten::copy_ 7.77% 108.775us 43.49% 608.863us 33.826us 57.825us 32.54% 60.705us 3.373us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.897us 23.01% 40.897us 3.408us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.831us 13.97% 24.831us 2.069us 12
aten::clone 1.40% 19.580us 37.38% 523.368us 87.228us 0.000us 0.00% 19.808us 3.301us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 9.52% 16.928us 2.821us 6
aten::add 2.38% 33.360us 4.00% 56.040us 9.340us 12.416us 6.99% 12.416us 2.069us 6
aten::sub 2.76% 38.582us 4.39% 61.472us 10.245us 12.415us 6.99% 12.415us 2.069us 6
Activity Buffer Request 18.14% 253.955us 18.14% 253.955us 253.955us 2.880us 1.62% 2.880us 2.880us 1
aten::empty_strided 2.13% 29.860us 2.13% 29.860us 4.977us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.38% 187.273us 13.38% 187.273us 31.212us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.53% 63.391us 5.73% 80.293us 3.346us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.21% 16.902us 1.21% 16.902us 0.704us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 15.09% 211.242us 15.09% 211.242us 4.401us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.40% 5.600us 0.40% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.400ms
Self CUDA time total: 177.730us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 934.618us 312.71% 934.618us 934.618us 1
torch_eager 20.60% 280.895us 99.62% 1.358ms 1.358ms 0.000us 0.00% 316.921us 316.921us 1
aten::mul 11.57% 157.759us 19.61% 267.373us 11.141us 146.460us 49.00% 146.460us 6.102us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 146.460us 49.00% 146.460us 6.102us 24
aten::copy_ 8.07% 110.072us 41.19% 561.700us 31.206us 111.966us 37.46% 130.013us 7.223us 18
aten::clone 1.51% 20.600us 34.77% 474.096us 79.016us 0.000us 0.00% 72.670us 12.112us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.343us 19.19% 57.343us 4.779us 12
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 54.623us 18.28% 54.623us 9.104us 6
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.448us 13.53% 40.448us 3.371us 12
aten::add 2.59% 35.260us 4.22% 57.590us 9.598us 20.288us 6.79% 20.288us 3.381us 6
aten::sub 2.60% 35.410us 4.30% 58.621us 9.770us 20.160us 6.75% 20.160us 3.360us 6
Activity Buffer Request 14.73% 200.853us 14.73% 200.853us 200.853us 18.047us 6.04% 18.047us 18.047us 1
aten::empty_strided 2.18% 29.660us 2.18% 29.660us 4.943us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.85% 188.823us 13.85% 188.823us 31.471us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.75% 64.754us 6.01% 81.922us 3.413us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.26% 17.168us 1.26% 17.168us 0.715us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 15.92% 217.107us 15.92% 217.107us 4.523us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.38% 5.180us 0.38% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.364ms
Self CUDA time total: 298.874us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 956.919us 161.50% 956.919us 956.919us 1
torch_eager 21.30% 289.504us 99.57% 1.353ms 1.353ms 0.000us 0.00% 616.281us 616.281us 1
aten::copy_ 7.84% 106.532us 38.89% 528.548us 29.364us 278.013us 46.92% 301.788us 16.766us 18
aten::mul 11.95% 162.407us 20.79% 282.469us 11.770us 248.703us 41.97% 248.703us 10.363us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 248.703us 41.97% 248.703us 10.363us 24
aten::clone 1.53% 20.799us 32.73% 444.735us 74.123us 0.000us 0.00% 210.204us 35.034us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 186.429us 31.46% 186.429us 31.072us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.584us 15.46% 91.584us 7.632us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.790us 11.10% 65.790us 5.483us 12
aten::add 2.44% 33.161us 4.08% 55.501us 9.250us 32.927us 5.56% 32.927us 5.488us 6
aten::sub 2.95% 40.030us 4.74% 64.440us 10.740us 32.863us 5.55% 32.863us 5.477us 6
Activity Buffer Request 13.07% 177.663us 13.07% 177.663us 177.663us 23.775us 4.01% 23.775us 23.775us 1
aten::empty_strided 2.15% 29.270us 2.15% 29.270us 4.878us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 13.63% 185.172us 13.63% 185.172us 30.862us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.83% 65.662us 6.08% 82.660us 3.444us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.25% 16.998us 1.25% 16.998us 0.708us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.63% 225.993us 16.63% 225.993us 4.708us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.43% 5.780us 0.43% 5.780us 5.780us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.359ms
Self CUDA time total: 592.506us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 12.69% 276.287us 61.52% 1.340ms 1.340ms 0.000us 0.00% 1.863ms 1.863ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.835ms 102.22% 1.835ms 1.835ms 1
aten::copy_ 5.01% 109.060us 24.98% 544.137us 30.230us 806.007us 44.89% 873.590us 48.533us 18
aten::mul 7.11% 154.844us 12.06% 262.604us 10.942us 842.615us 46.93% 842.615us 35.109us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 842.615us 46.93% 842.615us 35.109us 24
aten::clone 1.01% 22.000us 21.12% 459.916us 76.653us 0.000us 0.00% 622.361us 103.727us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 554.778us 30.90% 554.778us 92.463us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 251.229us 13.99% 251.229us 20.936us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 146.939us 8.18% 146.939us 12.245us 12
aten::sub 1.90% 41.421us 3.00% 65.411us 10.902us 88.573us 4.93% 88.573us 14.762us 6
Activity Buffer Request 8.49% 184.983us 8.49% 184.983us 184.983us 67.583us 3.76% 67.583us 67.583us 1
aten::add 1.54% 33.561us 2.59% 56.461us 9.410us 58.366us 3.25% 58.366us 9.728us 6
aten::empty_strided 1.42% 30.960us 1.42% 30.960us 5.160us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 8.70% 189.543us 8.70% 189.543us 31.591us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.99% 65.113us 3.77% 82.061us 3.419us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.78% 16.948us 0.78% 16.948us 0.706us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 9.88% 215.201us 9.88% 215.201us 4.483us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 38.48% 838.063us 38.48% 838.063us 838.063us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.178ms
Self CUDA time total: 1.796ms
impl wl p50(ms) ok
torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
torch_eager cuda_B1_S128_H32_D64_R32 0.23 True
torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True
torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True
torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True
torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True
torch_eager cuda_B1_S512_H32_D128_R64 0.22 True
torch_eager cuda_B1_S512_H32_D64_R32 0.22 True
torch_eager cuda_B1_S512_H8_D128_R64 0.23 True
torch_eager cuda_B1_S512_H8_D64_R32 0.23 True
torch_eager cuda_B2_S128_H32_D128_R64 0.22 True
torch_eager cuda_B2_S128_H32_D64_R32 0.22 True
torch_eager cuda_B2_S128_H8_D128_R64 0.22 True
torch_eager cuda_B2_S128_H8_D64_R32 0.22 True
torch_eager cuda_B2_S2048_H32_D128_R64 0.65 True
torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
torch_eager cuda_B2_S512_H32_D128_R64 0.22 True
torch_eager cuda_B2_S512_H32_D64_R32 0.23 True
torch_eager cuda_B2_S512_H8_D128_R64 0.22 True
torch_eager cuda_B2_S512_H8_D64_R32 0.23 True