Running causal_conv1d benchmark on cuda with 24 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 439.324us 2269.12% 439.324us 439.324us 1
torch_eager 10.31% 220.478us 99.69% 2.131ms 2.131ms 0.000us 0.00% 21.729us 21.729us 1
aten::to 0.50% 10.770us 79.87% 1.707ms 284.530us 0.000us 0.00% 14.369us 2.395us 6
aten::_to_copy 1.71% 36.499us 79.36% 1.696ms 282.735us 0.000us 0.00% 14.369us 2.395us 6
aten::copy_ 2.77% 59.234us 75.21% 1.608ms 267.930us 12.001us 61.99% 14.369us 2.395us 6
aten::conv1d 0.36% 7.590us 7.34% 156.883us 52.294us 0.000us 0.00% 7.360us 2.453us 3
aten::convolution 0.66% 14.070us 6.98% 149.293us 49.764us 0.000us 0.00% 7.360us 2.453us 3
aten::_convolution 1.51% 32.210us 6.33% 135.223us 45.074us 0.000us 0.00% 7.360us 2.453us 3
aten::_conv_depthwise2d 1.61% 34.371us 4.00% 85.463us 28.488us 7.360us 38.01% 7.360us 2.453us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 38.01% 7.360us 2.453us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.337us 32.73% 6.337us 2.112us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.25% 5.664us 1.888us 3
Activity Buffer Request 69.37% 1.483ms 69.37% 1.483ms 1.483ms 2.368us 12.23% 2.368us 2.368us 1
aten::empty_strided 2.45% 52.331us 2.45% 52.331us 8.722us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 4.26% 91.032us 4.26% 91.032us 10.115us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.32% 28.311us 1.71% 36.491us 4.055us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.64% 13.700us 0.64% 13.700us 0.913us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.60% 12.790us 0.60% 12.790us 4.263us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.59% 12.710us 0.59% 12.710us 4.237us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.31% 6.640us 0.38% 8.090us 2.697us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.138ms
Self CUDA time total: 19.361us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.789us 1742.49% 341.789us 341.789us 1
torch_eager 7.86% 151.082us 99.71% 1.916ms 1.916ms 0.000us 0.00% 21.695us 21.695us 1
aten::to 0.35% 6.661us 83.96% 1.614ms 268.966us 0.000us 0.00% 13.695us 2.282us 6
aten::_to_copy 1.29% 24.781us 83.61% 1.607ms 267.856us 0.000us 0.00% 13.695us 2.282us 6
aten::copy_ 2.59% 49.784us 80.72% 1.552ms 258.589us 11.615us 59.21% 13.695us 2.282us 6
aten::conv1d 0.32% 6.220us 6.35% 122.113us 40.704us 0.000us 0.00% 8.000us 2.667us 3
aten::convolution 0.53% 10.120us 6.03% 115.893us 38.631us 0.000us 0.00% 8.000us 2.667us 3
aten::_convolution 1.20% 23.080us 5.50% 105.773us 35.258us 0.000us 0.00% 8.000us 2.667us 3
aten::_conv_depthwise2d 1.19% 22.952us 3.39% 65.123us 21.708us 8.000us 40.79% 8.000us 2.667us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.000us 40.79% 8.000us 2.667us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.047us 30.83% 6.047us 2.016us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 28.39% 5.568us 1.856us 3
Activity Buffer Request 75.54% 1.452ms 75.54% 1.452ms 1.452ms 2.080us 10.60% 2.080us 2.080us 1
aten::empty_strided 1.60% 30.820us 1.60% 30.820us 5.137us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.74% 71.953us 3.74% 71.953us 7.995us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.98% 18.881us 1.29% 24.750us 2.750us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.50% 9.609us 0.50% 9.609us 0.641us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.56% 10.750us 0.56% 10.750us 3.583us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.49% 9.339us 0.49% 9.339us 3.113us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.34% 6.630us 0.42% 8.000us 2.667us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.922ms
Self CUDA time total: 19.615us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 343.328us 1837.45% 343.328us 343.328us 1
torch_eager 7.88% 151.015us 99.69% 1.911ms 1.911ms 0.000us 0.00% 20.605us 20.605us 1
aten::to 0.33% 6.409us 84.02% 1.611ms 268.468us 0.000us 0.00% 13.662us 2.277us 6
aten::_to_copy 1.32% 25.354us 83.68% 1.604ms 267.400us 0.000us 0.00% 13.662us 2.277us 6
aten::copy_ 2.65% 50.770us 80.80% 1.549ms 258.170us 11.742us 62.84% 13.662us 2.277us 6
aten::conv1d 0.33% 6.290us 6.34% 121.483us 40.494us 0.000us 0.00% 6.943us 2.314us 3
aten::convolution 0.54% 10.430us 6.01% 115.193us 38.398us 0.000us 0.00% 6.943us 2.314us 3
aten::_convolution 1.17% 22.439us 5.46% 104.763us 34.921us 0.000us 0.00% 6.943us 2.314us 3
aten::_conv_depthwise2d 1.17% 22.412us 3.43% 65.843us 21.948us 6.943us 37.16% 6.943us 2.314us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.943us 37.16% 6.943us 2.314us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.982us 32.01% 5.982us 1.994us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.760us 30.83% 5.760us 1.920us 3
Activity Buffer Request 75.50% 1.448ms 75.50% 1.448ms 1.448ms 1.920us 10.28% 1.920us 1.920us 1
aten::empty_strided 1.57% 30.029us 1.57% 30.029us 5.005us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.90% 74.680us 3.90% 74.680us 8.298us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.93% 17.782us 1.21% 23.252us 2.584us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.48% 9.281us 0.48% 9.281us 0.619us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.57% 10.910us 0.57% 10.910us 3.637us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.44% 8.531us 0.44% 8.531us 2.844us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.32% 6.170us 0.39% 7.570us 2.523us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.917ms
Self CUDA time total: 18.685us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.280us 1734.88% 340.280us 340.280us 1
torch_eager 6.89% 141.563us 99.72% 2.049ms 2.049ms 0.000us 0.00% 21.726us 21.726us 1
aten::to 0.30% 6.132us 85.38% 1.755ms 292.424us 0.000us 0.00% 13.982us 2.330us 6
aten::_to_copy 1.19% 24.439us 85.08% 1.748ms 291.402us 0.000us 0.00% 13.982us 2.330us 6
aten::copy_ 2.50% 51.302us 82.39% 1.693ms 282.182us 11.870us 60.52% 13.982us 2.330us 6
aten::conv1d 0.29% 5.930us 5.97% 122.723us 40.908us 0.000us 0.00% 7.744us 2.581us 3
aten::convolution 0.50% 10.300us 5.68% 116.793us 38.931us 0.000us 0.00% 7.744us 2.581us 3
aten::_convolution 1.17% 23.960us 5.18% 106.493us 35.498us 0.000us 0.00% 7.744us 2.581us 3
aten::_conv_depthwise2d 1.08% 22.141us 3.19% 65.452us 21.817us 7.744us 39.48% 7.744us 2.581us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 39.48% 7.744us 2.581us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.143us 31.32% 6.143us 2.048us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.727us 29.20% 5.727us 1.909us 3
Activity Buffer Request 70.00% 1.438ms 70.00% 1.438ms 1.438ms 2.112us 10.77% 2.112us 2.112us 1
aten::empty_strided 1.50% 30.881us 1.50% 30.881us 5.147us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.01% 226.194us 11.01% 226.194us 25.133us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.89% 18.302us 1.19% 24.432us 2.715us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.49% 9.981us 0.49% 9.981us 0.665us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.55% 11.260us 0.55% 11.260us 3.753us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.45% 9.171us 0.45% 9.171us 3.057us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.32% 6.620us 0.39% 8.030us 2.677us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.055ms
Self CUDA time total: 19.614us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 379.964us 1548.03% 379.964us 379.964us 1
torch_eager 7.69% 160.944us 99.76% 2.089ms 2.089ms 0.000us 0.00% 26.817us 26.817us 1
aten::to 0.33% 7.000us 83.76% 1.754ms 292.349us 0.000us 0.00% 15.265us 2.544us 6
aten::_to_copy 1.23% 25.779us 83.43% 1.747ms 291.183us 0.000us 0.00% 15.265us 2.544us 6
aten::copy_ 2.49% 52.100us 80.65% 1.689ms 281.484us 12.993us 52.94% 15.265us 2.544us 6
aten::conv1d 0.31% 6.410us 6.85% 143.364us 47.788us 0.000us 0.00% 11.552us 3.851us 3
aten::convolution 1.48% 31.021us 6.54% 136.954us 45.651us 0.000us 0.00% 11.552us 3.851us 3
aten::_convolution 1.13% 23.621us 5.06% 105.933us 35.311us 0.000us 0.00% 11.552us 3.851us 3
aten::_conv_depthwise2d 1.06% 22.209us 3.13% 65.632us 21.877us 11.552us 47.06% 11.552us 3.851us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 47.06% 11.552us 3.851us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.625us 26.99% 6.625us 2.208us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 25.94% 6.368us 2.123us 3
Activity Buffer Request 68.76% 1.440ms 68.76% 1.440ms 1.440ms 2.272us 9.26% 2.272us 2.272us 1
aten::empty_strided 1.55% 32.413us 1.55% 32.413us 5.402us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.50% 219.817us 10.50% 219.817us 24.424us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.87% 18.301us 1.15% 24.061us 2.673us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.50% 10.530us 0.50% 10.530us 0.702us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.50% 10.490us 0.50% 10.490us 3.497us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.47% 9.872us 0.47% 9.872us 3.291us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.30% 6.220us 0.37% 7.740us 2.580us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.094ms
Self CUDA time total: 24.545us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 351.133us 1341.43% 351.133us 351.133us 1
torch_eager 7.55% 157.812us 99.73% 2.084ms 2.084ms 0.000us 0.00% 28.416us 28.416us 1
aten::to 0.31% 6.571us 84.80% 1.772ms 295.318us 0.000us 0.00% 15.264us 2.544us 6
aten::_to_copy 1.22% 25.450us 84.49% 1.765ms 294.223us 0.000us 0.00% 15.264us 2.544us 6
aten::copy_ 2.31% 48.301us 81.82% 1.710ms 284.947us 13.024us 49.76% 15.264us 2.544us 6
aten::conv1d 0.32% 6.640us 5.96% 124.543us 41.514us 0.000us 0.00% 13.152us 4.384us 3
aten::convolution 0.50% 10.360us 5.64% 117.903us 39.301us 0.000us 0.00% 13.152us 4.384us 3
aten::_convolution 1.16% 24.330us 5.15% 107.543us 35.848us 0.000us 0.00% 13.152us 4.384us 3
aten::_conv_depthwise2d 1.06% 22.241us 3.14% 65.623us 21.874us 13.152us 50.24% 13.152us 4.384us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.152us 50.24% 13.152us 4.384us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 25.43% 6.656us 2.219us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 24.33% 6.368us 2.123us 3
Activity Buffer Request 70.10% 1.465ms 70.10% 1.465ms 1.465ms 2.240us 8.56% 2.240us 2.240us 1
aten::empty_strided 1.45% 30.202us 1.45% 30.202us 5.034us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.51% 219.677us 10.51% 219.677us 24.409us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.90% 18.881us 1.17% 24.421us 2.713us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.46% 9.580us 0.46% 9.580us 0.639us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.55% 11.471us 0.55% 11.471us 3.824us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.43% 8.890us 0.43% 8.890us 2.963us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.33% 6.950us 0.40% 8.400us 2.800us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.089ms
Self CUDA time total: 26.176us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 349.627us 908.24% 349.627us 349.627us 1
torch_eager 7.45% 152.992us 99.76% 2.049ms 2.049ms 0.000us 0.00% 41.086us 41.086us 1
aten::conv1d 0.32% 6.640us 6.06% 124.413us 41.471us 0.000us 0.00% 22.561us 7.520us 3
aten::convolution 0.50% 10.370us 5.73% 117.773us 39.258us 0.000us 0.00% 22.561us 7.520us 3
aten::_convolution 1.14% 23.411us 5.23% 107.403us 35.801us 0.000us 0.00% 22.561us 7.520us 3
aten::_conv_depthwise2d 1.15% 23.650us 3.29% 67.532us 22.511us 22.561us 58.61% 22.561us 7.520us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.561us 58.61% 22.561us 7.520us 3
aten::to 0.33% 6.780us 84.82% 1.743ms 290.446us 0.000us 0.00% 18.525us 3.087us 6
aten::_to_copy 1.29% 26.502us 84.49% 1.736ms 289.316us 0.000us 0.00% 18.525us 3.087us 6
aten::copy_ 2.40% 49.251us 81.74% 1.679ms 279.869us 15.934us 41.39% 18.525us 3.087us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.543us 22.19% 8.543us 2.848us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.391us 19.20% 7.391us 2.464us 3
Activity Buffer Request 69.84% 1.435ms 69.84% 1.435ms 1.435ms 2.591us 6.73% 2.591us 2.591us 1
aten::empty_strided 1.47% 30.182us 1.47% 30.182us 5.030us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.64% 218.664us 10.64% 218.664us 24.296us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.89% 18.281us 1.17% 24.011us 2.668us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.47% 9.739us 0.47% 9.739us 0.649us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.53% 10.991us 0.53% 10.991us 3.664us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.46% 9.421us 0.46% 9.421us 3.140us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.29% 5.970us 0.36% 7.320us 2.440us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.054ms
Self CUDA time total: 38.495us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 345.054us 837.81% 345.054us 345.054us 1
torch_eager 7.39% 151.695us 99.75% 2.049ms 2.049ms 0.000us 0.00% 43.810us 43.810us 1
aten::conv1d 0.32% 6.620us 6.03% 123.883us 41.294us 0.000us 0.00% 25.375us 8.458us 3
aten::convolution 0.50% 10.320us 5.71% 117.263us 39.088us 0.000us 0.00% 25.375us 8.458us 3
aten::_convolution 1.20% 24.592us 5.21% 106.943us 35.648us 0.000us 0.00% 25.375us 8.458us 3
aten::_conv_depthwise2d 1.13% 23.150us 3.19% 65.451us 21.817us 25.375us 61.61% 25.375us 8.458us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.375us 61.61% 25.375us 8.458us 3
aten::to 0.31% 6.440us 84.93% 1.744ms 290.716us 0.000us 0.00% 18.435us 3.072us 6
aten::_to_copy 1.24% 25.501us 84.61% 1.738ms 289.642us 0.000us 0.00% 18.435us 3.072us 6
aten::copy_ 2.41% 49.431us 81.91% 1.682ms 280.380us 15.810us 38.39% 18.435us 3.072us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.386us 20.36% 8.386us 2.795us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.424us 18.03% 7.424us 2.475us 3
Activity Buffer Request 70.32% 1.444ms 70.32% 1.444ms 1.444ms 2.625us 6.37% 2.625us 2.625us 1
aten::empty_strided 1.46% 30.070us 1.46% 30.070us 5.012us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.28% 211.144us 10.28% 211.144us 23.460us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.92% 18.949us 1.19% 24.411us 2.712us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.45% 9.313us 0.45% 9.313us 0.621us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.52% 10.601us 0.52% 10.601us 3.534us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.44% 9.110us 0.44% 9.110us 3.037us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.29% 5.930us 0.36% 7.410us 2.470us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.054ms
Self CUDA time total: 41.185us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 348.348us 338.39% 348.348us 348.348us 1
torch_eager 7.21% 148.863us 99.73% 2.059ms 2.059ms 0.000us 0.00% 108.926us 108.926us 1
aten::conv1d 0.31% 6.430us 5.95% 122.893us 40.964us 0.000us 0.00% 70.592us 23.531us 3
aten::convolution 0.50% 10.290us 5.64% 116.463us 38.821us 0.000us 0.00% 70.592us 23.531us 3
aten::_convolution 1.17% 24.211us 5.14% 106.173us 35.391us 0.000us 0.00% 70.592us 23.531us 3
aten::_conv_depthwise2d 1.12% 23.052us 3.16% 65.282us 21.761us 70.592us 68.57% 70.592us 23.531us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.592us 68.57% 70.592us 23.531us 3
aten::to 0.31% 6.372us 85.15% 1.758ms 292.949us 0.000us 0.00% 38.334us 6.389us 6
aten::_to_copy 1.20% 24.680us 84.84% 1.751ms 291.887us 0.000us 0.00% 38.334us 6.389us 6
aten::copy_ 2.47% 51.072us 82.20% 1.697ms 282.787us 32.350us 31.43% 38.334us 6.389us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.695us 17.19% 17.695us 5.898us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.655us 14.24% 14.655us 4.885us 3
Activity Buffer Request 70.59% 1.457ms 70.59% 1.457ms 1.457ms 5.984us 5.81% 5.984us 5.984us 1
aten::empty_strided 1.45% 29.921us 1.45% 29.921us 4.987us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.23% 211.264us 10.23% 211.264us 23.474us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.89% 18.462us 1.17% 24.111us 2.679us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.47% 9.709us 0.47% 9.709us 0.647us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.47% 9.780us 0.47% 9.780us 3.260us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.47% 9.740us 0.47% 9.740us 3.247us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.28% 5.880us 0.35% 7.260us 2.420us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.064ms
Self CUDA time total: 102.942us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 344.181us 304.53% 344.181us 344.181us 1
torch_eager 14.98% 124.863us 99.35% 828.302us 828.302us 0.000us 0.00% 119.036us 119.036us 1
aten::conv1d 0.70% 5.870us 14.55% 121.343us 40.448us 0.000us 0.00% 80.669us 26.890us 3
aten::convolution 1.17% 9.720us 13.85% 115.473us 38.491us 0.000us 0.00% 80.669us 26.890us 3
aten::_convolution 2.96% 24.691us 12.68% 105.753us 35.251us 0.000us 0.00% 80.669us 26.890us 3
aten::_conv_depthwise2d 2.65% 22.121us 7.65% 63.762us 21.254us 80.669us 71.38% 80.669us 26.890us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.669us 71.38% 80.669us 26.890us 3
aten::to 0.77% 6.429us 66.53% 554.705us 92.451us 0.000us 0.00% 38.367us 6.394us 6
aten::_to_copy 3.01% 25.101us 65.76% 548.276us 91.379us 0.000us 0.00% 38.367us 6.394us 6
aten::copy_ 6.16% 51.352us 59.05% 492.343us 82.057us 32.351us 28.62% 38.367us 6.394us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.696us 15.66% 17.696us 5.899us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.655us 12.97% 14.655us 4.885us 3
Activity Buffer Request 28.81% 240.197us 28.81% 240.197us 240.197us 6.016us 5.32% 6.016us 6.016us 1
aten::empty_strided 3.70% 30.832us 3.70% 30.832us 5.139us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 26.65% 222.174us 26.65% 222.174us 24.686us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.09% 17.401us 2.70% 22.541us 2.505us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.05% 8.790us 1.05% 8.790us 0.586us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.34% 11.151us 1.34% 11.151us 3.717us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.09% 9.110us 1.09% 9.110us 3.037us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.89% 7.450us 1.05% 8.790us 2.930us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 833.752us
Self CUDA time total: 113.020us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 14.21% 122.455us 95.83% 825.681us 825.681us 0.000us 0.00% 433.339us 433.339us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 419.771us 106.59% 419.771us 419.771us 1
aten::conv1d 0.75% 6.429us 14.10% 121.522us 40.507us 0.000us 0.00% 251.453us 83.818us 3
aten::convolution 1.15% 9.929us 13.36% 115.093us 38.364us 0.000us 0.00% 251.453us 83.818us 3
aten::_convolution 2.67% 23.042us 12.21% 105.164us 35.055us 0.000us 0.00% 251.453us 83.818us 3
aten::_conv_depthwise2d 2.60% 22.440us 7.52% 64.810us 21.603us 251.453us 63.85% 251.453us 83.818us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 251.453us 63.85% 251.453us 83.818us 3
aten::to 0.70% 6.001us 64.14% 552.672us 92.112us 0.000us 0.00% 181.886us 30.314us 6
aten::_to_copy 2.73% 23.540us 63.45% 546.671us 91.112us 0.000us 0.00% 181.886us 30.314us 6
aten::copy_ 5.94% 51.140us 57.36% 494.211us 82.368us 142.367us 36.15% 181.886us 30.314us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 102.367us 25.99% 102.367us 34.122us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.000us 10.16% 40.000us 13.333us 3
Activity Buffer Request 29.04% 250.247us 29.04% 250.247us 250.247us 39.519us 10.03% 39.519us 39.519us 1
aten::empty_strided 3.36% 28.920us 3.36% 28.920us 4.820us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.89% 214.494us 24.89% 214.494us 23.833us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.98% 17.062us 2.59% 22.273us 2.475us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.09% 9.391us 1.09% 9.391us 0.626us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.24% 10.660us 1.24% 10.660us 3.553us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.17% 10.040us 1.17% 10.040us 3.347us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.86% 7.370us 1.02% 8.800us 2.933us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 861.602us
Self CUDA time total: 393.820us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 15.32% 134.312us 91.67% 803.971us 803.971us 0.000us 0.00% 487.924us 487.924us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 476.501us 106.34% 476.501us 476.501us 1
aten::conv1d 0.67% 5.860us 13.82% 121.173us 40.391us 0.000us 0.00% 299.161us 99.720us 3
aten::convolution 1.17% 10.220us 13.15% 115.313us 38.438us 0.000us 0.00% 299.161us 99.720us 3
aten::_convolution 2.67% 23.450us 11.98% 105.093us 35.031us 0.000us 0.00% 299.161us 99.720us 3
aten::_conv_depthwise2d 2.56% 22.451us 7.48% 65.623us 21.874us 299.161us 66.76% 299.161us 99.720us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 299.161us 66.76% 299.161us 99.720us 3
aten::to 0.69% 6.051us 59.17% 518.906us 86.484us 0.000us 0.00% 188.763us 31.460us 6
aten::_to_copy 2.71% 23.771us 58.48% 512.855us 85.476us 0.000us 0.00% 188.763us 31.460us 6
aten::copy_ 5.69% 49.880us 52.31% 458.742us 76.457us 148.924us 33.24% 188.763us 31.460us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 108.861us 24.29% 108.861us 36.287us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.063us 8.94% 40.063us 13.354us 3
Activity Buffer Request 25.01% 219.366us 25.01% 219.366us 219.366us 39.839us 8.89% 39.839us 39.839us 1
aten::empty_strided 3.46% 30.342us 3.46% 30.342us 5.057us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.34% 213.439us 24.34% 213.439us 23.715us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.98% 17.400us 2.59% 22.720us 2.524us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.09% 9.540us 1.09% 9.540us 0.636us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.14% 10.010us 1.14% 10.010us 3.337us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.05% 9.219us 1.05% 9.219us 3.073us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.66% 5.750us 0.82% 7.210us 2.403us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 876.983us
Self CUDA time total: 448.085us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.392us 1804.85% 338.392us 338.392us 1
torch_eager 18.33% 161.236us 99.35% 873.703us 873.703us 0.000us 0.00% 20.637us 20.637us 1
aten::to 0.69% 6.070us 63.71% 560.224us 93.371us 0.000us 0.00% 13.406us 2.234us 6
aten::_to_copy 2.78% 24.471us 63.02% 554.154us 92.359us 0.000us 0.00% 13.406us 2.234us 6
aten::copy_ 5.94% 52.212us 56.85% 499.953us 83.325us 11.518us 61.43% 13.406us 2.234us 6
aten::conv1d 0.64% 5.659us 14.02% 123.282us 41.094us 0.000us 0.00% 7.231us 2.410us 3
aten::convolution 1.14% 9.999us 13.38% 117.623us 39.208us 0.000us 0.00% 7.231us 2.410us 3
aten::_convolution 2.72% 23.952us 12.24% 107.624us 35.875us 0.000us 0.00% 7.231us 2.410us 3
aten::_conv_depthwise2d 2.67% 23.519us 7.63% 67.130us 22.377us 7.231us 38.57% 7.231us 2.410us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.231us 38.57% 7.231us 2.410us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.854us 31.22% 5.854us 1.951us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 30.21% 5.664us 1.888us 3
Activity Buffer Request 29.52% 259.596us 29.52% 259.596us 259.596us 1.888us 10.07% 1.888us 1.888us 1
aten::empty_strided 3.38% 29.730us 3.38% 29.730us 4.955us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 23.99% 210.946us 23.99% 210.946us 23.438us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.07% 18.190us 2.71% 23.871us 2.652us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.11% 9.761us 1.11% 9.761us 0.651us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.24% 10.890us 1.24% 10.890us 3.630us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.13% 9.920us 1.13% 9.920us 3.307us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.68% 5.972us 0.85% 7.452us 2.484us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 879.393us
Self CUDA time total: 18.749us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.934us 1741.87% 338.934us 338.934us 1
torch_eager 16.71% 145.362us 99.29% 863.592us 863.592us 0.000us 0.00% 21.314us 21.314us 1
aten::to 0.71% 6.200us 65.36% 568.524us 94.754us 0.000us 0.00% 13.282us 2.214us 6
aten::_to_copy 2.85% 24.831us 64.65% 562.324us 93.721us 0.000us 0.00% 13.282us 2.214us 6
aten::copy_ 5.81% 50.550us 58.39% 507.883us 84.647us 11.426us 58.72% 13.282us 2.214us 6
aten::conv1d 0.78% 6.753us 14.06% 122.315us 40.772us 0.000us 0.00% 8.032us 2.677us 3
aten::convolution 1.19% 10.380us 13.29% 115.562us 38.521us 0.000us 0.00% 8.032us 2.677us 3
aten::_convolution 2.63% 22.841us 12.09% 105.182us 35.061us 0.000us 0.00% 8.032us 2.677us 3
aten::_conv_depthwise2d 2.65% 23.042us 7.65% 66.512us 22.171us 8.032us 41.28% 8.032us 2.677us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.032us 41.28% 8.032us 2.677us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.825us 29.94% 5.825us 1.942us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.601us 28.79% 5.601us 1.867us 3
Activity Buffer Request 30.62% 266.307us 30.62% 266.307us 266.307us 1.856us 9.54% 1.856us 1.856us 1
aten::empty_strided 3.40% 29.610us 3.40% 29.610us 4.935us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.61% 214.076us 24.61% 214.076us 23.786us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.02% 17.612us 2.63% 22.841us 2.538us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.02% 8.840us 1.02% 8.840us 0.589us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.22% 10.630us 1.22% 10.630us 3.543us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.13% 9.790us 1.13% 9.790us 3.263us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.67% 5.798us 0.82% 7.109us 2.370us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 869.783us
Self CUDA time total: 19.458us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.862us 1751.78% 340.862us 340.862us 1
torch_eager 8.44% 173.073us 99.74% 2.045ms 2.045ms 0.000us 0.00% 21.635us 21.635us 1
aten::to 0.33% 6.670us 84.06% 1.723ms 287.196us 0.000us 0.00% 14.307us 2.385us 6
aten::_to_copy 1.21% 24.883us 83.74% 1.717ms 286.084us 0.000us 0.00% 14.307us 2.385us 6
aten::copy_ 2.36% 48.471us 81.06% 1.662ms 276.949us 12.130us 62.34% 14.307us 2.385us 6
aten::conv1d 0.29% 5.970us 5.84% 119.613us 39.871us 0.000us 0.00% 7.328us 2.443us 3
aten::convolution 0.48% 9.780us 5.54% 113.643us 37.881us 0.000us 0.00% 7.328us 2.443us 3
aten::_convolution 1.14% 23.420us 5.07% 103.863us 34.621us 0.000us 0.00% 7.328us 2.443us 3
aten::_conv_depthwise2d 1.10% 22.512us 3.15% 64.503us 21.501us 7.328us 37.66% 7.328us 2.443us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 37.66% 7.328us 2.443us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.241us 32.07% 6.241us 2.080us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.889us 30.27% 5.889us 1.963us 3
Activity Buffer Request 69.34% 1.421ms 69.34% 1.421ms 1.421ms 2.177us 11.19% 2.177us 2.177us 1
aten::empty_strided 1.46% 29.930us 1.46% 29.930us 4.988us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.50% 215.256us 10.50% 215.256us 23.917us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.86% 17.669us 1.13% 23.180us 2.576us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.47% 9.581us 0.47% 9.581us 0.639us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.48% 9.759us 0.48% 9.759us 3.253us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.43% 8.742us 0.43% 8.742us 2.914us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.28% 5.760us 0.35% 7.110us 2.370us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.050ms
Self CUDA time total: 19.458us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 367.067us 1820.95% 367.067us 367.067us 1
torch_eager 17.50% 145.595us 99.30% 826.111us 826.111us 0.000us 0.00% 22.366us 22.366us 1
aten::to 0.75% 6.199us 63.72% 530.082us 88.347us 0.000us 0.00% 14.431us 2.405us 6
aten::_to_copy 2.95% 24.573us 62.97% 523.883us 87.314us 0.000us 0.00% 14.431us 2.405us 6
aten::copy_ 6.31% 52.521us 56.15% 467.170us 77.862us 12.223us 60.64% 14.431us 2.405us 6
aten::conv1d 0.69% 5.760us 14.59% 121.354us 40.451us 0.000us 0.00% 7.935us 2.645us 3
aten::convolution 1.24% 10.281us 13.89% 115.594us 38.531us 0.000us 0.00% 7.935us 2.645us 3
aten::_convolution 2.68% 22.269us 12.66% 105.313us 35.104us 0.000us 0.00% 7.935us 2.645us 3
aten::_conv_depthwise2d 2.73% 22.701us 8.02% 66.711us 22.237us 7.935us 39.36% 7.935us 2.645us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.935us 39.36% 7.935us 2.645us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 31.27% 6.304us 2.101us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 29.36% 5.919us 1.973us 3
Activity Buffer Request 27.00% 224.665us 27.00% 224.665us 224.665us 2.208us 10.95% 2.208us 2.208us 1
aten::empty_strided 3.86% 32.140us 3.86% 32.140us 5.357us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.71% 213.894us 25.71% 213.894us 23.766us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.05% 17.041us 2.71% 22.553us 2.506us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.14% 9.503us 1.14% 9.503us 0.634us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.31% 10.920us 1.31% 10.920us 3.640us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.10% 9.180us 1.10% 9.180us 3.060us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.81% 6.740us 0.98% 8.160us 2.720us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 831.951us
Self CUDA time total: 20.158us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 363.100us 1005.93% 363.100us 363.100us 1
torch_eager 14.77% 122.163us 99.35% 821.971us 821.971us 0.000us 0.00% 38.688us 38.688us 1
aten::conv1d 0.72% 5.951us 17.29% 143.024us 47.675us 0.000us 0.00% 20.160us 6.720us 3
aten::convolution 1.22% 10.110us 16.57% 137.073us 45.691us 0.000us 0.00% 20.160us 6.720us 3
aten::_convolution 3.04% 25.151us 15.35% 126.963us 42.321us 0.000us 0.00% 20.160us 6.720us 3
aten::_conv_depthwise2d 4.80% 39.711us 10.31% 85.271us 28.424us 20.160us 55.85% 20.160us 6.720us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.160us 55.85% 20.160us 6.720us 3
aten::to 0.75% 6.172us 63.79% 527.804us 87.967us 0.000us 0.00% 18.528us 3.088us 6
aten::_to_copy 2.99% 24.751us 63.05% 521.632us 86.939us 0.000us 0.00% 18.528us 3.088us 6
aten::copy_ 6.14% 50.790us 56.45% 467.021us 77.837us 15.936us 44.15% 18.528us 3.088us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.512us 23.58% 8.512us 2.837us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.424us 20.57% 7.424us 2.475us 3
Activity Buffer Request 27.93% 231.066us 27.93% 231.066us 231.066us 2.592us 7.18% 2.592us 2.592us 1
aten::empty_strided 3.61% 29.860us 3.61% 29.860us 4.977us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.33% 209.585us 25.33% 209.585us 23.287us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.11% 17.441us 2.75% 22.791us 2.532us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.15% 9.501us 1.15% 9.501us 0.633us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.26% 10.400us 1.26% 10.400us 3.467us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.30% 10.740us 1.30% 10.740us 3.580us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.76% 6.269us 0.93% 7.730us 2.577us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 827.381us
Self CUDA time total: 36.096us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.025us 883.88% 336.025us 336.025us 1
torch_eager 14.70% 120.902us 99.36% 817.351us 817.351us 0.000us 0.00% 40.610us 40.610us 1
aten::conv1d 0.71% 5.820us 14.44% 118.823us 39.608us 0.000us 0.00% 22.304us 7.435us 3
aten::convolution 1.12% 9.190us 13.74% 113.003us 37.668us 0.000us 0.00% 22.304us 7.435us 3
aten::_convolution 2.83% 23.270us 12.62% 103.813us 34.604us 0.000us 0.00% 22.304us 7.435us 3
aten::_conv_depthwise2d 2.83% 23.309us 7.79% 64.072us 21.357us 22.304us 58.67% 22.304us 7.435us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.304us 58.67% 22.304us 7.435us 3
aten::to 0.73% 5.990us 66.75% 549.075us 91.513us 0.000us 0.00% 18.306us 3.051us 6
aten::_to_copy 2.91% 23.953us 66.02% 543.085us 90.514us 0.000us 0.00% 18.306us 3.051us 6
aten::copy_ 6.07% 49.902us 59.57% 490.042us 81.674us 15.713us 41.33% 18.306us 3.051us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.353us 21.97% 8.353us 2.784us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 19.36% 7.360us 2.453us 3
Activity Buffer Request 30.85% 253.806us 30.85% 253.806us 253.806us 2.593us 6.82% 2.593us 2.593us 1
aten::empty_strided 3.54% 29.090us 3.54% 29.090us 4.848us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.29% 208.074us 25.29% 208.074us 23.119us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.19% 18.051us 2.84% 23.371us 2.597us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.11% 9.160us 1.11% 9.160us 0.611us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.21% 9.961us 1.21% 9.961us 3.320us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.10% 9.062us 1.10% 9.062us 3.021us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.80% 6.580us 0.96% 7.920us 2.640us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 822.611us
Self CUDA time total: 38.017us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 335.486us 522.89% 335.486us 335.486us 1
torch_eager 15.29% 123.163us 99.38% 800.491us 800.491us 0.000us 0.00% 68.256us 68.256us 1
aten::conv1d 0.73% 5.840us 14.87% 119.763us 39.921us 0.000us 0.00% 41.760us 13.920us 3
aten::convolution 1.21% 9.761us 14.14% 113.923us 37.974us 0.000us 0.00% 41.760us 13.920us 3
aten::_convolution 2.84% 22.911us 12.93% 104.162us 34.721us 0.000us 0.00% 41.760us 13.920us 3
aten::_conv_depthwise2d 2.80% 22.570us 8.02% 64.572us 21.524us 41.760us 65.09% 41.760us 13.920us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.760us 65.09% 41.760us 13.920us 3
aten::to 0.73% 5.842us 65.67% 528.904us 88.151us 0.000us 0.00% 26.496us 4.416us 6
aten::_to_copy 2.94% 23.712us 64.94% 523.062us 87.177us 0.000us 0.00% 26.496us 4.416us 6
aten::copy_ 6.02% 48.492us 58.29% 469.521us 78.253us 22.400us 34.91% 26.496us 4.416us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.968us 18.65% 11.968us 3.989us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 16.26% 10.432us 3.477us 3
Activity Buffer Request 29.33% 236.206us 29.33% 236.206us 236.206us 4.096us 6.38% 4.096us 4.096us 1
aten::empty_strided 3.70% 29.829us 3.70% 29.829us 4.971us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.91% 208.693us 25.91% 208.693us 23.188us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.18% 17.569us 2.86% 23.069us 2.563us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.14% 9.222us 1.14% 9.222us 0.615us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.20% 9.631us 1.20% 9.631us 3.210us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.06% 8.501us 1.06% 8.501us 2.834us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.83% 6.660us 0.99% 7.990us 2.663us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 805.451us
Self CUDA time total: 64.160us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.218us 487.48% 340.218us 340.218us 1
torch_eager 15.18% 124.853us 99.38% 817.682us 817.682us 0.000us 0.00% 73.887us 73.887us 1
aten::conv1d 0.72% 5.910us 14.57% 119.903us 39.968us 0.000us 0.00% 47.328us 15.776us 3
aten::convolution 1.21% 9.960us 13.86% 113.993us 37.998us 0.000us 0.00% 47.328us 15.776us 3
aten::_convolution 2.81% 23.101us 12.64% 104.033us 34.678us 0.000us 0.00% 47.328us 15.776us 3
aten::_conv_depthwise2d 2.62% 21.561us 7.83% 64.432us 21.477us 47.328us 67.81% 47.328us 15.776us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.328us 67.81% 47.328us 15.776us 3
aten::to 0.75% 6.180us 66.30% 545.475us 90.913us 0.000us 0.00% 26.559us 4.426us 6
aten::_to_copy 2.97% 24.459us 65.55% 539.295us 89.882us 0.000us 0.00% 26.559us 4.426us 6
aten::copy_ 6.14% 50.491us 58.93% 484.862us 80.810us 22.463us 32.19% 26.559us 4.426us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.032us 17.24% 12.032us 4.011us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 14.95% 10.431us 3.477us 3
Activity Buffer Request 30.21% 248.576us 30.21% 248.576us 248.576us 4.096us 5.87% 4.096us 4.096us 1
aten::empty_strided 3.64% 29.974us 3.64% 29.974us 4.996us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.32% 208.345us 25.32% 208.345us 23.149us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.09% 17.201us 2.72% 22.401us 2.489us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.11% 9.120us 1.11% 9.120us 0.608us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.32% 10.899us 1.32% 10.899us 3.633us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.15% 9.422us 1.15% 9.422us 3.141us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.80% 6.580us 0.98% 8.070us 2.690us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 822.752us
Self CUDA time total: 69.791us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 357.276us 192.10% 357.276us 357.276us 1
torch_eager 7.25% 148.445us 99.75% 2.043ms 2.043ms 0.000us 0.00% 196.063us 196.063us 1
aten::conv1d 0.28% 5.714us 6.04% 123.725us 41.242us 0.000us 0.00% 133.535us 44.512us 3
aten::convolution 0.50% 10.209us 5.76% 118.011us 39.337us 0.000us 0.00% 133.535us 44.512us 3
aten::_convolution 1.22% 24.922us 5.26% 107.802us 35.934us 0.000us 0.00% 133.535us 44.512us 3
aten::_conv_depthwise2d 1.06% 21.740us 3.25% 66.540us 22.180us 133.535us 71.80% 133.535us 44.512us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.535us 71.80% 133.535us 44.512us 3
aten::to 0.32% 6.558us 85.01% 1.741ms 290.215us 0.000us 0.00% 62.528us 10.421us 6
aten::_to_copy 1.28% 26.242us 84.69% 1.735ms 289.122us 0.000us 0.00% 62.528us 10.421us 6
aten::copy_ 2.37% 48.539us 81.91% 1.678ms 279.634us 52.448us 28.20% 62.528us 10.421us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.536us 15.88% 29.536us 9.845us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 12.32% 22.912us 7.637us 3
Activity Buffer Request 70.45% 1.443ms 70.45% 1.443ms 1.443ms 10.080us 5.42% 10.080us 10.080us 1
aten::empty_strided 1.50% 30.691us 1.50% 30.691us 5.115us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.22% 209.265us 10.22% 209.265us 23.252us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.93% 19.072us 1.20% 24.640us 2.738us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.45% 9.247us 0.45% 9.247us 0.616us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.55% 11.270us 0.55% 11.270us 3.757us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.51% 10.520us 0.51% 10.520us 3.507us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.29% 5.931us 0.35% 7.230us 2.410us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.048ms
Self CUDA time total: 185.983us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 358.235us 170.21% 358.235us 358.235us 1
torch_eager 15.50% 124.275us 99.34% 796.461us 796.461us 0.000us 0.00% 224.253us 224.253us 1
aten::conv1d 0.70% 5.590us 14.78% 118.483us 39.494us 0.000us 0.00% 154.174us 51.391us 3
aten::convolution 1.24% 9.921us 14.08% 112.893us 37.631us 0.000us 0.00% 154.174us 51.391us 3
aten::_convolution 2.81% 22.549us 12.84% 102.972us 34.324us 0.000us 0.00% 154.174us 51.391us 3
aten::_conv_depthwise2d 2.82% 22.632us 8.11% 65.062us 21.687us 154.174us 73.26% 154.174us 51.391us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 154.174us 73.26% 154.174us 51.391us 3
aten::to 0.74% 5.971us 65.46% 524.833us 87.472us 0.000us 0.00% 70.079us 11.680us 6
aten::_to_copy 3.23% 25.880us 64.72% 518.862us 86.477us 0.000us 0.00% 70.079us 11.680us 6
aten::copy_ 6.33% 50.713us 57.67% 462.401us 77.067us 56.287us 26.74% 70.079us 11.680us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 33.248us 15.80% 33.248us 11.083us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.039us 10.95% 23.039us 7.680us 3
Activity Buffer Request 28.19% 225.995us 28.19% 225.995us 225.995us 13.792us 6.55% 13.792us 13.792us 1
aten::empty_strided 3.81% 30.581us 3.81% 30.581us 5.097us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.98% 208.263us 25.98% 208.263us 23.140us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.24% 17.992us 2.91% 23.301us 2.589us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.16% 9.309us 1.16% 9.309us 0.621us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.31% 10.480us 1.31% 10.480us 3.493us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.17% 9.380us 1.17% 9.380us 3.127us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.74% 5.910us 0.92% 7.370us 2.457us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 801.751us
Self CUDA time total: 210.461us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 7.15% 131.473us 52.77% 970.085us 970.085us 0.000us 0.00% 1.521ms 1.521ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.421ms 100.40% 1.421ms 1.421ms 1
aten::to 0.36% 6.571us 37.17% 683.219us 113.870us 0.000us 0.00% 824.180us 137.363us 6
aten::_to_copy 1.61% 29.612us 36.81% 676.648us 112.775us 0.000us 0.00% 824.180us 137.363us 6
aten::copy_ 2.81% 51.569us 25.14% 462.051us 77.009us 718.613us 50.76% 824.180us 137.363us 6
aten::conv1d 0.36% 6.680us 6.82% 125.423us 41.808us 0.000us 0.00% 696.981us 232.327us 3
aten::convolution 0.57% 10.460us 6.46% 118.743us 39.581us 0.000us 0.00% 696.981us 232.327us 3
aten::_convolution 1.31% 24.040us 5.89% 108.283us 36.094us 0.000us 0.00% 696.981us 232.327us 3
aten::_conv_depthwise2d 1.25% 22.981us 3.69% 67.913us 22.638us 696.981us 49.24% 696.981us 232.327us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 696.981us 49.24% 696.981us 232.327us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 410.458us 29.00% 410.458us 136.819us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 308.155us 21.77% 308.155us 102.718us 3
Activity Buffer Request 11.91% 218.936us 11.91% 218.936us 218.936us 105.567us 7.46% 105.567us 105.567us 1
aten::empty_strided 2.01% 37.011us 10.06% 184.985us 30.831us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.74% 215.777us 11.74% 215.777us 23.975us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.99% 18.200us 1.31% 24.000us 2.667us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.53% 9.740us 0.53% 9.740us 0.649us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.59% 10.839us 0.59% 10.839us 3.613us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.54% 9.862us 0.54% 9.862us 3.287us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.34% 6.240us 0.42% 7.700us 2.567us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.838ms
Self CUDA time total: 1.416ms
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 6.74% 124.615us 43.66% 806.720us 806.720us 0.000us 0.00% 1.502ms 1.502ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.433ms 100.41% 1.433ms 1.433ms 1
aten::to 0.34% 6.269us 28.35% 523.751us 87.292us 0.000us 0.00% 764.786us 127.464us 6
aten::_to_copy 1.27% 23.480us 28.01% 517.482us 86.247us 0.000us 0.00% 764.786us 127.464us 6
aten::copy_ 2.74% 50.661us 25.15% 464.712us 77.452us 690.099us 48.36% 764.786us 127.464us 6
aten::conv1d 0.32% 5.870us 7.00% 129.374us 43.125us 0.000us 0.00% 737.040us 245.680us 3
aten::convolution 0.54% 9.999us 6.68% 123.504us 41.168us 0.000us 0.00% 737.040us 245.680us 3
aten::_convolution 1.31% 24.293us 6.14% 113.505us 37.835us 0.000us 0.00% 737.040us 245.680us 3
aten::_conv_depthwise2d 1.62% 30.010us 3.95% 73.060us 24.353us 737.040us 51.64% 737.040us 245.680us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 737.040us 51.64% 737.040us 245.680us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 399.673us 28.01% 399.673us 133.224us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 290.426us 20.35% 290.426us 96.809us 3
Activity Buffer Request 12.15% 224.466us 12.15% 224.466us 224.466us 74.687us 5.23% 74.687us 74.687us 1
aten::empty_strided 1.59% 29.290us 1.59% 29.290us 4.882us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.52% 212.785us 11.52% 212.785us 23.643us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.94% 17.281us 1.23% 22.771us 2.530us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.55% 10.081us 0.55% 10.081us 0.672us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.57% 10.440us 0.57% 10.440us 3.480us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.51% 9.410us 0.51% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.33% 6.150us 0.41% 7.641us 2.547us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.848ms
Self CUDA time total: 1.427ms
impl wl p50(ms) ok
torch_eager cuda_B2_D2048_S128_W2 0.09 True
torch_eager cuda_B2_D2048_S128_W4 0.08 True
torch_eager cuda_B2_D2048_S2048_W2 0.15 True
torch_eager cuda_B2_D2048_S2048_W4 0.16 True
torch_eager cuda_B2_D2048_S512_W2 0.09 True
torch_eager cuda_B2_D2048_S512_W4 0.09 True
torch_eager cuda_B2_D64_S128_W2 0.07 True
torch_eager cuda_B2_D64_S128_W4 0.09 True
torch_eager cuda_B2_D64_S2048_W2 0.09 True
torch_eager cuda_B2_D64_S2048_W4 0.09 True
torch_eager cuda_B2_D64_S512_W2 0.09 True
torch_eager cuda_B2_D64_S512_W4 0.09 True
torch_eager cuda_B4_D2048_S128_W2 0.09 True
torch_eager cuda_B4_D2048_S128_W4 0.09 True
torch_eager cuda_B4_D2048_S2048_W2 0.49 True
torch_eager cuda_B4_D2048_S2048_W4 0.50 True
torch_eager cuda_B4_D2048_S512_W2 0.10 True
torch_eager cuda_B4_D2048_S512_W4 0.10 True
torch_eager cuda_B4_D64_S128_W2 0.09 True
torch_eager cuda_B4_D64_S128_W4 0.08 True
torch_eager cuda_B4_D64_S2048_W2 0.09 True
torch_eager cuda_B4_D64_S2048_W4 0.09 True
torch_eager cuda_B4_D64_S512_W2 0.09 True
torch_eager cuda_B4_D64_S512_W4 0.09 True