Running causal_conv1d benchmark on cuda with 24 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 465.824us 2410.10% 465.824us 465.824us 1
torch_eager 10.38% 221.098us 99.69% 2.123ms 2.123ms 0.000us 0.00% 21.632us 21.632us 1
aten::to 0.54% 11.460us 78.80% 1.678ms 279.633us 0.000us 0.00% 14.304us 2.384us 6
aten::_to_copy 2.14% 45.672us 78.26% 1.666ms 277.723us 0.000us 0.00% 14.304us 2.384us 6
aten::copy_ 2.97% 63.201us 73.51% 1.565ms 260.883us 12.000us 62.09% 14.304us 2.384us 6
aten::conv1d 0.45% 9.560us 8.33% 177.314us 59.105us 0.000us 0.00% 7.328us 2.443us 3
aten::convolution 0.76% 16.270us 7.88% 167.754us 55.918us 0.000us 0.00% 7.328us 2.443us 3
aten::_convolution 1.63% 34.781us 7.11% 151.484us 50.495us 0.000us 0.00% 7.328us 2.443us 3
aten::_conv_depthwise2d 2.18% 46.460us 4.51% 96.001us 32.000us 7.328us 37.91% 7.328us 2.443us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 37.91% 7.328us 2.443us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 32.45% 6.272us 2.091us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.64% 5.728us 1.909us 3
Activity Buffer Request 67.39% 1.435ms 67.39% 1.435ms 1.435ms 2.304us 11.92% 2.304us 2.304us 1
aten::empty_strided 2.60% 55.371us 2.60% 55.371us 9.228us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 4.37% 93.031us 4.37% 93.031us 10.337us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.44% 30.589us 1.81% 38.620us 4.291us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.63% 13.371us 0.63% 13.371us 0.891us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.55% 11.811us 0.55% 11.811us 3.937us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.56% 11.940us 0.56% 11.940us 3.980us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.37% 7.972us 0.46% 9.712us 3.237us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.129ms
Self CUDA time total: 19.328us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.863us 1691.38% 332.863us 332.863us 1
torch_eager 6.60% 126.115us 99.71% 1.906ms 1.906ms 0.000us 0.00% 21.792us 21.792us 1
aten::to 0.31% 5.930us 85.54% 1.635ms 272.467us 0.000us 0.00% 13.760us 2.293us 6
aten::_to_copy 1.30% 24.791us 85.23% 1.629ms 271.478us 0.000us 0.00% 13.760us 2.293us 6
aten::copy_ 2.71% 51.809us 82.30% 1.573ms 262.158us 11.648us 59.19% 13.760us 2.293us 6
aten::conv1d 0.31% 5.929us 6.17% 117.852us 39.284us 0.000us 0.00% 8.032us 2.677us 3
aten::convolution 0.53% 10.111us 5.86% 111.923us 37.308us 0.000us 0.00% 8.032us 2.677us 3
aten::_convolution 1.20% 22.951us 5.33% 101.812us 33.937us 0.000us 0.00% 8.032us 2.677us 3
aten::_conv_depthwise2d 1.20% 22.860us 3.35% 64.021us 21.340us 8.032us 40.81% 8.032us 2.677us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.032us 40.81% 8.032us 2.677us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.080us 30.89% 6.080us 2.027us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 28.29% 5.568us 1.856us 3
Activity Buffer Request 77.00% 1.472ms 77.00% 1.472ms 1.472ms 2.112us 10.73% 2.112us 2.112us 1
aten::empty_strided 1.63% 31.132us 1.63% 31.132us 5.189us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.70% 70.762us 3.70% 70.762us 7.862us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.87% 16.659us 1.16% 22.190us 2.466us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.46% 8.781us 0.46% 8.781us 0.585us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.55% 10.521us 0.55% 10.521us 3.507us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.49% 9.390us 0.49% 9.390us 3.130us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.29% 5.540us 0.35% 6.670us 2.223us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.911ms
Self CUDA time total: 19.680us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 355.199us 1913.89% 355.199us 355.199us 1
torch_eager 6.67% 125.171us 99.71% 1.872ms 1.872ms 0.000us 0.00% 20.511us 20.511us 1
aten::to 0.32% 6.091us 84.23% 1.581ms 263.570us 0.000us 0.00% 13.600us 2.267us 6
aten::_to_copy 1.32% 24.859us 83.90% 1.575ms 262.555us 0.000us 0.00% 13.600us 2.267us 6
aten::copy_ 2.70% 50.760us 80.88% 1.518ms 253.083us 11.648us 62.76% 13.600us 2.267us 6
aten::conv1d 0.30% 5.670us 7.37% 138.423us 46.141us 0.000us 0.00% 6.911us 2.304us 3
aten::convolution 0.52% 9.720us 7.07% 132.753us 44.251us 0.000us 0.00% 6.911us 2.304us 3
aten::_convolution 1.24% 23.210us 6.55% 123.033us 41.011us 0.000us 0.00% 6.911us 2.304us 3
aten::_conv_depthwise2d 1.26% 23.712us 4.48% 84.033us 28.011us 6.911us 37.24% 6.911us 2.304us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.911us 37.24% 6.911us 2.304us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 32.24% 5.984us 1.995us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 30.52% 5.664us 1.888us 3
Activity Buffer Request 75.59% 1.419ms 75.59% 1.419ms 1.419ms 1.952us 10.52% 1.952us 1.952us 1
aten::empty_strided 1.70% 31.973us 1.70% 31.973us 5.329us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.83% 72.002us 3.83% 72.002us 8.000us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.89% 16.661us 1.15% 21.682us 2.409us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.48% 8.941us 0.48% 8.941us 0.596us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.49% 28.041us 1.49% 28.041us 9.347us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.47% 8.840us 0.47% 8.840us 2.947us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.32% 5.960us 0.40% 7.470us 2.490us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.878ms
Self CUDA time total: 18.559us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 335.841us 1714.87% 335.841us 335.841us 1
torch_eager 6.09% 125.084us 99.75% 2.047ms 2.047ms 0.000us 0.00% 21.728us 21.728us 1
aten::to 0.29% 6.012us 86.59% 1.777ms 296.210us 0.000us 0.00% 14.049us 2.341us 6
aten::_to_copy 1.18% 24.318us 86.30% 1.771ms 295.209us 0.000us 0.00% 14.049us 2.341us 6
aten::copy_ 2.44% 50.170us 83.64% 1.717ms 286.105us 11.905us 60.79% 14.049us 2.341us 6
aten::conv1d 0.29% 5.981us 5.73% 117.633us 39.211us 0.000us 0.00% 7.679us 2.560us 3
aten::convolution 0.48% 9.909us 5.44% 111.652us 37.217us 0.000us 0.00% 7.679us 2.560us 3
aten::_convolution 1.11% 22.712us 4.96% 101.743us 33.914us 0.000us 0.00% 7.679us 2.560us 3
aten::_conv_depthwise2d 1.08% 22.231us 3.11% 63.781us 21.260us 7.679us 39.21% 7.679us 2.560us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.679us 39.21% 7.679us 2.560us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 31.54% 6.176us 2.059us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.729us 29.25% 5.729us 1.910us 3
Activity Buffer Request 70.17% 1.440ms 70.17% 1.440ms 1.440ms 2.144us 10.95% 2.144us 2.144us 1
aten::empty_strided 1.48% 30.301us 1.48% 30.301us 5.050us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 12.02% 246.676us 12.02% 246.676us 27.408us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.85% 17.450us 1.12% 22.930us 2.548us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.44% 8.940us 0.44% 8.940us 0.596us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.47% 9.630us 0.47% 9.630us 3.210us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.56% 11.490us 0.56% 11.490us 3.830us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.28% 5.710us 0.34% 6.930us 2.310us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.053ms
Self CUDA time total: 19.584us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 329.565us 1339.31% 329.565us 329.565us 1
torch_eager 6.13% 122.184us 99.75% 1.990ms 1.990ms 0.000us 0.00% 26.911us 26.911us 1
aten::to 0.30% 5.979us 86.40% 1.724ms 287.259us 0.000us 0.00% 15.359us 2.560us 6
aten::_to_copy 1.37% 27.300us 86.10% 1.718ms 286.262us 0.000us 0.00% 15.359us 2.560us 6
aten::copy_ 2.45% 48.801us 83.22% 1.660ms 276.655us 13.055us 53.05% 15.359us 2.560us 6
aten::conv1d 0.29% 5.841us 5.86% 116.932us 38.977us 0.000us 0.00% 11.552us 3.851us 3
aten::convolution 0.50% 9.929us 5.57% 111.091us 37.030us 0.000us 0.00% 11.552us 3.851us 3
aten::_convolution 1.16% 23.192us 5.07% 101.162us 33.721us 0.000us 0.00% 11.552us 3.851us 3
aten::_conv_depthwise2d 1.12% 22.341us 3.11% 62.030us 20.677us 11.552us 46.95% 11.552us 3.851us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 46.95% 11.552us 3.851us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.688us 27.18% 6.688us 2.229us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.367us 25.87% 6.367us 2.122us 3
Activity Buffer Request 71.71% 1.430ms 71.71% 1.430ms 1.430ms 2.304us 9.36% 2.304us 2.304us 1
aten::empty_strided 1.52% 30.342us 1.52% 30.342us 5.057us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.06% 200.744us 10.06% 200.744us 22.305us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.86% 17.251us 1.14% 22.681us 2.520us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.45% 9.051us 0.45% 9.051us 0.603us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.48% 9.579us 0.48% 9.579us 3.193us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.50% 10.050us 0.50% 10.050us 3.350us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.30% 6.019us 0.36% 7.270us 2.423us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.995ms
Self CUDA time total: 24.607us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 358.812us 1379.20% 358.812us 358.812us 1
torch_eager 6.94% 139.423us 99.75% 2.005ms 2.005ms 0.000us 0.00% 28.256us 28.256us 1
aten::to 0.33% 6.550us 85.45% 1.717ms 286.205us 0.000us 0.00% 15.199us 2.533us 6
aten::_to_copy 1.20% 24.182us 85.13% 1.711ms 285.114us 0.000us 0.00% 15.199us 2.533us 6
aten::copy_ 2.59% 52.130us 82.30% 1.654ms 275.648us 12.959us 49.81% 15.199us 2.533us 6
aten::conv1d 0.30% 6.120us 5.97% 119.993us 39.998us 0.000us 0.00% 13.057us 4.352us 3
aten::convolution 0.48% 9.660us 5.67% 113.873us 37.958us 0.000us 0.00% 13.057us 4.352us 3
aten::_convolution 1.13% 22.802us 5.19% 104.213us 34.738us 0.000us 0.00% 13.057us 4.352us 3
aten::_conv_depthwise2d 1.09% 21.932us 3.25% 65.242us 21.747us 13.057us 50.19% 13.057us 4.352us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.057us 50.19% 13.057us 4.352us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.623us 25.46% 6.623us 2.208us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 24.35% 6.336us 2.112us 3
Activity Buffer Request 70.68% 1.420ms 70.68% 1.420ms 1.420ms 2.240us 8.61% 2.240us 2.240us 1
aten::empty_strided 1.62% 32.611us 1.62% 32.611us 5.435us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.17% 204.364us 10.17% 204.364us 22.707us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.88% 17.647us 1.15% 23.189us 2.577us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.47% 9.382us 0.47% 9.382us 0.625us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.58% 11.651us 0.58% 11.651us 3.884us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.44% 8.769us 0.44% 8.769us 2.923us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.32% 6.420us 0.39% 7.890us 2.630us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.010ms
Self CUDA time total: 26.016us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.896us 853.65% 328.896us 328.896us 1
torch_eager 6.29% 121.493us 99.73% 1.928ms 1.928ms 0.000us 0.00% 41.088us 41.088us 1
aten::conv1d 0.31% 5.961us 6.00% 115.903us 38.634us 0.000us 0.00% 22.688us 7.563us 3
aten::convolution 0.50% 9.600us 5.69% 109.942us 36.647us 0.000us 0.00% 22.688us 7.563us 3
aten::_convolution 1.16% 22.510us 5.19% 100.342us 33.447us 0.000us 0.00% 22.688us 7.563us 3
aten::_conv_depthwise2d 1.17% 22.551us 3.25% 62.881us 20.960us 22.688us 58.89% 22.688us 7.563us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.688us 58.89% 22.688us 7.563us 3
aten::to 0.33% 6.421us 86.08% 1.664ms 277.308us 0.000us 0.00% 18.400us 3.067us 6
aten::_to_copy 1.25% 24.161us 85.75% 1.657ms 276.238us 0.000us 0.00% 18.400us 3.067us 6
aten::copy_ 2.57% 49.759us 82.93% 1.603ms 267.166us 15.840us 41.11% 18.400us 3.067us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 21.93% 8.448us 2.816us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 19.19% 7.392us 2.464us 3
Activity Buffer Request 71.07% 1.374ms 71.07% 1.374ms 1.374ms 2.560us 6.64% 2.560us 2.560us 1
aten::empty_strided 1.57% 30.271us 1.57% 30.271us 5.045us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.43% 201.525us 10.43% 201.525us 22.392us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.86% 16.701us 1.14% 22.001us 2.445us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.45% 8.751us 0.45% 8.751us 0.583us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.48% 9.290us 0.48% 9.290us 3.097us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.47% 9.060us 0.47% 9.060us 3.020us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.28% 5.459us 0.35% 6.690us 2.230us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.933ms
Self CUDA time total: 38.528us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.458us 810.83% 334.458us 334.458us 1
torch_eager 6.32% 125.394us 99.75% 1.978ms 1.978ms 0.000us 0.00% 43.841us 43.841us 1
aten::conv1d 0.30% 5.899us 5.88% 116.562us 38.854us 0.000us 0.00% 25.600us 8.533us 3
aten::convolution 0.49% 9.810us 5.58% 110.663us 36.888us 0.000us 0.00% 25.600us 8.533us 3
aten::_convolution 1.13% 22.411us 5.09% 100.853us 33.618us 0.000us 0.00% 25.600us 8.533us 3
aten::_conv_depthwise2d 1.14% 22.520us 3.20% 63.392us 21.131us 25.600us 62.06% 25.600us 8.533us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.600us 62.06% 25.600us 8.533us 3
aten::to 0.30% 5.959us 86.14% 1.708ms 284.675us 0.000us 0.00% 18.241us 3.040us 6
aten::_to_copy 1.33% 26.372us 85.84% 1.702ms 283.682us 0.000us 0.00% 18.241us 3.040us 6
aten::copy_ 2.49% 49.420us 83.02% 1.646ms 274.363us 15.649us 37.94% 18.241us 3.040us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.321us 20.17% 8.321us 2.774us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 17.77% 7.328us 2.443us 3
Activity Buffer Request 71.51% 1.418ms 71.51% 1.418ms 1.418ms 2.592us 6.28% 2.592us 2.592us 1
aten::empty_strided 1.49% 29.540us 1.49% 29.540us 4.923us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.06% 199.427us 10.06% 199.427us 22.159us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.92% 18.199us 1.18% 23.330us 2.592us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.44% 8.651us 0.44% 8.651us 0.577us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.54% 10.640us 0.54% 10.640us 3.547us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.48% 9.610us 0.48% 9.610us 3.203us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.28% 5.590us 0.34% 6.770us 2.257us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.983ms
Self CUDA time total: 41.249us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.849us 326.92% 338.849us 338.849us 1
torch_eager 5.95% 117.585us 99.74% 1.970ms 1.970ms 0.000us 0.00% 109.697us 109.697us 1
aten::conv1d 0.30% 5.970us 6.05% 119.502us 39.834us 0.000us 0.00% 71.232us 23.744us 3
aten::convolution 0.49% 9.700us 5.75% 113.532us 37.844us 0.000us 0.00% 71.232us 23.744us 3
aten::_convolution 1.15% 22.781us 5.26% 103.832us 34.611us 0.000us 0.00% 71.232us 23.744us 3
aten::_conv_depthwise2d 1.18% 23.259us 3.31% 65.420us 21.807us 71.232us 68.72% 71.232us 23.744us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 71.232us 68.72% 71.232us 23.744us 3
aten::to 0.31% 6.199us 86.38% 1.706ms 284.313us 0.000us 0.00% 38.465us 6.411us 6
aten::_to_copy 1.31% 25.891us 86.06% 1.700ms 283.280us 0.000us 0.00% 38.465us 6.411us 6
aten::copy_ 2.57% 50.812us 83.17% 1.643ms 273.758us 32.417us 31.28% 38.465us 6.411us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.760us 17.13% 17.760us 5.920us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.657us 14.14% 14.657us 4.886us 3
Activity Buffer Request 71.61% 1.414ms 71.61% 1.414ms 1.414ms 6.048us 5.84% 6.048us 6.048us 1
aten::empty_strided 1.58% 31.240us 1.58% 31.240us 5.207us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.13% 200.155us 10.13% 200.155us 22.239us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.87% 17.181us 1.15% 22.621us 2.513us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.45% 8.941us 0.45% 8.941us 0.596us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.51% 10.050us 0.51% 10.050us 3.350us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.47% 9.370us 0.47% 9.370us 3.123us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.28% 5.551us 0.35% 6.851us 2.284us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.975ms
Self CUDA time total: 103.649us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 357.597us 314.53% 357.597us 357.597us 1
torch_eager 6.01% 120.196us 99.73% 1.995ms 1.995ms 0.000us 0.00% 119.645us 119.645us 1
aten::conv1d 0.28% 5.578us 6.85% 137.112us 45.704us 0.000us 0.00% 81.344us 27.115us 3
aten::convolution 0.47% 9.452us 6.58% 131.534us 43.845us 0.000us 0.00% 81.344us 27.115us 3
aten::_convolution 1.16% 23.298us 6.10% 122.082us 40.694us 0.000us 0.00% 81.344us 27.115us 3
aten::_conv_depthwise2d 1.16% 23.221us 4.15% 82.932us 27.644us 81.344us 71.55% 81.344us 27.115us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 81.344us 71.55% 81.344us 27.115us 3
aten::to 0.33% 6.509us 85.46% 1.710ms 284.935us 0.000us 0.00% 38.301us 6.383us 6
aten::_to_copy 1.29% 25.870us 85.14% 1.703ms 283.850us 0.000us 0.00% 38.301us 6.383us 6
aten::copy_ 2.58% 51.531us 82.27% 1.646ms 274.308us 32.350us 28.45% 38.301us 6.383us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.727us 15.59% 17.727us 5.909us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.623us 12.86% 14.623us 4.874us 3
Activity Buffer Request 70.95% 1.419ms 70.95% 1.419ms 1.419ms 5.951us 5.23% 5.951us 5.951us 1
aten::empty_strided 1.57% 31.380us 1.57% 31.380us 5.230us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.95% 199.044us 9.95% 199.044us 22.116us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.89% 17.740us 1.16% 23.191us 2.577us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.47% 9.433us 0.47% 9.433us 0.629us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.53% 10.531us 0.53% 10.531us 3.510us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.26% 25.130us 1.26% 25.130us 8.377us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.30% 6.010us 0.38% 7.612us 2.537us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.000ms
Self CUDA time total: 113.694us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 5.97% 120.782us 97.66% 1.975ms 1.975ms 0.000us 0.00% 434.301us 434.301us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 421.021us 106.85% 421.021us 421.021us 1
aten::conv1d 0.30% 6.069us 5.79% 117.202us 39.067us 0.000us 0.00% 251.007us 83.669us 3
aten::convolution 0.47% 9.471us 5.49% 111.133us 37.044us 0.000us 0.00% 251.007us 83.669us 3
aten::_convolution 1.10% 22.180us 5.03% 101.662us 33.887us 0.000us 0.00% 251.007us 83.669us 3
aten::_conv_depthwise2d 1.13% 22.779us 3.17% 64.182us 21.394us 251.007us 63.71% 251.007us 83.669us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 251.007us 63.71% 251.007us 83.669us 3
aten::to 0.31% 6.200us 84.52% 1.710ms 284.917us 0.000us 0.00% 183.294us 30.549us 6
aten::_to_copy 1.19% 24.072us 84.22% 1.703ms 283.884us 0.000us 0.00% 183.294us 30.549us 6
aten::copy_ 2.45% 49.593us 81.56% 1.650ms 274.942us 143.007us 36.29% 183.294us 30.549us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 102.495us 26.01% 102.495us 34.165us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.512us 10.28% 40.512us 13.504us 3
Activity Buffer Request 70.36% 1.423ms 70.36% 1.423ms 1.423ms 40.287us 10.22% 40.287us 40.287us 1
aten::empty_strided 1.46% 29.579us 1.46% 29.579us 4.930us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.86% 199.474us 9.86% 199.474us 22.164us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.84% 17.021us 1.11% 22.432us 2.492us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.45% 9.090us 0.45% 9.090us 0.606us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.48% 9.720us 0.48% 9.720us 3.240us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.45% 9.202us 0.45% 9.202us 3.067us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.28% 5.680us 0.35% 7.060us 2.353us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.023ms
Self CUDA time total: 394.014us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 5.89% 122.072us 95.29% 1.975ms 1.975ms 0.000us 0.00% 486.458us 486.458us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 474.010us 106.16% 474.010us 474.010us 1
aten::conv1d 0.28% 5.830us 5.59% 115.853us 38.618us 0.000us 0.00% 299.291us 99.764us 3
aten::convolution 0.46% 9.610us 5.31% 110.023us 36.674us 0.000us 0.00% 299.291us 99.764us 3
aten::_convolution 1.08% 22.439us 4.85% 100.413us 33.471us 0.000us 0.00% 299.291us 99.764us 3
aten::_conv_depthwise2d 1.04% 21.490us 3.04% 62.983us 20.994us 299.291us 67.03% 299.291us 99.764us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 299.291us 67.03% 299.291us 99.764us 3
aten::to 0.31% 6.341us 82.51% 1.710ms 284.962us 0.000us 0.00% 187.167us 31.195us 6
aten::_to_copy 1.23% 25.592us 82.20% 1.703ms 283.906us 0.000us 0.00% 187.167us 31.195us 6
aten::copy_ 2.39% 49.481us 79.48% 1.647ms 274.512us 147.199us 32.97% 187.167us 31.195us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 106.911us 23.94% 106.911us 35.637us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.288us 9.02% 40.288us 13.429us 3
Activity Buffer Request 68.62% 1.422ms 68.62% 1.422ms 1.422ms 39.968us 8.95% 39.968us 39.968us 1
aten::empty_strided 1.48% 30.770us 1.48% 30.770us 5.128us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.53% 197.485us 9.53% 197.485us 21.943us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.81% 16.791us 1.08% 22.301us 2.478us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.44% 9.141us 0.44% 9.141us 0.609us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.47% 9.701us 0.47% 9.701us 3.234us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.48% 9.941us 0.48% 9.941us 3.314us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.27% 5.510us 0.33% 6.790us 2.263us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.072ms
Self CUDA time total: 446.490us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 358.523us 1924.96% 358.523us 358.523us 1
torch_eager 17.94% 139.773us 99.33% 774.049us 774.049us 0.000us 0.00% 20.513us 20.513us 1
aten::to 0.94% 7.351us 62.88% 489.983us 81.664us 0.000us 0.00% 13.376us 2.229us 6
aten::_to_copy 3.20% 24.930us 61.93% 482.632us 80.439us 0.000us 0.00% 13.376us 2.229us 6
aten::copy_ 6.90% 53.742us 54.52% 424.881us 70.813us 11.488us 61.68% 13.376us 2.229us 6
aten::conv1d 0.75% 5.841us 15.01% 116.973us 38.991us 0.000us 0.00% 7.137us 2.379us 3
aten::convolution 1.33% 10.360us 14.26% 111.132us 37.044us 0.000us 0.00% 7.137us 2.379us 3
aten::_convolution 3.01% 23.430us 12.93% 100.772us 33.591us 0.000us 0.00% 7.137us 2.379us 3
aten::_conv_depthwise2d 2.81% 21.882us 7.98% 62.192us 20.731us 7.137us 38.32% 7.137us 2.379us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.137us 38.32% 7.137us 2.379us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 31.61% 5.888us 1.963us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.600us 30.07% 5.600us 1.867us 3
Activity Buffer Request 24.98% 194.695us 24.98% 194.695us 194.695us 1.888us 10.14% 1.888us 1.888us 1
aten::empty_strided 4.21% 32.821us 4.21% 32.821us 5.470us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.28% 197.004us 25.28% 197.004us 21.889us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.16% 16.850us 2.84% 22.160us 2.462us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.13% 8.821us 1.13% 8.821us 0.588us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.22% 9.521us 1.22% 9.521us 3.174us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.31% 10.229us 1.31% 10.229us 3.410us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.74% 5.740us 0.90% 7.020us 2.340us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 779.258us
Self CUDA time total: 18.625us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.763us 1698.07% 328.763us 328.763us 1
torch_eager 14.65% 115.015us 99.34% 779.670us 779.670us 0.000us 0.00% 21.248us 21.248us 1
aten::to 0.80% 6.290us 66.21% 519.631us 86.605us 0.000us 0.00% 13.406us 2.234us 6
aten::_to_copy 3.14% 24.649us 65.41% 513.341us 85.557us 0.000us 0.00% 13.406us 2.234us 6
aten::copy_ 6.80% 53.351us 58.20% 456.761us 76.127us 11.519us 59.50% 13.406us 2.234us 6
aten::conv1d 0.75% 5.880us 15.10% 118.484us 39.495us 0.000us 0.00% 7.842us 2.614us 3
aten::convolution 1.21% 9.513us 14.35% 112.604us 37.535us 0.000us 0.00% 7.842us 2.614us 3
aten::_convolution 2.83% 22.229us 13.14% 103.091us 34.364us 0.000us 0.00% 7.842us 2.614us 3
aten::_conv_depthwise2d 3.15% 24.720us 8.43% 66.141us 22.047us 7.842us 40.50% 7.842us 2.614us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.842us 40.50% 7.842us 2.614us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.887us 30.41% 5.887us 1.962us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.632us 29.09% 5.632us 1.877us 3
Activity Buffer Request 29.55% 231.946us 29.55% 231.946us 231.946us 1.887us 9.75% 1.887us 1.887us 1
aten::empty_strided 4.07% 31.931us 4.07% 31.931us 5.322us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.68% 193.684us 24.68% 193.684us 21.520us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.11% 16.541us 2.75% 21.581us 2.398us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.09% 8.568us 1.09% 8.568us 0.571us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.27% 9.951us 1.27% 9.951us 3.317us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.18% 9.250us 1.18% 9.250us 3.083us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.72% 5.642us 0.89% 6.980us 2.327us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 784.850us
Self CUDA time total: 19.361us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.454us 1698.73% 330.454us 330.454us 1
torch_eager 14.50% 115.185us 99.38% 789.290us 789.290us 0.000us 0.00% 21.628us 21.628us 1
aten::to 0.75% 5.979us 66.62% 529.132us 88.189us 0.000us 0.00% 14.332us 2.389us 6
aten::_to_copy 3.11% 24.732us 65.87% 523.153us 87.192us 0.000us 0.00% 14.332us 2.389us 6
aten::copy_ 6.75% 53.590us 58.69% 466.101us 77.684us 12.157us 62.49% 14.332us 2.389us 6
aten::conv1d 0.72% 5.740us 14.75% 117.122us 39.041us 0.000us 0.00% 7.296us 2.432us 3
aten::convolution 1.18% 9.359us 14.02% 111.382us 37.127us 0.000us 0.00% 7.296us 2.432us 3
aten::_convolution 2.82% 22.362us 12.85% 102.023us 34.008us 0.000us 0.00% 7.296us 2.432us 3
aten::_conv_depthwise2d 2.86% 22.741us 8.10% 64.351us 21.450us 7.296us 37.51% 7.296us 2.432us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.296us 37.51% 7.296us 2.432us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.238us 32.07% 6.238us 2.079us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 30.43% 5.919us 1.973us 3
Activity Buffer Request 30.19% 239.746us 30.19% 239.746us 239.746us 2.175us 11.18% 2.175us 2.175us 1
aten::empty_strided 4.07% 32.320us 4.07% 32.320us 5.387us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.58% 195.235us 24.58% 195.235us 21.693us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.10% 16.713us 2.76% 21.891us 2.432us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.12% 8.919us 1.12% 8.919us 0.595us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.20% 9.570us 1.20% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.20% 9.570us 1.20% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.72% 5.709us 0.89% 7.030us 2.343us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 794.200us
Self CUDA time total: 19.453us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 325.021us 1622.51% 325.021us 325.021us 1
torch_eager 14.95% 114.725us 99.33% 762.279us 762.279us 0.000us 0.00% 22.176us 22.176us 1
aten::to 0.78% 5.949us 65.87% 505.530us 84.255us 0.000us 0.00% 14.272us 2.379us 6
aten::_to_copy 3.19% 24.509us 65.10% 499.581us 83.264us 0.000us 0.00% 14.272us 2.379us 6
aten::copy_ 6.59% 50.599us 57.97% 444.890us 74.148us 12.128us 60.54% 14.272us 2.379us 6
aten::conv1d 0.79% 6.100us 15.11% 115.973us 38.658us 0.000us 0.00% 7.904us 2.635us 3
aten::convolution 1.34% 10.290us 14.32% 109.873us 36.624us 0.000us 0.00% 7.904us 2.635us 3
aten::_convolution 2.97% 22.812us 12.98% 99.583us 33.194us 0.000us 0.00% 7.904us 2.635us 3
aten::_conv_depthwise2d 2.93% 22.501us 8.10% 62.182us 20.727us 7.904us 39.46% 7.904us 2.635us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 39.46% 7.904us 2.635us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 30.99% 6.208us 2.069us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 29.55% 5.920us 1.973us 3
Activity Buffer Request 28.71% 220.306us 28.71% 220.306us 220.306us 2.144us 10.70% 2.144us 2.144us 1
aten::empty_strided 3.93% 30.182us 3.93% 30.182us 5.030us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.32% 194.286us 25.32% 194.286us 21.587us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.11% 16.159us 2.76% 21.209us 2.357us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.09% 8.360us 1.09% 8.360us 0.557us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.23% 9.450us 1.23% 9.450us 3.150us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.29% 9.930us 1.29% 9.930us 3.310us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.71% 5.470us 0.87% 6.670us 2.223us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 767.429us
Self CUDA time total: 20.032us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 356.764us 983.15% 356.764us 356.764us 1
torch_eager 15.53% 123.844us 99.36% 792.350us 792.350us 0.000us 0.00% 38.944us 38.944us 1
aten::conv1d 0.79% 6.320us 15.33% 122.233us 40.744us 0.000us 0.00% 20.320us 6.773us 3
aten::convolution 1.24% 9.851us 14.54% 115.913us 38.638us 0.000us 0.00% 20.320us 6.773us 3
aten::_convolution 2.89% 23.052us 13.30% 106.062us 35.354us 0.000us 0.00% 20.320us 6.773us 3
aten::_conv_depthwise2d 2.97% 23.692us 8.39% 66.891us 22.297us 20.320us 56.00% 20.320us 6.773us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.320us 56.00% 20.320us 6.773us 3
aten::to 0.80% 6.349us 64.76% 516.391us 86.065us 0.000us 0.00% 18.624us 3.104us 6
aten::_to_copy 3.21% 25.572us 63.96% 510.042us 85.007us 0.000us 0.00% 18.624us 3.104us 6
aten::copy_ 6.54% 52.120us 56.52% 450.739us 75.123us 15.968us 44.00% 18.624us 3.104us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.607us 23.72% 8.607us 2.869us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 20.28% 7.361us 2.454us 3
Activity Buffer Request 27.46% 218.966us 27.46% 218.966us 218.966us 2.656us 7.32% 2.656us 2.656us 1
aten::empty_strided 4.23% 33.731us 4.23% 33.731us 5.622us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.38% 202.413us 25.38% 202.413us 22.490us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.20% 17.520us 2.88% 22.939us 2.549us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.21% 9.679us 1.21% 9.679us 0.645us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.40% 11.140us 1.40% 11.140us 3.713us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.17% 9.299us 1.17% 9.299us 3.100us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.75% 6.010us 0.93% 7.450us 2.483us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 797.430us
Self CUDA time total: 36.288us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.353us 866.25% 332.353us 332.353us 1
torch_eager 6.20% 124.083us 99.73% 1.997ms 1.997ms 0.000us 0.00% 40.959us 40.959us 1
aten::conv1d 0.30% 6.071us 5.74% 115.013us 38.338us 0.000us 0.00% 22.592us 7.531us 3
aten::convolution 0.48% 9.660us 5.44% 108.942us 36.314us 0.000us 0.00% 22.592us 7.531us 3
aten::_convolution 1.09% 21.840us 4.96% 99.282us 33.094us 0.000us 0.00% 22.592us 7.531us 3
aten::_conv_depthwise2d 1.15% 22.991us 3.11% 62.342us 20.781us 22.592us 58.88% 22.592us 7.531us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.592us 58.88% 22.592us 7.531us 3
aten::to 0.32% 6.339us 86.44% 1.731ms 288.505us 0.000us 0.00% 18.367us 3.061us 6
aten::_to_copy 1.25% 24.980us 86.12% 1.725ms 287.449us 0.000us 0.00% 18.367us 3.061us 6
aten::copy_ 2.51% 50.252us 83.36% 1.669ms 278.222us 15.775us 41.12% 18.367us 3.061us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.416us 21.94% 8.416us 2.805us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.359us 19.18% 7.359us 2.453us 3
Activity Buffer Request 72.13% 1.445ms 72.13% 1.445ms 1.445ms 2.592us 6.76% 2.592us 2.592us 1
aten::empty_strided 1.52% 30.382us 1.52% 30.382us 5.064us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.74% 194.985us 9.74% 194.985us 21.665us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.87% 17.330us 1.13% 22.630us 2.514us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.45% 8.941us 0.45% 8.941us 0.596us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.48% 9.610us 0.48% 9.610us 3.203us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.46% 9.250us 0.46% 9.250us 3.083us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.27% 5.490us 0.34% 6.780us 2.260us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.003ms
Self CUDA time total: 38.367us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.952us 509.17% 328.952us 328.952us 1
torch_eager 15.31% 114.903us 99.32% 745.599us 745.599us 0.000us 0.00% 68.701us 68.701us 1
aten::conv1d 0.89% 6.660us 15.50% 116.373us 38.791us 0.000us 0.00% 42.238us 14.079us 3
aten::convolution 1.33% 9.952us 14.61% 109.713us 36.571us 0.000us 0.00% 42.238us 14.079us 3
aten::_convolution 2.95% 22.149us 13.29% 99.761us 33.254us 0.000us 0.00% 42.238us 14.079us 3
aten::_conv_depthwise2d 2.94% 22.090us 8.38% 62.891us 20.964us 42.238us 65.38% 42.238us 14.079us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 42.238us 65.38% 42.238us 14.079us 3
aten::to 0.80% 6.039us 65.05% 488.341us 81.390us 0.000us 0.00% 26.463us 4.410us 6
aten::_to_copy 3.23% 24.281us 64.25% 482.302us 80.384us 0.000us 0.00% 26.463us 4.410us 6
aten::copy_ 6.57% 49.302us 56.69% 425.561us 70.927us 22.367us 34.62% 26.463us 4.410us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.936us 18.48% 11.936us 3.979us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 16.15% 10.431us 3.477us 3
Activity Buffer Request 26.58% 199.565us 26.58% 199.565us 199.565us 4.096us 6.34% 4.096us 4.096us 1
aten::empty_strided 4.32% 32.460us 4.32% 32.460us 5.410us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 26.45% 198.565us 26.45% 198.565us 22.063us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.13% 16.001us 2.81% 21.091us 2.343us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.16% 8.690us 1.16% 8.690us 0.579us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.26% 9.490us 1.26% 9.490us 3.163us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.26% 9.440us 1.26% 9.440us 3.147us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.75% 5.611us 0.93% 6.981us 2.327us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 750.709us
Self CUDA time total: 64.605us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.798us 467.68% 328.798us 328.798us 1
torch_eager 14.69% 115.264us 99.37% 779.669us 779.669us 0.000us 0.00% 74.432us 74.432us 1
aten::conv1d 0.75% 5.869us 14.89% 116.853us 38.951us 0.000us 0.00% 47.840us 15.947us 3
aten::convolution 1.20% 9.412us 14.15% 110.984us 36.995us 0.000us 0.00% 47.840us 15.947us 3
aten::_convolution 2.99% 23.451us 12.95% 101.572us 33.857us 0.000us 0.00% 47.840us 15.947us 3
aten::_conv_depthwise2d 2.71% 21.281us 8.10% 63.532us 21.177us 47.840us 68.05% 47.840us 15.947us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.840us 68.05% 47.840us 15.947us 3
aten::to 0.74% 5.828us 66.46% 521.411us 86.902us 0.000us 0.00% 26.592us 4.432us 6
aten::_to_copy 3.27% 25.622us 65.71% 515.583us 85.931us 0.000us 0.00% 26.592us 4.432us 6
aten::copy_ 6.42% 50.382us 58.46% 458.651us 76.442us 22.464us 31.95% 26.592us 4.432us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.032us 17.11% 12.032us 4.011us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 14.84% 10.432us 3.477us 3
Activity Buffer Request 29.93% 234.846us 29.93% 234.846us 234.846us 4.128us 5.87% 4.128us 4.128us 1
aten::empty_strided 3.99% 31.310us 3.99% 31.310us 5.218us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.83% 194.803us 24.83% 194.803us 21.645us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.07% 16.243us 2.72% 21.332us 2.370us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.07% 8.401us 1.07% 8.401us 0.560us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.35% 10.581us 1.35% 10.581us 3.527us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.31% 10.290us 1.31% 10.290us 3.430us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.69% 5.406us 0.84% 6.568us 2.189us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 784.589us
Self CUDA time total: 70.304us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.882us 182.91% 341.882us 341.882us 1
torch_eager 15.14% 117.185us 99.33% 768.879us 768.879us 0.000us 0.00% 197.117us 197.117us 1
aten::conv1d 0.79% 6.110us 14.86% 114.993us 38.331us 0.000us 0.00% 134.270us 44.757us 3
aten::convolution 1.22% 9.451us 14.07% 108.883us 36.294us 0.000us 0.00% 134.270us 44.757us 3
aten::_convolution 2.87% 22.240us 12.85% 99.432us 33.144us 0.000us 0.00% 134.270us 44.757us 3
aten::_conv_depthwise2d 2.84% 21.991us 8.04% 62.222us 20.741us 134.270us 71.84% 134.270us 44.757us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 134.270us 71.84% 134.270us 44.757us 3
aten::to 0.77% 5.950us 65.77% 509.102us 84.850us 0.000us 0.00% 62.847us 10.474us 6
aten::_to_copy 3.29% 25.489us 65.00% 503.152us 83.859us 0.000us 0.00% 62.847us 10.474us 6
aten::copy_ 6.45% 49.889us 57.58% 445.721us 74.287us 52.639us 28.16% 62.847us 10.474us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.728us 15.91% 29.728us 9.909us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.911us 12.26% 22.911us 7.637us 3
Activity Buffer Request 28.61% 221.416us 28.61% 221.416us 221.416us 10.208us 5.46% 10.208us 10.208us 1
aten::empty_strided 4.13% 31.942us 4.13% 31.942us 5.324us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.24% 195.386us 25.24% 195.386us 21.710us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.14% 16.602us 2.90% 22.460us 2.496us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.19% 9.247us 1.19% 9.247us 0.616us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.23% 9.500us 1.23% 9.500us 3.167us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.26% 9.761us 1.26% 9.761us 3.254us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.71% 5.470us 0.87% 6.700us 2.233us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 774.039us
Self CUDA time total: 186.909us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 349.277us 165.88% 349.277us 349.277us 1
torch_eager 15.39% 117.165us 99.36% 756.609us 756.609us 0.000us 0.00% 224.029us 224.029us 1
aten::conv1d 0.74% 5.661us 15.33% 116.734us 38.911us 0.000us 0.00% 154.686us 51.562us 3
aten::convolution 1.20% 9.150us 14.59% 111.073us 37.024us 0.000us 0.00% 154.686us 51.562us 3
aten::_convolution 2.96% 22.532us 13.38% 101.923us 33.974us 0.000us 0.00% 154.686us 51.562us 3
aten::_conv_depthwise2d 2.86% 21.751us 8.47% 64.492us 21.497us 154.686us 73.47% 154.686us 51.562us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 154.686us 73.47% 154.686us 51.562us 3
aten::to 0.84% 6.379us 65.15% 496.150us 82.692us 0.000us 0.00% 69.343us 11.557us 6
aten::_to_copy 3.33% 25.371us 64.32% 489.771us 81.628us 0.000us 0.00% 69.343us 11.557us 6
aten::copy_ 6.44% 49.031us 56.76% 432.240us 72.040us 55.871us 26.53% 69.343us 11.557us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.831us 15.59% 32.831us 10.944us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.040us 10.94% 23.040us 7.680us 3
Activity Buffer Request 27.33% 208.145us 27.33% 208.145us 208.145us 13.472us 6.40% 13.472us 13.472us 1
aten::empty_strided 4.22% 32.160us 4.22% 32.160us 5.360us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.87% 197.025us 25.87% 197.025us 21.892us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.14% 16.329us 2.83% 21.520us 2.391us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.17% 8.932us 1.17% 8.932us 0.595us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.38% 10.500us 1.38% 10.500us 3.500us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.35% 10.280us 1.35% 10.280us 3.427us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.72% 5.468us 0.90% 6.839us 2.280us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 761.499us
Self CUDA time total: 210.557us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 6.72% 121.944us 52.58% 953.714us 953.714us 0.000us 0.00% 1.521ms 1.521ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.421ms 100.41% 1.421ms 1.421ms 1
aten::to 0.35% 6.300us 37.63% 682.555us 113.759us 0.000us 0.00% 824.097us 137.350us 6
aten::_to_copy 1.68% 30.549us 37.28% 676.255us 112.709us 0.000us 0.00% 824.097us 137.350us 6
aten::copy_ 2.98% 53.981us 24.83% 450.422us 75.070us 718.817us 50.79% 824.097us 137.350us 6
aten::conv1d 0.35% 6.281us 6.65% 120.554us 40.185us 0.000us 0.00% 696.543us 232.181us 3
aten::convolution 0.57% 10.251us 6.30% 114.273us 38.091us 0.000us 0.00% 696.543us 232.181us 3
aten::_convolution 1.27% 23.111us 5.73% 104.022us 34.674us 0.000us 0.00% 696.543us 232.181us 3
aten::_conv_depthwise2d 1.23% 22.359us 3.60% 65.321us 21.774us 696.543us 49.21% 696.543us 232.181us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 696.543us 49.21% 696.543us 232.181us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 409.920us 28.96% 409.920us 136.640us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 308.897us 21.82% 308.897us 102.966us 3
Activity Buffer Request 11.98% 217.246us 11.98% 217.246us 217.246us 105.280us 7.44% 105.280us 105.280us 1
aten::empty_strided 2.17% 39.370us 10.77% 195.284us 32.547us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.13% 201.976us 11.13% 201.976us 22.442us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.99% 18.030us 1.31% 23.761us 2.640us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.53% 9.620us 0.53% 9.620us 0.641us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.59% 10.751us 0.59% 10.751us 3.584us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.52% 9.430us 0.52% 9.430us 3.143us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.31% 5.670us 0.39% 7.030us 2.343us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.814ms
Self CUDA time total: 1.415ms
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 4.05% 123.714us 65.96% 2.016ms 2.016ms 0.000us 0.00% 1.502ms 1.502ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.433ms 100.43% 1.433ms 1.433ms 1
aten::to 0.21% 6.507us 56.82% 1.737ms 289.475us 0.000us 0.00% 764.927us 127.488us 6
aten::_to_copy 0.85% 25.961us 56.61% 1.730ms 288.391us 0.000us 0.00% 764.927us 127.488us 6
aten::copy_ 1.76% 53.800us 54.73% 1.673ms 278.832us 689.887us 48.36% 764.927us 127.488us 6
aten::conv1d 0.20% 6.220us 4.18% 127.663us 42.554us 0.000us 0.00% 736.735us 245.578us 3
aten::convolution 0.34% 10.420us 3.97% 121.443us 40.481us 0.000us 0.00% 736.735us 245.578us 3
aten::_convolution 0.75% 22.860us 3.63% 111.023us 37.008us 0.000us 0.00% 736.735us 245.578us 3
aten::_conv_depthwise2d 0.96% 29.441us 2.37% 72.583us 24.194us 736.735us 51.64% 736.735us 245.578us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 736.735us 51.64% 736.735us 245.578us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 397.471us 27.86% 397.471us 132.490us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 292.416us 20.50% 292.416us 97.472us 3
Activity Buffer Request 47.26% 1.445ms 47.26% 1.445ms 1.445ms 75.040us 5.26% 75.040us 75.040us 1
aten::empty_strided 1.03% 31.391us 1.03% 31.391us 5.232us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 6.45% 197.169us 6.45% 197.169us 21.908us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.57% 17.300us 0.75% 22.850us 2.539us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.30% 9.200us 0.30% 9.200us 0.613us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.32% 9.780us 0.32% 9.780us 3.260us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.36% 10.870us 0.36% 10.870us 3.623us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.19% 5.770us 0.23% 7.180us 2.393us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.057ms
Self CUDA time total: 1.427ms
impl wl p50(ms) ok
torch_eager cuda_B2_D2048_S128_W2 0.08 True
torch_eager cuda_B2_D2048_S128_W4 0.08 True
torch_eager cuda_B2_D2048_S2048_W2 0.15 True
torch_eager cuda_B2_D2048_S2048_W4 0.16 True
torch_eager cuda_B2_D2048_S512_W2 0.08 True
torch_eager cuda_B2_D2048_S512_W4 0.08 True
torch_eager cuda_B2_D64_S128_W2 0.07 True
torch_eager cuda_B2_D64_S128_W4 0.09 True
torch_eager cuda_B2_D64_S2048_W2 0.08 True
torch_eager cuda_B2_D64_S2048_W4 0.08 True
torch_eager cuda_B2_D64_S512_W2 0.09 True
torch_eager cuda_B2_D64_S512_W4 0.08 True
torch_eager cuda_B4_D2048_S128_W2 0.08 True
torch_eager cuda_B4_D2048_S128_W4 0.08 True
torch_eager cuda_B4_D2048_S2048_W2 0.49 True
torch_eager cuda_B4_D2048_S2048_W4 0.50 True
torch_eager cuda_B4_D2048_S512_W2 0.09 True
torch_eager cuda_B4_D2048_S512_W4 0.10 True
torch_eager cuda_B4_D64_S128_W2 0.08 True
torch_eager cuda_B4_D64_S128_W4 0.08 True
torch_eager cuda_B4_D64_S2048_W2 0.08 True
torch_eager cuda_B4_D64_S2048_W4 0.09 True
torch_eager cuda_B4_D64_S512_W2 0.08 True
torch_eager cuda_B4_D64_S512_W4 0.08 True