Running rotary benchmark on cuda with 24 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.033ms 1157.58% 1.033ms 1.033ms 1
torch_eager 14.26% 386.998us 99.70% 2.705ms 2.705ms 0.000us 0.00% 90.431us 90.431us 1
aten::mul 6.08% 164.867us 10.45% 283.577us 11.816us 46.976us 52.65% 46.976us 1.957us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.976us 52.65% 46.976us 1.957us 24
aten::copy_ 3.96% 107.533us 62.14% 1.686ms 93.665us 28.959us 32.46% 30.175us 1.676us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.303us 25.00% 22.303us 1.859us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.280us 14.89% 13.280us 1.107us 12
aten::clone 1.58% 42.971us 61.19% 1.660ms 276.703us 0.000us 0.00% 7.872us 1.312us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 7.46% 6.656us 1.109us 6
aten::sub 1.73% 46.871us 2.69% 72.911us 12.152us 6.656us 7.46% 6.656us 1.109us 6
aten::add 1.35% 36.531us 2.16% 58.672us 9.779us 6.624us 7.42% 6.624us 1.104us 6
Activity Buffer Request 53.14% 1.442ms 53.14% 1.442ms 1.442ms 1.216us 1.36% 1.216us 1.216us 1
aten::empty_strided 2.28% 61.772us 2.28% 61.772us 10.295us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 2.73% 74.144us 2.73% 74.144us 12.357us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 3.20% 86.920us 4.13% 112.081us 4.670us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.93% 25.161us 0.93% 25.161us 1.048us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.45% 229.371us 8.45% 229.371us 4.779us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.30% 8.270us 0.30% 8.270us 8.270us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.713ms
Self CUDA time total: 89.215us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 920.346us 1019.88% 920.346us 920.346us 1
torch_eager 11.67% 287.669us 99.75% 2.459ms 2.459ms 0.000us 0.00% 91.392us 91.392us 1
aten::mul 5.97% 147.150us 10.47% 258.131us 10.755us 47.681us 52.84% 47.681us 1.987us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.681us 52.84% 47.681us 1.987us 24
aten::copy_ 4.01% 98.743us 66.94% 1.650ms 91.665us 29.184us 32.34% 30.335us 1.685us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.433us 24.86% 22.433us 1.869us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.376us 14.82% 13.376us 1.115us 12
aten::clone 0.96% 23.772us 64.13% 1.581ms 263.446us 0.000us 0.00% 7.902us 1.317us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.751us 7.48% 6.751us 1.125us 6
aten::sub 1.51% 37.314us 2.51% 61.954us 10.326us 6.720us 7.45% 6.720us 1.120us 6
aten::add 1.33% 32.821us 2.21% 54.451us 9.075us 6.656us 7.38% 6.656us 1.109us 6
Activity Buffer Request 58.20% 1.434ms 58.20% 1.434ms 1.434ms 1.151us 1.28% 1.151us 1.151us 1
aten::empty_strided 1.33% 32.830us 1.33% 32.830us 5.472us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 2.21% 54.420us 2.21% 54.420us 9.070us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.84% 69.900us 3.65% 89.853us 3.744us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.81% 19.953us 0.81% 19.953us 0.831us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.92% 219.731us 8.92% 219.731us 4.578us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.25% 6.050us 0.25% 6.050us 6.050us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.465ms
Self CUDA time total: 90.241us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 909.568us 966.47% 909.568us 909.568us 1
torch_eager 11.23% 276.876us 99.79% 2.460ms 2.460ms 0.000us 0.00% 95.424us 95.424us 1
aten::mul 6.27% 154.461us 10.66% 262.794us 10.950us 48.800us 51.85% 48.800us 2.033us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.800us 51.85% 48.800us 2.033us 24
aten::copy_ 4.02% 99.094us 67.67% 1.668ms 92.677us 30.912us 32.85% 32.224us 1.790us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.008us 24.45% 23.008us 1.917us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.400us 15.30% 14.400us 1.200us 12
aten::clone 0.93% 22.950us 64.64% 1.593ms 265.583us 0.000us 0.00% 9.216us 1.536us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 8.40% 7.904us 1.317us 6
aten::sub 1.56% 38.564us 2.52% 62.034us 10.339us 7.200us 7.65% 7.200us 1.200us 6
aten::add 1.24% 30.660us 2.12% 52.250us 8.708us 7.200us 7.65% 7.200us 1.200us 6
Activity Buffer Request 58.87% 1.451ms 58.87% 1.451ms 1.451ms 1.312us 1.39% 1.312us 1.312us 1
aten::empty_strided 1.24% 30.531us 1.24% 30.531us 5.089us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 2.20% 54.240us 2.20% 54.240us 9.040us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.65% 65.401us 3.42% 84.323us 3.513us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.77% 18.922us 0.77% 18.922us 0.788us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.80% 216.993us 8.80% 216.993us 4.521us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.21% 5.190us 0.21% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.465ms
Self CUDA time total: 94.112us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 892.572us 880.74% 892.572us 892.572us 1
torch_eager 11.35% 283.366us 99.78% 2.492ms 2.492ms 0.000us 0.00% 102.687us 102.687us 1
aten::mul 5.93% 148.202us 10.19% 254.513us 10.605us 52.956us 52.25% 52.956us 2.207us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.956us 52.25% 52.956us 2.207us 24
aten::copy_ 3.94% 98.395us 68.27% 1.705ms 94.725us 32.482us 32.05% 33.826us 1.879us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.641us 24.31% 24.641us 2.053us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.905us 15.69% 15.905us 1.325us 12
aten::clone 0.86% 21.380us 65.50% 1.636ms 272.651us 0.000us 0.00% 9.185us 1.531us 6
aten::add 1.24% 31.000us 2.12% 53.041us 8.840us 8.032us 7.93% 8.032us 1.339us 6
aten::sub 1.40% 35.052us 2.32% 58.022us 9.670us 7.873us 7.77% 7.873us 1.312us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.841us 7.74% 7.841us 1.307us 6
Activity Buffer Request 52.43% 1.309ms 52.43% 1.309ms 1.309ms 1.344us 1.33% 1.344us 1.344us 1
aten::empty_strided 1.32% 33.071us 1.32% 33.071us 5.512us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 9.52% 237.764us 9.52% 237.764us 39.627us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.60% 64.825us 3.35% 83.624us 3.484us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.75% 18.799us 0.75% 18.799us 0.783us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.44% 210.793us 8.44% 210.793us 4.392us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.22% 5.611us 0.22% 5.611us 5.611us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.498ms
Self CUDA time total: 101.343us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 907.478us 966.25% 907.478us 907.478us 1
torch_eager 11.02% 305.318us 99.81% 2.765ms 2.765ms 0.000us 0.00% 95.230us 95.230us 1
aten::mul 5.24% 145.172us 9.20% 254.787us 10.616us 49.023us 52.20% 49.023us 2.043us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.023us 52.20% 49.023us 2.043us 24
aten::copy_ 3.74% 103.536us 70.23% 1.945ms 108.067us 30.719us 32.71% 32.031us 1.779us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 24.40% 22.912us 1.909us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.176us 15.09% 14.176us 1.181us 12
aten::clone 1.09% 30.110us 67.87% 1.880ms 313.329us 0.000us 0.00% 9.119us 1.520us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 8.31% 7.807us 1.301us 6
aten::sub 1.24% 34.480us 2.10% 58.270us 9.712us 7.104us 7.56% 7.104us 1.184us 6
aten::add 1.09% 30.091us 1.87% 51.880us 8.647us 7.072us 7.53% 7.072us 1.179us 6
Activity Buffer Request 52.12% 1.444ms 52.12% 1.444ms 1.444ms 1.312us 1.40% 1.312us 1.312us 1
aten::empty_strided 1.13% 31.430us 1.13% 31.430us 5.238us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 12.15% 336.439us 12.15% 336.439us 56.073us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.48% 68.768us 3.17% 87.719us 3.655us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.68% 18.951us 0.68% 18.951us 0.790us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.82% 216.674us 7.82% 216.674us 4.514us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.19% 5.210us 0.19% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.770ms
Self CUDA time total: 93.918us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 917.786us 906.49% 917.786us 917.786us 1
torch_eager 10.59% 290.695us 99.81% 2.741ms 2.741ms 0.000us 0.00% 102.558us 102.558us 1
aten::mul 5.39% 148.136us 9.30% 255.477us 10.645us 52.735us 52.09% 52.735us 2.197us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.735us 52.09% 52.735us 2.197us 24
aten::copy_ 4.15% 114.085us 70.69% 1.941ms 107.839us 32.512us 32.11% 33.824us 1.879us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.704us 24.40% 24.704us 2.059us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.999us 15.80% 15.999us 1.333us 12
aten::clone 0.78% 21.500us 67.65% 1.858ms 309.627us 0.000us 0.00% 9.120us 1.520us 6
aten::sub 1.39% 38.270us 2.26% 62.070us 10.345us 8.063us 7.96% 8.063us 1.344us 6
aten::add 1.13% 31.111us 1.93% 52.881us 8.813us 7.936us 7.84% 7.936us 1.323us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 7.71% 7.808us 1.301us 6
Activity Buffer Request 52.71% 1.447ms 52.71% 1.447ms 1.447ms 1.312us 1.30% 1.312us 1.312us 1
aten::empty_strided 1.19% 32.762us 1.19% 32.762us 5.460us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 11.56% 317.516us 11.56% 317.516us 52.919us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.38% 65.270us 3.07% 84.260us 3.511us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.69% 18.990us 0.69% 18.990us 0.791us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.83% 214.935us 7.83% 214.935us 4.478us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.19% 5.200us 0.19% 5.200us 5.200us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.746ms
Self CUDA time total: 101.246us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 896.601us 744.17% 896.601us 896.601us 1
torch_eager 10.66% 286.835us 99.81% 2.687ms 2.687ms 0.000us 0.00% 122.275us 122.275us 1
aten::mul 5.47% 147.118us 9.41% 253.291us 10.554us 61.985us 51.45% 61.985us 2.583us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.985us 51.45% 61.985us 2.583us 24
aten::copy_ 3.72% 100.260us 70.38% 1.894ms 105.246us 39.265us 32.59% 41.057us 2.281us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.834us 23.93% 28.834us 2.403us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.233us 15.96% 19.233us 1.603us 12
aten::clone 0.83% 22.211us 67.89% 1.827ms 304.542us 0.000us 0.00% 12.223us 2.037us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 8.66% 10.431us 1.738us 6
aten::add 1.14% 30.799us 1.94% 52.140us 8.690us 9.632us 7.99% 9.632us 1.605us 6
aten::sub 1.37% 36.770us 2.23% 59.970us 9.995us 9.601us 7.97% 9.601us 1.600us 6
Activity Buffer Request 53.18% 1.431ms 53.18% 1.431ms 1.431ms 1.792us 1.49% 1.792us 1.792us 1
aten::empty_strided 1.21% 32.491us 1.21% 32.491us 5.415us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 11.26% 303.147us 11.26% 303.147us 50.525us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.49% 66.932us 3.17% 85.280us 3.553us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.68% 18.348us 0.68% 18.348us 0.765us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.81% 210.347us 7.81% 210.347us 4.382us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.19% 5.020us 0.19% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.692ms
Self CUDA time total: 120.483us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 885.202us 514.56% 885.202us 885.202us 1
torch_eager 18.81% 279.303us 99.64% 1.480ms 1.480ms 0.000us 0.00% 174.944us 174.944us 1
aten::mul 9.70% 144.115us 16.98% 252.116us 10.505us 89.439us 51.99% 89.439us 3.727us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.439us 51.99% 89.439us 3.727us 24
aten::copy_ 6.85% 101.723us 47.28% 702.206us 39.011us 57.632us 33.50% 60.544us 3.364us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.608us 23.60% 40.608us 3.384us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.961us 14.51% 24.961us 2.080us 12
aten::clone 1.41% 20.892us 42.46% 630.635us 105.106us 0.000us 0.00% 19.936us 3.323us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 9.90% 17.024us 2.837us 6
aten::add 2.07% 30.702us 3.51% 52.142us 8.690us 12.545us 7.29% 12.545us 2.091us 6
aten::sub 2.41% 35.732us 4.00% 59.442us 9.907us 12.416us 7.22% 12.416us 2.069us 6
Activity Buffer Request 17.15% 254.675us 17.15% 254.675us 254.675us 2.912us 1.69% 2.912us 2.912us 1
aten::empty_strided 2.07% 30.780us 2.07% 30.780us 5.130us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 19.36% 287.456us 19.36% 287.456us 47.909us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.32% 64.164us 5.58% 82.803us 3.450us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.26% 18.639us 1.26% 18.639us 0.777us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 14.24% 211.503us 14.24% 211.503us 4.406us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.36% 5.410us 0.36% 5.410us 5.410us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.485ms
Self CUDA time total: 172.032us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 907.735us 751.64% 907.735us 907.735us 1
torch_eager 18.35% 272.536us 99.65% 1.480ms 1.480ms 0.000us 0.00% 122.527us 122.527us 1
aten::mul 9.89% 146.883us 17.48% 259.553us 10.815us 62.078us 51.40% 62.078us 2.587us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.078us 51.40% 62.078us 2.587us 24
aten::copy_ 6.65% 98.730us 45.99% 682.885us 37.938us 39.328us 32.57% 41.088us 2.283us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.832us 23.87% 28.832us 2.403us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.361us 16.03% 19.361us 1.613us 12
aten::clone 2.58% 38.249us 42.54% 631.763us 105.294us 0.000us 0.00% 12.256us 2.043us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.496us 8.69% 10.496us 1.749us 6
aten::add 2.13% 31.663us 3.60% 53.483us 8.914us 9.728us 8.06% 9.728us 1.621us 6
aten::sub 2.35% 34.954us 3.91% 58.043us 9.674us 9.633us 7.98% 9.633us 1.605us 6
Activity Buffer Request 16.88% 250.706us 16.88% 250.706us 250.706us 1.760us 1.46% 1.760us 1.760us 1
aten::empty_strided 2.15% 31.912us 2.15% 31.912us 5.319us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 18.48% 274.437us 18.48% 274.437us 45.739us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.31% 63.964us 5.59% 83.053us 3.461us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.29% 19.089us 1.29% 19.089us 0.795us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 14.59% 216.591us 14.59% 216.591us 4.512us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.35% 5.220us 0.35% 5.220us 5.220us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.485ms
Self CUDA time total: 120.767us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 894.399us 519.81% 894.399us 894.399us 1
torch_eager 10.51% 278.801us 99.79% 2.648ms 2.648ms 0.000us 0.00% 174.911us 174.911us 1
aten::mul 5.47% 145.104us 9.49% 251.734us 10.489us 89.535us 52.04% 89.535us 3.731us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.535us 52.04% 89.535us 3.731us 24
aten::copy_ 3.73% 98.901us 70.34% 1.866ms 103.682us 57.696us 33.53% 60.544us 3.364us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.704us 23.66% 40.704us 3.392us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.832us 14.43% 24.832us 2.069us 12
aten::clone 0.84% 22.190us 67.69% 1.796ms 299.337us 0.000us 0.00% 19.840us 3.307us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 9.88% 16.992us 2.832us 6
aten::sub 1.44% 38.162us 2.33% 61.942us 10.324us 12.448us 7.23% 12.448us 2.075us 6
aten::add 1.15% 30.549us 1.97% 52.171us 8.695us 12.384us 7.20% 12.384us 2.064us 6
Activity Buffer Request 54.02% 1.433ms 54.02% 1.433ms 1.433ms 2.848us 1.66% 2.848us 2.848us 1
aten::empty_strided 1.13% 30.052us 1.13% 30.052us 5.009us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 10.37% 275.065us 10.37% 275.065us 45.844us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.49% 65.991us 3.19% 84.601us 3.525us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.70% 18.610us 0.70% 18.610us 0.775us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.95% 211.023us 7.95% 211.023us 4.396us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.21% 5.640us 0.21% 5.640us 5.640us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.653ms
Self CUDA time total: 172.063us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 888.595us 313.84% 888.595us 888.595us 1
torch_eager 18.64% 271.692us 99.64% 1.452ms 1.452ms 0.000us 0.00% 301.536us 301.536us 1
aten::mul 9.98% 145.418us 17.29% 252.060us 10.503us 132.896us 46.94% 132.896us 5.537us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 132.896us 46.94% 132.896us 5.537us 24
aten::copy_ 6.89% 100.362us 46.38% 676.084us 37.560us 109.376us 38.63% 127.776us 7.099us 18
aten::clone 1.48% 21.511us 41.22% 600.853us 100.142us 0.000us 0.00% 70.560us 11.760us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.216us 20.21% 57.216us 4.768us 12
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.160us 18.42% 52.160us 8.693us 6
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.864us 14.43% 40.864us 3.405us 12
aten::sub 2.41% 35.143us 4.02% 58.572us 9.762us 20.512us 7.24% 20.512us 3.419us 6
aten::add 2.12% 30.932us 3.62% 52.783us 8.797us 20.352us 7.19% 20.352us 3.392us 6
Activity Buffer Request 16.97% 247.406us 16.97% 247.406us 247.406us 18.400us 6.50% 18.400us 18.400us 1
aten::empty_strided 2.15% 31.370us 2.15% 31.370us 5.228us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 18.35% 267.496us 18.35% 267.496us 44.583us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.85% 70.742us 6.06% 88.302us 3.679us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.20% 17.560us 1.20% 17.560us 0.732us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 14.59% 212.742us 14.59% 212.742us 4.432us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.36% 5.280us 0.36% 5.280us 5.280us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.458ms
Self CUDA time total: 283.136us
======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 944.856us 167.33% 944.856us 944.856us 1
torch_eager 19.10% 286.874us 99.66% 1.497ms 1.497ms 0.000us 0.00% 588.218us 588.218us 1
aten::copy_ 6.48% 97.352us 44.49% 668.224us 37.124us 273.885us 48.50% 297.437us 16.524us 18
aten::mul 11.54% 173.280us 19.20% 288.361us 12.015us 224.990us 39.84% 224.990us 9.375us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 224.990us 39.84% 224.990us 9.375us 24
aten::clone 1.34% 20.121us 39.51% 593.393us 98.899us 0.000us 0.00% 206.910us 34.485us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 183.358us 32.47% 183.358us 30.560us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.527us 16.03% 90.527us 7.544us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.791us 11.65% 65.791us 5.483us 12
aten::sub 2.45% 36.872us 4.07% 61.073us 10.179us 33.407us 5.92% 33.407us 5.568us 6
aten::add 2.13% 32.018us 3.64% 54.631us 9.105us 32.384us 5.74% 32.384us 5.397us 6
Activity Buffer Request 16.63% 249.816us 16.63% 249.816us 249.816us 23.552us 4.17% 23.552us 23.552us 1
aten::empty_strided 2.02% 30.350us 2.02% 30.350us 5.058us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 17.28% 259.545us 17.28% 259.545us 43.258us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.52% 67.913us 5.81% 87.211us 3.634us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.28% 19.298us 1.28% 19.298us 0.804us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 14.87% 223.406us 14.87% 223.406us 4.654us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.34% 5.141us 0.34% 5.141us 5.141us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.502ms
Self CUDA time total: 564.666us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 916.448us 990.96% 916.448us 916.448us 1
torch_eager 10.63% 281.892us 99.80% 2.647ms 2.647ms 0.000us 0.00% 93.601us 93.601us 1
aten::mul 5.58% 148.028us 9.67% 256.571us 10.690us 49.634us 53.67% 49.634us 2.068us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.634us 53.67% 49.634us 2.068us 24
aten::copy_ 3.99% 105.971us 69.88% 1.854ms 102.991us 29.439us 31.83% 30.559us 1.698us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.655us 24.50% 22.655us 1.888us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.408us 14.50% 13.408us 1.117us 12
aten::clone 0.82% 21.802us 66.79% 1.772ms 295.325us 0.000us 0.00% 7.904us 1.317us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 7.34% 6.784us 1.131us 6
aten::sub 1.36% 36.061us 2.24% 59.441us 9.907us 6.720us 7.27% 6.720us 1.120us 6
aten::add 1.25% 33.260us 2.10% 55.590us 9.265us 6.688us 7.23% 6.688us 1.115us 6
Activity Buffer Request 54.00% 1.433ms 54.00% 1.433ms 1.433ms 1.120us 1.21% 1.120us 1.120us 1
aten::empty_strided 1.13% 29.861us 1.13% 29.861us 4.977us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 9.52% 252.488us 9.52% 252.488us 42.081us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.59% 68.801us 3.33% 88.471us 3.686us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.74% 19.670us 0.74% 19.670us 0.820us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 8.18% 216.965us 8.18% 216.965us 4.520us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.20% 5.410us 0.20% 5.410us 5.410us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.653ms
Self CUDA time total: 92.481us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 888.809us 923.35% 888.809us 888.809us 1
torch_eager 19.05% 273.129us 99.67% 1.429ms 1.429ms 0.000us 0.00% 97.571us 97.571us 1
aten::mul 10.09% 144.695us 17.61% 252.506us 10.521us 51.232us 53.22% 51.232us 2.135us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.232us 53.22% 51.232us 2.135us 24
aten::copy_ 6.72% 96.301us 45.37% 650.385us 36.132us 30.786us 31.98% 32.098us 1.783us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.944us 23.84% 22.944us 1.912us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.241us 14.79% 14.241us 1.187us 12
aten::clone 1.39% 19.911us 40.43% 579.513us 96.586us 0.000us 0.00% 9.154us 1.526us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.842us 8.15% 7.842us 1.307us 6
aten::add 2.26% 32.360us 3.79% 54.320us 9.053us 7.136us 7.41% 7.136us 1.189us 6
aten::sub 2.55% 36.551us 4.17% 59.791us 9.965us 7.105us 7.38% 7.105us 1.184us 6
Activity Buffer Request 16.56% 237.415us 16.56% 237.415us 237.415us 1.312us 1.36% 1.312us 1.312us 1
aten::empty_strided 2.18% 31.230us 2.18% 31.230us 5.205us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 17.96% 257.447us 17.96% 257.447us 42.908us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.71% 67.539us 6.11% 87.581us 3.649us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.40% 20.042us 1.40% 20.042us 0.835us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 14.80% 212.233us 14.80% 212.233us 4.422us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.33% 4.690us 0.33% 4.690us 4.690us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.434ms
Self CUDA time total: 96.259us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 903.536us 870.95% 903.536us 903.536us 1
torch_eager 18.87% 271.956us 99.65% 1.436ms 1.436ms 0.000us 0.00% 105.053us 105.053us 1
aten::mul 10.20% 146.935us 17.83% 256.897us 10.704us 55.262us 53.27% 55.262us 2.303us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.262us 53.27% 55.262us 2.303us 24
aten::copy_ 6.83% 98.437us 45.05% 649.198us 36.067us 32.478us 31.31% 33.790us 1.877us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.670us 23.78% 24.670us 2.056us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.001us 15.42% 16.001us 1.333us 12
aten::clone 1.50% 21.580us 40.06% 577.333us 96.222us 0.000us 0.00% 9.120us 1.520us 6
aten::sub 2.49% 35.841us 4.72% 67.992us 11.332us 8.001us 7.71% 8.001us 1.333us 6
aten::add 2.31% 33.350us 3.86% 55.670us 9.278us 8.000us 7.71% 8.000us 1.333us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 7.53% 7.808us 1.301us 6
Activity Buffer Request 16.46% 237.265us 16.46% 237.265us 237.265us 1.312us 1.26% 1.312us 1.312us 1
aten::empty_strided 2.16% 31.090us 2.16% 31.090us 5.182us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 17.50% 252.196us 17.50% 252.196us 42.033us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.40% 63.461us 5.67% 81.650us 3.402us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.26% 18.189us 1.26% 18.189us 0.758us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 15.66% 225.733us 15.66% 225.733us 4.703us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.35% 5.060us 0.35% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.441ms
Self CUDA time total: 103.741us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 903.548us 729.80% 903.548us 903.548us 1
torch_eager 10.56% 280.674us 99.81% 2.652ms 2.652ms 0.000us 0.00% 125.567us 125.567us 1
aten::mul 5.49% 145.805us 9.46% 251.467us 10.478us 65.184us 52.65% 65.184us 2.716us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.184us 52.65% 65.184us 2.716us 24
aten::copy_ 3.75% 99.563us 70.08% 1.862ms 103.468us 39.422us 31.84% 41.182us 2.288us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.928us 23.37% 28.928us 2.411us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.201us 15.51% 19.201us 1.600us 12
aten::clone 0.92% 24.379us 67.48% 1.793ms 298.872us 0.000us 0.00% 12.254us 2.042us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.494us 8.48% 10.494us 1.749us 6
aten::add 1.15% 30.622us 1.96% 52.162us 8.694us 9.633us 7.78% 9.633us 1.606us 6
aten::sub 1.45% 38.422us 2.36% 62.661us 10.443us 9.568us 7.73% 9.568us 1.595us 6
Activity Buffer Request 54.94% 1.460ms 54.94% 1.460ms 1.460ms 1.760us 1.42% 1.760us 1.760us 1
aten::empty_strided 1.16% 30.801us 1.16% 30.801us 5.133us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 9.14% 242.866us 9.14% 242.866us 40.478us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.56% 67.990us 3.30% 87.783us 3.658us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.74% 19.793us 0.74% 19.793us 0.825us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.96% 211.432us 7.96% 211.432us 4.405us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.19% 5.160us 0.19% 5.160us 5.160us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.658ms
Self CUDA time total: 123.807us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 889.436us 855.74% 889.436us 889.436us 1
torch_eager 19.42% 274.045us 99.59% 1.406ms 1.406ms 0.000us 0.00% 105.282us 105.282us 1
aten::mul 10.41% 146.921us 18.18% 256.563us 10.690us 55.486us 53.38% 55.486us 2.312us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.486us 53.38% 55.486us 2.312us 24
aten::copy_ 6.82% 96.302us 44.56% 628.895us 34.939us 32.513us 31.28% 33.857us 1.881us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.705us 23.77% 24.705us 2.059us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.939us 15.34% 15.939us 1.328us 12
aten::clone 1.41% 19.928us 39.46% 556.871us 92.812us 0.000us 0.00% 9.152us 1.525us 6
aten::sub 2.56% 36.082us 4.16% 58.744us 9.791us 7.970us 7.67% 7.970us 1.328us 6
aten::add 2.23% 31.511us 3.85% 54.282us 9.047us 7.969us 7.67% 7.969us 1.328us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 7.51% 7.808us 1.301us 6
Activity Buffer Request 15.99% 225.676us 15.99% 225.676us 225.676us 1.344us 1.29% 1.344us 1.344us 1
aten::empty_strided 2.17% 30.631us 2.17% 30.631us 5.105us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 17.52% 247.335us 17.52% 247.335us 41.223us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.52% 63.850us 5.84% 82.475us 3.436us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.32% 18.625us 1.32% 18.625us 0.776us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 15.21% 214.657us 15.21% 214.657us 4.472us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.41% 5.810us 0.41% 5.810us 5.810us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.411ms
Self CUDA time total: 103.938us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 888.135us 717.15% 888.135us 888.135us 1
torch_eager 18.91% 268.465us 99.65% 1.415ms 1.415ms 0.000us 0.00% 125.666us 125.666us 1
aten::mul 10.15% 144.114us 17.70% 251.265us 10.469us 65.346us 52.77% 65.346us 2.723us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.346us 52.77% 65.346us 2.723us 24
aten::copy_ 6.90% 97.992us 45.41% 644.725us 35.818us 39.328us 31.76% 41.152us 2.286us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.800us 23.26% 28.800us 2.400us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.168us 15.48% 19.168us 1.597us 12
aten::clone 1.46% 20.690us 40.33% 572.532us 95.422us 0.000us 0.00% 12.352us 2.059us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 8.50% 10.528us 1.755us 6
aten::add 2.19% 31.029us 3.69% 52.390us 8.732us 9.600us 7.75% 9.600us 1.600us 6
aten::sub 2.50% 35.469us 4.13% 58.580us 9.763us 9.568us 7.73% 9.568us 1.595us 6
Activity Buffer Request 15.69% 222.765us 15.69% 222.765us 222.765us 1.824us 1.47% 1.824us 1.824us 1
aten::empty_strided 2.29% 32.500us 2.29% 32.500us 5.417us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 18.50% 262.716us 18.50% 262.716us 43.786us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.70% 66.710us 6.07% 86.108us 3.588us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.37% 19.398us 1.37% 19.398us 0.808us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 14.99% 212.875us 14.99% 212.875us 4.435us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.35% 5.010us 0.35% 5.010us 5.010us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.420ms
Self CUDA time total: 123.842us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 910.045us 513.35% 910.045us 910.045us 1
torch_eager 9.66% 280.213us 99.83% 2.894ms 2.894ms 0.000us 0.00% 180.188us 180.188us 1
aten::mul 5.18% 150.102us 9.00% 260.863us 10.869us 94.655us 53.39% 94.655us 3.944us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.655us 53.39% 94.655us 3.944us 24
aten::copy_ 3.40% 98.673us 72.45% 2.101ms 116.706us 57.885us 32.65% 60.797us 3.378us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.799us 23.01% 40.799us 3.400us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.736us 13.95% 24.736us 2.061us 12
aten::clone 0.79% 22.860us 70.00% 2.030ms 338.262us 0.000us 0.00% 19.998us 3.333us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.086us 9.64% 17.086us 2.848us 6
aten::add 1.13% 32.880us 1.89% 54.761us 9.127us 12.416us 7.00% 12.416us 2.069us 6
aten::sub 1.18% 34.239us 1.98% 57.551us 9.592us 12.320us 6.95% 12.320us 2.053us 6
Activity Buffer Request 58.76% 1.704ms 58.76% 1.704ms 1.704ms 2.912us 1.64% 2.912us 2.912us 1
aten::empty_strided 1.11% 32.150us 1.11% 32.150us 5.358us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 8.21% 238.144us 8.21% 238.144us 39.691us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.29% 66.481us 2.94% 85.213us 3.551us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.65% 18.732us 0.65% 18.732us 0.781us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.46% 216.224us 7.46% 216.224us 4.505us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.17% 5.070us 0.17% 5.070us 5.070us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.899ms
Self CUDA time total: 177.276us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 919.612us 310.44% 919.612us 919.612us 1
torch_eager 10.49% 286.464us 99.82% 2.726ms 2.726ms 0.000us 0.00% 313.057us 313.057us 1
aten::mul 5.34% 145.716us 9.29% 253.789us 10.575us 145.182us 49.01% 145.182us 6.049us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.182us 49.01% 145.182us 6.049us 24
aten::copy_ 3.69% 100.696us 70.60% 1.928ms 107.115us 109.985us 37.13% 126.817us 7.045us 18
aten::clone 0.88% 23.951us 68.02% 1.858ms 309.597us 0.000us 0.00% 69.474us 11.579us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.343us 19.36% 57.343us 4.779us 12
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.642us 17.77% 52.642us 8.774us 6
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.058us 13.86% 41.058us 3.421us 12
aten::sub 1.33% 36.191us 2.18% 59.621us 9.937us 20.609us 6.96% 20.609us 3.435us 6
aten::add 1.14% 31.230us 1.95% 53.190us 8.865us 20.449us 6.90% 20.449us 3.408us 6
Activity Buffer Request 56.07% 1.531ms 56.07% 1.531ms 1.531ms 16.832us 5.68% 16.832us 16.832us 1
aten::empty_strided 1.17% 32.070us 1.17% 32.070us 5.345us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 8.59% 234.696us 8.59% 234.696us 39.116us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.53% 69.062us 3.26% 88.922us 3.705us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.73% 19.860us 0.73% 19.860us 0.827us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 7.86% 214.752us 7.86% 214.752us 4.474us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.18% 4.930us 0.18% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.731ms
Self CUDA time total: 296.225us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 889.394us 501.79% 889.394us 889.394us 1
torch_eager 17.97% 266.975us 99.65% 1.481ms 1.481ms 0.000us 0.00% 180.092us 180.092us 1
aten::mul 9.80% 145.611us 16.96% 251.937us 10.497us 94.974us 53.58% 94.974us 3.957us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.974us 53.58% 94.974us 3.957us 24
aten::copy_ 6.75% 100.282us 47.98% 712.837us 39.602us 57.694us 32.55% 60.542us 3.363us 18
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.736us 22.98% 40.736us 3.395us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.576us 13.87% 24.576us 2.048us 12
aten::clone 1.38% 20.549us 43.06% 639.725us 106.621us 0.000us 0.00% 19.806us 3.301us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.958us 9.57% 16.958us 2.826us 6
aten::sub 2.49% 37.040us 4.14% 61.531us 10.255us 12.289us 6.93% 12.289us 2.048us 6
aten::add 2.11% 31.282us 3.59% 53.402us 8.900us 12.287us 6.93% 12.287us 2.048us 6
Activity Buffer Request 19.87% 295.257us 19.87% 295.257us 295.257us 2.848us 1.61% 2.848us 2.848us 1
aten::empty_strided 2.04% 30.372us 2.04% 30.372us 5.062us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 17.34% 257.637us 17.34% 257.637us 42.940us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.31% 64.000us 5.58% 82.951us 3.456us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.28% 18.951us 1.28% 18.951us 0.790us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 14.31% 212.598us 14.31% 212.598us 4.429us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.35% 5.130us 0.35% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.486ms
Self CUDA time total: 177.244us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 909.016us 306.24% 909.016us 909.016us 1
torch_eager 19.05% 269.264us 99.66% 1.409ms 1.409ms 0.000us 0.00% 314.684us 314.684us 1
aten::mul 10.56% 149.323us 19.02% 268.875us 11.203us 145.440us 49.00% 145.440us 6.060us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.440us 49.00% 145.440us 6.060us 24
aten::copy_ 6.96% 98.305us 44.09% 623.125us 34.618us 110.751us 37.31% 128.606us 7.145us 18
aten::clone 1.45% 20.520us 38.80% 548.422us 91.404us 0.000us 0.00% 71.453us 11.909us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.153us 19.25% 57.153us 4.763us 12
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.598us 18.06% 53.598us 8.933us 6
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.638us 13.69% 40.638us 3.387us 12
aten::add 2.27% 32.070us 3.85% 54.390us 9.065us 20.352us 6.86% 20.352us 3.392us 6
aten::sub 2.35% 33.277us 4.05% 57.282us 9.547us 20.286us 6.83% 20.286us 3.381us 6
Activity Buffer Request 15.96% 225.655us 15.96% 225.655us 225.655us 17.855us 6.02% 17.855us 17.855us 1
aten::empty_strided 2.15% 30.350us 2.15% 30.350us 5.058us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 16.79% 237.294us 16.79% 237.294us 39.549us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.69% 66.249us 6.00% 84.797us 3.533us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.31% 18.548us 1.31% 18.548us 0.773us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 16.11% 227.748us 16.11% 227.748us 4.745us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.34% 4.840us 0.34% 4.840us 4.840us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.413ms
Self CUDA time total: 296.829us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 916.757us 157.09% 916.757us 916.757us 1
torch_eager 19.46% 274.242us 99.65% 1.404ms 1.404ms 0.000us 0.00% 607.350us 607.350us 1
aten::copy_ 7.01% 98.793us 43.42% 611.905us 33.995us 268.603us 46.03% 292.379us 16.243us 18
aten::mul 10.57% 148.926us 18.84% 265.480us 11.062us 249.086us 42.68% 249.086us 10.379us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 249.086us 42.68% 249.086us 10.379us 24
aten::clone 1.44% 20.340us 38.12% 537.253us 89.542us 0.000us 0.00% 202.173us 33.696us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 178.397us 30.57% 178.397us 29.733us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.206us 15.46% 90.206us 7.517us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.885us 11.29% 65.885us 5.490us 12
aten::sub 2.63% 37.022us 4.37% 61.602us 10.267us 33.151us 5.68% 33.151us 5.525us 6
aten::add 2.33% 32.810us 3.92% 55.180us 9.197us 32.734us 5.61% 32.734us 5.456us 6
Activity Buffer Request 15.58% 219.605us 15.58% 219.605us 219.605us 23.776us 4.07% 23.776us 23.776us 1
aten::empty_strided 2.10% 29.631us 2.10% 29.631us 4.938us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 16.49% 232.396us 16.49% 232.396us 38.733us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 4.73% 66.612us 6.10% 85.953us 3.581us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 1.37% 19.341us 1.37% 19.341us 0.806us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 15.94% 224.615us 15.94% 224.615us 4.679us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 0.35% 4.910us 0.35% 4.910us 4.910us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.409ms
Self CUDA time total: 583.574us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 12.10% 272.127us 61.47% 1.382ms 1.382ms 0.000us 0.00% 1.837ms 1.837ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.810ms 102.21% 1.810ms 1.810ms 1
aten::copy_ 4.74% 106.692us 27.02% 607.756us 33.764us 794.110us 44.84% 859.966us 47.776us 18
aten::mul 6.35% 142.895us 11.18% 251.386us 10.474us 829.085us 46.82% 829.085us 34.545us 24
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 829.085us 46.82% 829.085us 34.545us 24
aten::clone 0.94% 21.099us 23.42% 526.743us 87.790us 0.000us 0.00% 627.678us 104.613us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 561.822us 31.73% 561.822us 93.637us 6
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 232.288us 13.12% 232.288us 19.357us 12
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 147.650us 8.34% 147.650us 12.304us 12
aten::sub 1.58% 35.541us 2.61% 58.661us 9.777us 89.538us 5.06% 89.538us 14.923us 6
Activity Buffer Request 9.29% 208.845us 9.29% 208.845us 208.845us 65.856us 3.72% 65.856us 65.856us 1
aten::add 1.43% 32.251us 2.42% 54.461us 9.077us 58.112us 3.28% 58.112us 9.685us 6
aten::empty_strided 1.39% 31.342us 1.39% 31.342us 5.224us 0.000us 0.00% 0.000us 0.000us 6
cudaMemcpyAsync 10.27% 230.957us 10.27% 230.957us 38.493us 0.000us 0.00% 0.000us 0.000us 6
aten::slice 2.99% 67.270us 3.80% 85.550us 3.565us 0.000us 0.00% 0.000us 0.000us 24
aten::as_strided 0.81% 18.280us 0.81% 18.280us 0.762us 0.000us 0.00% 0.000us 0.000us 24
cudaLaunchKernel 9.56% 215.083us 9.56% 215.083us 4.481us 0.000us 0.00% 0.000us 0.000us 48
cudaDeviceSynchronize 38.53% 866.589us 38.53% 866.589us 866.589us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.249ms
Self CUDA time total: 1.771ms
impl wl p50(ms) ok
torch_eager cuda_B1_S128_H32_D128_R64 0.21 True
torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
torch_eager cuda_B1_S128_H8_D128_R64 0.22 True
torch_eager cuda_B1_S128_H8_D64_R32 0.17 True
torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True
torch_eager cuda_B1_S2048_H32_D64_R32 0.21 True
torch_eager cuda_B1_S2048_H8_D128_R64 0.21 True
torch_eager cuda_B1_S2048_H8_D64_R32 0.21 True
torch_eager cuda_B1_S512_H32_D128_R64 0.21 True
torch_eager cuda_B1_S512_H32_D64_R32 0.21 True
torch_eager cuda_B1_S512_H8_D128_R64 0.21 True
torch_eager cuda_B1_S512_H8_D64_R32 0.22 True
torch_eager cuda_B2_S128_H32_D128_R64 0.21 True
torch_eager cuda_B2_S128_H32_D64_R32 0.21 True
torch_eager cuda_B2_S128_H8_D128_R64 0.21 True
torch_eager cuda_B2_S128_H8_D64_R32 0.21 True
torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True
torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
torch_eager cuda_B2_S2048_H8_D64_R32 0.21 True
torch_eager cuda_B2_S512_H32_D128_R64 0.21 True
torch_eager cuda_B2_S512_H32_D64_R32 0.21 True
torch_eager cuda_B2_S512_H8_D128_R64 0.21 True
torch_eager cuda_B2_S512_H8_D64_R32 0.21 True