Tue Oct 28 14:07:54 2025
+Wed Oct 29 14:26:44 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A 27C P0 80W / 350W | 0MiB / 46068MiB | 1% Default |
+| N/A 32C P0 133W / 350W | 0MiB / 46068MiB | 100% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
▼ output
▶ uv-logs
|
-Cell: benchmark | 6.88s
+Cell: benchmark | 6.86s
| ▶ run
Copy
Raw
@@ -3970,20 +3970,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 206.526us 1621.34% 206.526us 206.526us 1
- torch_eager 11.16% 213.167us 99.55% 1.902ms 1.902ms 0.000us 0.00% 15.042us 15.042us 1
- aten::silu 3.29% 62.892us 81.79% 1.563ms 520.961us 6.529us 51.26% 8.833us 2.944us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.529us 51.26% 6.529us 2.176us 3
- aten::mul 2.06% 39.382us 3.23% 61.724us 20.575us 6.209us 48.74% 6.209us 2.070us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.209us 48.74% 6.209us 2.070us 3
- Activity Buffer Request 76.05% 1.453ms 76.05% 1.453ms 1.453ms 2.304us 18.09% 2.304us 2.304us 1
- aten::slice 2.72% 51.931us 3.38% 64.581us 10.764us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.66% 12.650us 0.66% 12.650us 2.108us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 3.62% 69.144us 3.62% 69.144us 11.524us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.45% 8.521us 0.45% 8.521us 8.521us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 189.470us 1483.94% 189.470us 189.470us 1
+ torch_eager 11.64% 220.727us 99.60% 1.889ms 1.889ms 0.000us 0.00% 15.103us 15.103us 1
+ aten::silu 3.36% 63.732us 81.84% 1.552ms 517.326us 6.559us 51.37% 8.894us 2.965us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.559us 51.37% 6.559us 2.186us 3
+ aten::mul 1.83% 34.608us 3.05% 57.780us 19.260us 6.209us 48.63% 6.209us 2.070us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.209us 48.63% 6.209us 2.070us 3
+ Activity Buffer Request 76.17% 1.444ms 76.17% 1.444ms 1.444ms 2.335us 18.29% 2.335us 2.335us 1
+ aten::slice 2.47% 46.790us 3.07% 58.281us 9.714us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.61% 11.491us 0.61% 11.491us 1.915us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.54% 67.043us 3.54% 67.043us 11.174us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.40% 7.531us 0.40% 7.531us 7.531us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.911ms
-Self CUDA time total: 12.738us
+Self CPU time total: 1.896ms
+Self CUDA time total: 12.768us
@@ -3993,20 +3993,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.965us 1227.21% 151.965us 151.965us 1
- torch_eager 7.02% 119.974us 99.63% 1.704ms 1.704ms 0.000us 0.00% 14.558us 14.558us 1
- aten::silu 2.35% 40.140us 88.12% 1.507ms 502.320us 6.399us 51.68% 8.574us 2.858us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 160.895us 1299.43% 160.895us 160.895us 1
+ torch_eager 6.82% 117.243us 99.71% 1.713ms 1.713ms 0.000us 0.00% 14.558us 14.558us 1
+ aten::silu 2.46% 42.340us 88.23% 1.516ms 505.362us 6.399us 51.68% 8.575us 2.858us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.68% 6.399us 2.133us 3
- aten::mul 1.61% 27.481us 2.72% 46.541us 15.514us 5.984us 48.32% 5.984us 1.995us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.32% 5.984us 1.995us 3
- Activity Buffer Request 84.14% 1.439ms 84.14% 1.439ms 1.439ms 2.175us 17.56% 2.175us 2.175us 1
- aten::slice 1.43% 24.471us 1.78% 30.412us 5.069us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.35% 5.941us 0.35% 5.941us 0.990us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 2.74% 46.851us 2.74% 46.851us 7.809us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.37% 6.320us 0.37% 6.320us 6.320us 0.000us 0.00% 0.000us 0.000us 1
+ aten::mul 1.64% 28.101us 2.83% 48.681us 16.227us 5.983us 48.32% 5.983us 1.994us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983us 48.32% 5.983us 1.994us 3
+ Activity Buffer Request 84.10% 1.445ms 84.10% 1.445ms 1.445ms 2.176us 17.57% 2.176us 2.176us 1
+ aten::slice 1.47% 25.252us 1.82% 31.222us 5.204us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.35% 5.970us 0.35% 5.970us 0.995us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 2.87% 49.290us 2.87% 49.290us 8.215us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.29% 5.020us 0.29% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.710ms
-Self CUDA time total: 12.383us
+Self CPU time total: 1.718ms
+Self CUDA time total: 12.382us
@@ -4016,20 +4016,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.008us 1139.77% 151.008us 151.008us 1
- torch_eager 6.34% 107.173us 99.70% 1.687ms 1.687ms 0.000us 0.00% 15.522us 15.522us 1
- aten::silu 2.38% 40.332us 88.83% 1.503ms 500.911us 6.817us 51.45% 9.090us 3.030us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.817us 51.45% 6.817us 2.272us 3
- aten::mul 1.57% 26.503us 2.73% 46.253us 15.418us 6.432us 48.55% 6.432us 2.144us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.55% 6.432us 2.144us 3
- Activity Buffer Request 84.91% 1.436ms 84.91% 1.436ms 1.436ms 2.273us 17.16% 2.273us 2.273us 1
- aten::slice 1.43% 24.250us 1.81% 30.550us 5.092us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.37% 6.300us 0.37% 6.300us 1.050us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 2.70% 45.731us 2.70% 45.731us 7.622us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.30% 5.000us 0.30% 5.000us 5.000us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.982us 1195.38% 157.982us 157.982us 1
+ torch_eager 6.51% 110.244us 99.65% 1.686ms 1.686ms 0.000us 0.00% 15.488us 15.488us 1
+ aten::silu 2.52% 42.653us 88.50% 1.498ms 499.192us 6.784us 51.33% 9.056us 3.019us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.33% 6.784us 2.261us 3
+ aten::mul 1.66% 28.021us 2.76% 46.791us 15.597us 6.432us 48.67% 6.432us 2.144us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.67% 6.432us 2.144us 3
+ Activity Buffer Request 84.30% 1.427ms 84.30% 1.427ms 1.427ms 2.272us 17.19% 2.272us 2.272us 1
+ aten::slice 1.51% 25.627us 1.87% 31.700us 5.283us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.36% 6.073us 0.36% 6.073us 1.012us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 2.78% 47.050us 2.78% 47.050us 7.842us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.35% 5.950us 0.35% 5.950us 5.950us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.692ms
-Self CUDA time total: 13.249us
+Self CUDA time total: 13.216us
@@ -4039,20 +4039,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.149us 1202.68% 153.149us 153.149us 1
- torch_eager 6.34% 109.104us 99.71% 1.717ms 1.717ms 0.000us 0.00% 14.941us 14.941us 1
- aten::silu 2.38% 40.982us 88.93% 1.531ms 510.411us 6.558us 51.50% 8.765us 2.922us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.558us 51.50% 6.558us 2.186us 3
- aten::mul 1.52% 26.241us 2.68% 46.222us 15.407us 6.176us 48.50% 6.176us 2.059us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.50% 6.176us 2.059us 3
- Activity Buffer Request 73.41% 1.264ms 73.41% 1.264ms 1.264ms 2.207us 17.33% 2.207us 2.207us 1
- aten::slice 1.43% 24.560us 1.77% 30.400us 5.067us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.34% 5.840us 0.34% 5.840us 0.973us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 14.29% 246.139us 14.29% 246.139us 41.023us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.29% 4.920us 0.29% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.902us 1258.67% 159.902us 159.902us 1
+ torch_eager 6.73% 114.317us 99.66% 1.694ms 1.694ms 0.000us 0.00% 14.912us 14.912us 1
+ aten::silu 2.46% 41.881us 88.34% 1.501ms 500.465us 6.560us 51.64% 8.768us 2.923us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.64% 6.560us 2.187us 3
+ aten::mul 1.68% 28.581us 2.79% 47.441us 15.814us 6.144us 48.36% 6.144us 2.048us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.144us 48.36% 6.144us 2.048us 3
+ Activity Buffer Request 74.33% 1.263ms 74.33% 1.263ms 1.263ms 2.208us 17.38% 2.208us 2.208us 1
+ aten::slice 1.44% 24.468us 1.80% 30.638us 5.106us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.36% 6.170us 0.36% 6.170us 1.028us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 12.65% 214.994us 12.65% 214.994us 35.832us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.34% 5.830us 0.34% 5.830us 5.830us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.722ms
-Self CUDA time total: 12.734us
+Self CPU time total: 1.700ms
+Self CUDA time total: 12.704us
@@ -4062,20 +4062,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.310us 1126.87% 149.310us 149.310us 1
- torch_eager 5.88% 107.113us 99.73% 1.817ms 1.817ms 0.000us 0.00% 15.555us 15.555us 1
- aten::silu 2.34% 42.602us 89.83% 1.636ms 545.432us 6.785us 51.21% 9.090us 3.030us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.785us 51.21% 6.785us 2.262us 3
- aten::mul 1.33% 24.312us 2.33% 42.512us 14.171us 6.465us 48.79% 6.465us 2.155us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.465us 48.79% 6.465us 2.155us 3
- Activity Buffer Request 78.20% 1.424ms 78.20% 1.424ms 1.424ms 2.305us 17.40% 2.305us 2.305us 1
- aten::slice 1.35% 24.650us 1.68% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.33% 6.010us 0.33% 6.010us 1.002us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 10.29% 187.406us 10.29% 187.406us 31.234us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.27% 4.950us 0.27% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.053us 1185.48% 157.053us 157.053us 1
+ torch_eager 6.08% 111.294us 99.69% 1.824ms 1.824ms 0.000us 0.00% 15.552us 15.552us 1
+ aten::silu 2.39% 43.729us 89.42% 1.636ms 545.306us 6.784us 51.21% 9.088us 3.029us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.21% 6.784us 2.261us 3
+ aten::mul 1.44% 26.361us 2.52% 46.181us 15.394us 6.464us 48.79% 6.464us 2.155us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.79% 6.464us 2.155us 3
+ Activity Buffer Request 77.97% 1.426ms 77.97% 1.426ms 1.426ms 2.304us 17.39% 2.304us 2.304us 1
+ aten::slice 1.34% 24.571us 1.66% 30.441us 5.074us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.32% 5.870us 0.32% 5.870us 0.978us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.14% 185.544us 10.14% 185.544us 30.924us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.31% 5.601us 0.31% 5.601us 5.601us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.822ms
-Self CUDA time total: 13.250us
+Self CPU time total: 1.829ms
+Self CUDA time total: 13.248us
@@ -4085,20 +4085,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 143.804us 924.73% 143.804us 143.804us 1
- torch_eager 21.50% 103.524us 99.01% 476.736us 476.736us 0.000us 0.00% 18.271us 18.271us 1
- aten::silu 8.70% 41.893us 62.70% 301.891us 100.630us 7.999us 51.44% 10.719us 3.573us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.999us 51.44% 7.999us 2.666us 3
- aten::mul 5.07% 24.390us 8.83% 42.521us 14.174us 7.552us 48.56% 7.552us 2.517us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.56% 7.552us 2.517us 3
- Activity Buffer Request 22.22% 106.973us 22.22% 106.973us 106.973us 2.720us 17.49% 2.720us 2.720us 1
- aten::slice 4.80% 23.090us 5.98% 28.800us 4.800us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 1.19% 5.710us 1.19% 5.710us 0.952us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 35.55% 171.156us 35.55% 171.156us 28.526us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.99% 4.760us 0.99% 4.760us 4.760us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.390us 977.47% 151.390us 151.390us 1
+ torch_eager 22.03% 109.975us 99.02% 494.363us 494.363us 0.000us 0.00% 18.176us 18.176us 1
+ aten::silu 8.41% 41.971us 61.88% 308.937us 102.979us 7.936us 51.24% 10.624us 3.541us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.24% 7.936us 2.645us 3
+ aten::mul 5.23% 26.101us 8.92% 44.531us 14.844us 7.552us 48.76% 7.552us 2.517us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.76% 7.552us 2.517us 3
+ Activity Buffer Request 22.19% 110.773us 22.19% 110.773us 110.773us 2.688us 17.36% 2.688us 2.688us 1
+ aten::slice 5.05% 25.220us 6.19% 30.920us 5.153us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 1.14% 5.700us 1.14% 5.700us 0.950us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 34.98% 174.623us 34.98% 174.623us 29.104us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.98% 4.900us 0.98% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 481.496us
-Self CUDA time total: 15.551us
+Self CPU time total: 499.263us
+Self CUDA time total: 15.488us
@@ -4108,20 +4108,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.372us 1067.46% 153.372us 153.372us 1
- torch_eager 5.96% 108.164us 99.73% 1.810ms 1.810ms 0.000us 0.00% 16.832us 16.832us 1
- aten::silu 2.30% 41.731us 89.59% 1.626ms 541.925us 7.360us 51.22% 9.824us 3.275us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 51.22% 7.360us 2.453us 3
- aten::mul 1.41% 25.542us 2.47% 44.792us 14.931us 7.008us 48.78% 7.008us 2.336us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.78% 7.008us 2.336us 3
- Activity Buffer Request 78.82% 1.430ms 78.82% 1.430ms 1.430ms 2.464us 17.15% 2.464us 2.464us 1
- aten::slice 1.37% 24.840us 1.70% 30.900us 5.150us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.33% 6.060us 0.33% 6.060us 1.010us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.53% 172.976us 9.53% 172.976us 28.829us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.27% 4.960us 0.27% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 163.583us 1143.70% 163.583us 163.583us 1
+ torch_eager 6.28% 116.052us 99.70% 1.841ms 1.841ms 0.000us 0.00% 16.767us 16.767us 1
+ aten::silu 2.27% 41.942us 89.09% 1.645ms 548.450us 7.327us 51.23% 9.791us 3.264us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.327us 51.23% 7.327us 2.442us 3
+ aten::mul 1.55% 28.681us 2.62% 48.392us 16.131us 6.976us 48.77% 6.976us 2.325us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.976us 48.77% 6.976us 2.325us 3
+ Activity Buffer Request 78.22% 1.445ms 78.22% 1.445ms 1.445ms 2.464us 17.23% 2.464us 2.464us 1
+ aten::slice 1.38% 25.430us 1.70% 31.392us 5.232us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.32% 5.962us 0.32% 5.962us 0.994us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.67% 178.614us 9.67% 178.614us 29.769us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.30% 5.570us 0.30% 5.570us 5.570us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.815ms
-Self CUDA time total: 14.368us
+Self CPU time total: 1.847ms
+Self CUDA time total: 14.303us
@@ -4131,20 +4131,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 146.240us 942.27% 146.240us 146.240us 1
- torch_eager 22.59% 104.486us 98.96% 457.726us 457.726us 0.000us 0.00% 18.208us 18.208us 1
- aten::silu 8.78% 40.590us 60.43% 279.519us 93.173us 7.936us 51.13% 10.624us 3.541us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.13% 7.936us 2.645us 3
- aten::mul 5.53% 25.579us 9.45% 43.730us 14.577us 7.584us 48.87% 7.584us 2.528us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.87% 7.584us 2.528us 3
- Activity Buffer Request 18.85% 87.193us 18.85% 87.193us 87.193us 2.688us 17.32% 2.688us 2.688us 1
- aten::slice 5.23% 24.201us 6.48% 29.991us 4.999us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 1.25% 5.790us 1.25% 5.790us 0.965us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 36.73% 169.887us 36.73% 169.887us 28.314us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 1.04% 4.800us 1.04% 4.800us 4.800us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.172us 969.60% 150.172us 150.172us 1
+ torch_eager 23.07% 110.204us 98.98% 472.752us 472.752us 0.000us 0.00% 18.176us 18.176us 1
+ aten::silu 9.08% 43.371us 60.20% 287.547us 95.849us 7.936us 51.24% 10.624us 3.541us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.24% 7.936us 2.645us 3
+ aten::mul 5.48% 26.181us 9.38% 44.801us 14.934us 7.552us 48.76% 7.552us 2.517us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.76% 7.552us 2.517us 3
+ Activity Buffer Request 19.26% 92.002us 19.26% 92.002us 92.002us 2.688us 17.36% 2.688us 2.688us 1
+ aten::slice 5.00% 23.870us 6.32% 30.200us 5.033us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 1.33% 6.330us 1.33% 6.330us 1.055us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 35.76% 170.794us 35.76% 170.794us 28.466us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 1.02% 4.871us 1.02% 4.871us 4.871us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 462.526us
-Self CUDA time total: 15.520us
+Self CPU time total: 477.623us
+Self CUDA time total: 15.488us
@@ -4154,20 +4154,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 181.470us 803.28% 181.470us 181.470us 1
- torch_eager 5.97% 109.125us 99.74% 1.823ms 1.823ms 0.000us 0.00% 26.526us 26.526us 1
- aten::silu 2.38% 43.492us 88.50% 1.617ms 539.072us 11.647us 51.56% 15.582us 5.194us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.647us 51.56% 11.647us 3.882us 3
- aten::mul 1.42% 25.882us 3.51% 64.123us 21.374us 10.944us 48.44% 10.944us 3.648us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.944us 48.44% 10.944us 3.648us 3
- Activity Buffer Request 77.67% 1.419ms 77.67% 1.419ms 1.419ms 3.935us 17.42% 3.935us 3.935us 1
- aten::slice 1.42% 25.910us 1.76% 32.089us 5.348us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.34% 6.179us 0.34% 6.179us 1.030us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 10.54% 192.606us 10.54% 192.606us 32.101us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.26% 4.790us 0.26% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 160.000us 713.30% 160.000us 160.000us 1
+ torch_eager 5.99% 109.975us 99.73% 1.831ms 1.831ms 0.000us 0.00% 26.335us 26.335us 1
+ aten::silu 2.30% 42.230us 89.52% 1.643ms 547.763us 11.583us 51.64% 15.487us 5.162us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.583us 51.64% 11.583us 3.861us 3
+ aten::mul 1.54% 28.250us 2.52% 46.180us 15.393us 10.848us 48.36% 10.848us 3.616us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.848us 48.36% 10.848us 3.616us 3
+ Activity Buffer Request 78.83% 1.447ms 78.83% 1.447ms 1.447ms 3.904us 17.40% 3.904us 3.904us 1
+ aten::slice 1.37% 25.211us 1.70% 31.261us 5.210us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.33% 6.050us 0.33% 6.050us 1.008us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.37% 171.964us 9.37% 171.964us 28.661us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.27% 4.930us 0.27% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.827ms
-Self CUDA time total: 22.591us
+Self CPU time total: 1.836ms
+Self CUDA time total: 22.431us
impl wl p50(ms) ok
@@ -4184,7 +4184,7 @@ torch_eager cuda_T512_D768 0.05 True
-Installed 37 packages in 192ms
+Installed 37 packages in 230ms
diff --git a/activation/results/artifacts/combine/latency.svg b/activation/results/artifacts/combine/latency.svg
index 2eb6d36da2a386c6f3b7ffe7a4f2ecf07fbe531d..b809b51f58837145ae3fdbcb04aa1aec4a5e023e 100644
--- a/activation/results/artifacts/combine/latency.svg
+++ b/activation/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:431dea6a591fc822f7d0d0d6f793e8c11170edb647c627b5a44ad9883df2c3fc
-size 20697
+oid sha256:f62c7d85fc4a76cf7a1060a62df99ff0d32133ab94bb502b68dcd53171c39602
+size 21424
diff --git a/activation/results/combined_results.html b/activation/results/combined_results.html
index f11a4ea4cf1c2f2bfbc419d5616f99db4990e15c..35064093e9085dbed21e2edd8a0a4e6c497bbb9d 100644
--- a/activation/results/combined_results.html
+++ b/activation/results/combined_results.html
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
- 2025-10-28T14:09:13.211569
+ 2025-10-29T14:27:49.999657
image/svg+xml
@@ -4021,83 +4021,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
+
-
+
- 0.025
+ 0.025
-
+
-
+
- 0.030
+ 0.030
-
+
-
+
- 0.035
+ 0.035
-
+
-
+
- 0.040
+ 0.040
-
+
-
+
- 0.045
+ 0.045
-
+
-
+
- 0.050
+ 0.050
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.055
@@ -4105,37 +4118,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
+
-
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
@@ -4150,30 +4163,30 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
+
Attention Implementation Latency
-
+
-
-
+
+
-
+
- hf_kernels_swiglu
+ hf_kernels_swiglu
-
-
+
+
-
+
- torch_eager
+ torch_eager
@@ -4193,7 +4206,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ output
▶ uv-logs
|
-Cell: combine | 4.28s
+Cell: combine | 4.24s
| ▶ run
Copy
Raw
@@ -4319,7 +4332,7 @@ Implementations included:
-Installed 37 packages in 195ms
+Installed 37 packages in 218ms
@@ -4332,7 +4345,7 @@ Installed 37 packages in 195ms
- 2025-10-28T14:09:13.211569
+ 2025-10-29T14:27:49.999657
image/svg+xml
@@ -4481,83 +4494,96 @@ Installed 37 packages in 195ms
-
+
-
+
- 0.025
+ 0.025
-
+
-
+
- 0.030
+ 0.030
-
+
-
+
- 0.035
+ 0.035
-
+
-
+
- 0.040
+ 0.040
-
+
-
+
- 0.045
+ 0.045
-
+
-
+
- 0.050
+ 0.050
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.055
@@ -4565,37 +4591,37 @@ Installed 37 packages in 195ms
-
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
+
-
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
@@ -4610,30 +4636,30 @@ Installed 37 packages in 195ms
-
+
Attention Implementation Latency
-
+
-
-
+
+
-
+
- hf_kernels_swiglu
+ hf_kernels_swiglu
-
-
+
+
-
+
- torch_eager
+ torch_eager
diff --git a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
index 062646d5a3f22298019a79ab8e52f52ea42bd834..3c3e9cb1937f70bc8a6005f64424ae1ae23f373f 100644
--- a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
+++ b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
@@ -1,24 +1,24 @@
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.06712200001857127, "p50": 0.06883200001084333, "p90": 0.06976199995278876, "mean": 0.06901199997173535, "iqr": 0.0014600000213249587, "raw_times": [0.06976199995278876, 0.07104199994500959, 0.06712200001857127, 0.0683019999314638, 0.06883200001084333], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0738530000035098, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08455299996512622, "p50": 0.08599400007369695, "p90": 0.0868530000843748, "mean": 0.08612520005044644, "iqr": 0.0014299999975264654, "raw_times": [0.08780300004218589, 0.08455299996512622, 0.0868530000843748, 0.08542300008684833, 0.08599400007369695], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08941300006881647, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08494299993344612, "p50": 0.08714299997336639, "p90": 0.08724299993900786, "mean": 0.086546999955317, "iqr": 0.0020200000108161476, "raw_times": [0.08522299992819171, 0.08714299997336639, 0.08818300000257295, 0.08724299993900786, 0.08494299993344612], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09105300000555872, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08327299997290538, "p50": 0.084122999965075, "p90": 0.08580299993354856, "mean": 0.08452299998680246, "iqr": 0.0023699999474047218, "raw_times": [0.08327299997290538, 0.084122999965075, 0.08598300007633952, 0.08580299993354856, 0.08343299998614384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08891300001323543, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08298299997022696, "p50": 0.08508299993081891, "p90": 0.08600299997851835, "mean": 0.0849267999683434, "iqr": 0.0016210000239880173, "raw_times": [0.08298299997022696, 0.08508299993081891, 0.08600299997851835, 0.08438199995453033, 0.08618300000762247], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08780300004218589, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08270299997548136, "p50": 0.08315299999139825, "p90": 0.0846430000365217, "mean": 0.08407499999520951, "iqr": 0.0019010000187336118, "raw_times": [0.08315299999139825, 0.08713399995485815, 0.08270299997548136, 0.08274200001778809, 0.0846430000365217], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08981299993138236, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08372299998882227, "p50": 0.08510199995725998, "p90": 0.08608299992829416, "mean": 0.08701479998762807, "iqr": 0.0011499998890940333, "raw_times": [0.08493300003920012, 0.09523300002456381, 0.08510199995725998, 0.08372299998882227, 0.08608299992829416], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08923300003971235, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08285199999136239, "p50": 0.08483300007355865, "p90": 0.08511300006830425, "mean": 0.08449480001218035, "iqr": 0.0016500000583619112, "raw_times": [0.08285199999136239, 0.08346300000994233, 0.08483300007355865, 0.08621299991773412, 0.08511300006830425], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08870299996033282, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08278300003894401, "p50": 0.08427300008406746, "p90": 0.08444299999155191, "mean": 0.08422300002166594, "iqr": 0.0002599999788799323, "raw_times": [0.08444299999155191, 0.08418300001267198, 0.08278300003894401, 0.08543299998109433, 0.08427300008406746], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08903299999474257, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08352199995442788, "p50": 0.0842329999386493, "p90": 0.08553300006042264, "mean": 0.08496079999531503, "iqr": 0.0014400000054592965, "raw_times": [0.08409300005496334, 0.08742299996811198, 0.08553300006042264, 0.08352199995442788, 0.0842329999386493], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08985400006622513, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.14414499992199126, "p50": 0.14512600000671227, "p90": 0.14515400005166157, "mean": 0.1465472000063528, "iqr": 0.0008580000212532468, "raw_times": [0.14512600000671227, 0.14414499992199126, 0.14429600003040832, 0.15401500002099056, 0.14515400005166157], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.14571500003057736, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16020600003230356, "p50": 0.16135600003508443, "p90": 0.16139600006681576, "mean": 0.16140360005465482, "iqr": 0.00029099999210302485, "raw_times": [0.16139600006681576, 0.1629550000643576, 0.16110500007471273, 0.16020600003230356, 0.16135600003508443], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1623660000404925, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07979300005445111, "p50": 0.08039299996198679, "p90": 0.08136300004935038, "mean": 0.08070500002759218, "iqr": 0.001150000002780871, "raw_times": [0.0802130000465695, 0.0817630000256031, 0.07979300005445111, 0.08039299996198679, 0.08136300004935038], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0855329999467358, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0800829999434427, "p50": 0.08147299990923784, "p90": 0.08197300007850572, "mean": 0.08146099996793055, "iqr": 0.00109000018255756, "raw_times": [0.0800829999434427, 0.08197300007850572, 0.08147299990923784, 0.08289300001251831, 0.08088299989594816], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08291199992527254, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0799729999698684, "p50": 0.08137199995417177, "p90": 0.081513000054656, "mean": 0.08127659998535819, "iqr": 0.0006500000608866685, "raw_times": [0.0799729999698684, 0.08266199995432544, 0.081513000054656, 0.08086299999376934, 0.08137199995417177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08939400004237541, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08067300007041922, "p50": 0.08162300002823031, "p90": 0.08189199991193163, "mean": 0.08365860001049441, "iqr": 0.0008099999604382901, "raw_times": [0.08067300007041922, 0.08108199995149334, 0.08189199991193163, 0.08162300002823031, 0.09302300009039755], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08415299998887349, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0806030000148894, "p50": 0.08186299999124458, "p90": 0.08199299998068454, "mean": 0.08162900001025264, "iqr": 0.001009999891721236, "raw_times": [0.08270299997548136, 0.08186299999124458, 0.0806030000148894, 0.08199299998068454, 0.08098300008896331], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10199300004387624, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08040199998049502, "p50": 0.08168299996214046, "p90": 0.08185199999388715, "mean": 0.08171659999334224, "iqr": 0.0013889999763705418, "raw_times": [0.0804630000175166, 0.08418300001267198, 0.08168299996214046, 0.08040199998049502, 0.08185199999388715], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08522300004187855, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08097300008103048, "p50": 0.08150300004672317, "p90": 0.08173299988811777, "mean": 0.08153900000706926, "iqr": 0.0005599998758043512, "raw_times": [0.08117300001231342, 0.08231300000716146, 0.08150300004672317, 0.08173299988811777, 0.08097300008103048], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08440300007350743, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0802130000465695, "p50": 0.08124300006784324, "p90": 0.08242299998073577, "mean": 0.08162480000919459, "iqr": 0.0012000000424450263, "raw_times": [0.0802130000465695, 0.08302200001253368, 0.08242299998073577, 0.08124300006784324, 0.08122299993829074], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08460300000479037, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09226300005593657, "p50": 0.09320300000581483, "p90": 0.0934630000983816, "mean": 0.09316100004070904, "iqr": 0.0007800000503266347, "raw_times": [0.09419299999535724, 0.09320300000581483, 0.0934630000983816, 0.09226300005593657, 0.09268300004805496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0951240000404141, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09887299995625654, "p50": 0.09917300008055463, "p90": 0.09990300009121711, "mean": 0.09939520000443736, "iqr": 0.0009100001534534385, "raw_times": [0.09887299995625654, 0.09917300008055463, 0.09990300009121711, 0.10003399995639484, 0.09899299993776367], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1023739999936879, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4842959999677987, "p50": 0.4860569999891595, "p90": 0.4878769999550059, "mean": 0.48646659997757524, "iqr": 0.002959999960694404, "raw_times": [0.4849169999943115, 0.4860569999891595, 0.4878769999550059, 0.4842959999677987, 0.4891859999816006], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4877669999814316, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4968179999877975, "p50": 0.49805800006197387, "p90": 0.4990780000753148, "mean": 0.4983496000022569, "iqr": 0.001141000097959477, "raw_times": [0.4979369999773553, 0.49985699990884314, 0.4990780000753148, 0.49805800006197387, 0.4968179999877975], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.49727700002222264, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:35Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047051000024112, "p50": 0.048391000007086404, "p90": 0.048571999968771706, "mean": 0.04890720000503279, "iqr": 0.0005509999709829572, "raw_times": [0.0525010000274051, 0.048571999968771706, 0.04802099999778875, 0.048391000007086404, 0.047051000024112], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058991999992485944, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05480199996554802, "p50": 0.05610099998420992, "p90": 0.05628099995647062, "mean": 0.056069199968078465, "iqr": 0.0006299999881775875, "raw_times": [0.057510999965870724, 0.05628099995647062, 0.05610099998420992, 0.055650999968293036, 0.05480199996554802], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06025200002568454, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05469199999197372, "p50": 0.056971999981669796, "p90": 0.057361000017408514, "mean": 0.056363600003805914, "iqr": 0.0025490000439276628, "raw_times": [0.05469199999197372, 0.057361000017408514, 0.056971999981669796, 0.05798100005449669, 0.05481199997348085], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0596809999819925, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05416100003685642, "p50": 0.05502099998011545, "p90": 0.05511200004093553, "mean": 0.05489540000098714, "iqr": 0.00016000007008187822, "raw_times": [0.05416100003685642, 0.05495199997085365, 0.055230999976174644, 0.05511200004093553, 0.05502099998011545], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05659100003185813, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052401000004920206, "p50": 0.055000999964249786, "p90": 0.056301000029179704, "mean": 0.054810999995424936, "iqr": 0.0023400000372930663, "raw_times": [0.052401000004920206, 0.056301000029179704, 0.056390999986888346, 0.05396099999188664, 0.055000999964249786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05838100003074942, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05270100001553146, "p50": 0.05342100001826111, "p90": 0.054111999986616865, "mean": 0.053611199996339565, "iqr": 0.0008709999974598759, "raw_times": [0.05324099998915699, 0.054580999972131394, 0.054111999986616865, 0.05342100001826111, 0.05270100001553146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0580610000042725, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051810999991630524, "p50": 0.05310099999178419, "p90": 0.053301000036753976, "mean": 0.05577720000928821, "iqr": 0.0007700000423938036, "raw_times": [0.051810999991630524, 0.05253099999436017, 0.06814200003191218, 0.053301000036753976, 0.05310099999178419], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056801999960498506, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053270999956112064, "p50": 0.05397199998924407, "p90": 0.05399100001568513, "mean": 0.054061200000887766, "iqr": 0.00023000001192485797, "raw_times": [0.05399100001568513, 0.05531100003963729, 0.053270999956112064, 0.05397199998924407, 0.05376100000376027], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056641000014678866, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052231000040592335, "p50": 0.052561000018158666, "p90": 0.0526110000009794, "mean": 0.0529970000116009, "iqr": 0.0002200000039920269, "raw_times": [0.052231000040592335, 0.052390999996987375, 0.05519100000128674, 0.052561000018158666, 0.0526110000009794], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05621100001462764, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052561000018158666, "p50": 0.053772000001117704, "p90": 0.05471100001841478, "mean": 0.057879400003457704, "iqr": 0.0021000000174353772, "raw_times": [0.0526110000009794, 0.05471100001841478, 0.053772000001117704, 0.052561000018158666, 0.07574199997861797], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06976200000963217, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051341000016691396, "p50": 0.05308099997591853, "p90": 0.05349200000637211, "mean": 0.05272739998645193, "iqr": 0.0014310000437944836, "raw_times": [0.052060999962577625, 0.05366199997069998, 0.05308099997591853, 0.051341000016691396, 0.05349200000637211], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05661099999088037, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05081099999415528, "p50": 0.053202000003693684, "p90": 0.05362099994954406, "mean": 0.05282339998302632, "iqr": 0.0011499999459374521, "raw_times": [0.05081099999415528, 0.052471000003606605, 0.05362099994954406, 0.054011999964131974, 0.053202000003693684], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05642200000011144, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052221000032659504, "p50": 0.05397099999981947, "p90": 0.05448100000648992, "mean": 0.05380100001275423, "iqr": 0.001479999980347202, "raw_times": [0.052221000032659504, 0.05300100002614272, 0.055330999998659536, 0.05448100000648992, 0.05397099999981947], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421999995061924, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05164199995988383, "p50": 0.053460999993149017, "p90": 0.05421100001967716, "mean": 0.0532791999944493, "iqr": 0.00139000002263856, "raw_times": [0.0528209999970386, 0.054261000002497894, 0.05421100001967716, 0.05164199995988383, 0.053460999993149017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05741199998965385, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052460999995673774, "p50": 0.0557109999590466, "p90": 0.05585100001326282, "mean": 0.05600519999688913, "iqr": 0.002330000029360235, "raw_times": [0.052460999995673774, 0.0557109999590466, 0.05585100001326282, 0.053520999983902584, 0.06248200003255988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058421000005637325, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0540510000064387, "p50": 0.0541219999945497, "p90": 0.05425200004083308, "mean": 0.054337400013082515, "iqr": 0.0001910000264615519, "raw_times": [0.0540510000064387, 0.05425200004083308, 0.05520100000921957, 0.0541219999945497, 0.05406100001437153], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05772200000819794, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05302099998516496, "p50": 0.05408099997339377, "p90": 0.0544409999747586, "mean": 0.05571119997966889, "iqr": 0.0008299999763039523, "raw_times": [0.05302099998516496, 0.0544409999747586, 0.06340199996657248, 0.053610999998454645, 0.05408099997339377], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05787100002407897, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052391999986411975, "p50": 0.05292200000894809, "p90": 0.053131000015582686, "mean": 0.05318180001268047, "iqr": 0.00026899999738816405, "raw_times": [0.053131000015582686, 0.054602000034265075, 0.052391999986411975, 0.05286200001819452, 0.05292200000894809], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05755099999760205, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05195099998900332, "p50": 0.05479100002503401, "p90": 0.05620100000669481, "mean": 0.05852919999824735, "iqr": 0.0034000000255218765, "raw_times": [0.07690199998933167, 0.05620100000669481, 0.05479100002503401, 0.05195099998900332, 0.052800999981172936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057030999982998765, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05290100000365783, "p50": 0.05457100002104198, "p90": 0.055251000048883725, "mean": 0.055353400011881604, "iqr": 0.001779000058377278, "raw_times": [0.05290100000365783, 0.06057199999531804, 0.055251000048883725, 0.05347199999050645, 0.05457100002104198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056300999972336285, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052152000023397704, "p50": 0.05461199998535449, "p90": 0.05471200000783938, "mean": 0.05381760001910152, "iqr": 0.001900999961890193, "raw_times": [0.052152000023397704, 0.05461199998535449, 0.05480100003296684, 0.05471200000783938, 0.052811000045949186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05758200001082514, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05339099999446262, "p50": 0.05463100001179555, "p90": 0.055171999974845676, "mean": 0.05451339999353877, "iqr": 0.0011309999763398082, "raw_times": [0.05404099999850587, 0.055331999988084135, 0.05339099999446262, 0.05463100001179555, 0.055171999974845676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058501000012256554, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05277100001421786, "p50": 0.053860999969401746, "p90": 0.054361000024982786, "mean": 0.053951200004576094, "iqr": 0.0007190000133050489, "raw_times": [0.05277100001421786, 0.05512100000260034, 0.05364200001167774, 0.053860999969401746, 0.054361000024982786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057851999997637904, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:36Z", "run": "b5ef0d3c828f4daf8ce959b0e1a6bf4c", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05124100005104992, "p50": 0.053691000005073874, "p90": 0.054261999991922494, "mean": 0.05327740001348502, "iqr": 0.0014609999539061391, "raw_times": [0.05124100005104992, 0.05439199998136246, 0.054261999991922494, 0.053691000005073874, 0.052801000038016355], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05714199994599767, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
diff --git a/causal_conv1d/impls/cells/benchmark.py b/causal_conv1d/impls/cells/benchmark.py
index 2e38669a505cbdf181a93e97f31ed1e67ecf4883..725b12c4018e4eec05c5ddccb0c88a8eae6f150d 100644
--- a/causal_conv1d/impls/cells/benchmark.py
+++ b/causal_conv1d/impls/cells/benchmark.py
@@ -4,37 +4,28 @@
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
+# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
-import torch.nn.functional as F
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+# Load the causal conv1d kernel
+causal_conv1d = get_kernel("kernels-community/causal-conv1d")
-def torch_causal_conv1d(input_tensor, weight, bias):
- # Convert to weight dtype for computation
- x = input_tensor.to(weight.dtype)
- dim = weight.shape[0]
- width = weight.shape[1]
- seqlen = input_tensor.shape[-1]
- # Depthwise causal conv1d using PyTorch
- out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
-
- # Truncate to original sequence length
- out = out[..., :seqlen]
-
- # Convert back to original dtype
- return out.to(input_tensor.dtype)
+def hf_kernels_causal_conv1d(input_tensor, weight, bias):
+ return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)
run_benchmark(
kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
- impl_name="torch_eager",
- impl_tags={"family": "pytorch", "backend": "eager"},
- impl_func=torch_causal_conv1d,
+ impl_name="hf_kernels_causal_conv1d",
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
+ impl_func=hf_kernels_causal_conv1d,
)
\ No newline at end of file
diff --git a/causal_conv1d/impls/hf_kernels_causal_conv1d.html b/causal_conv1d/impls/hf_kernels_causal_conv1d.html
index e50cedeff51b83afce46864a23939e763973b082..025d1f7d39597f6702f2ef95b801eca2a6d706e8 100644
--- a/causal_conv1d/impls/hf_kernels_causal_conv1d.html
+++ b/causal_conv1d/impls/hf_kernels_causal_conv1d.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ output
▶ uv-logs
|
-Cell: nv | 0.21s
+Cell: nv | 0.24s
| ▶ run
Copy
Raw
@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
-
Tue Oct 28 14:08:09 2025
+Wed Oct 29 14:27:09 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A 28C P0 80W / 350W | 0MiB / 46068MiB | 19% Default |
+| N/A 33C P0 109W / 350W | 0MiB / 46068MiB | 100% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
▼ output
▶ uv-logs
|
-Cell: benchmark | 9.91s
+Cell: benchmark | 5.79s
| ▶ run
Copy
Raw
@@ -3973,19 +3973,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 153.312us 3772.44% 153.312us 153.312us 1
- hf_kernels_causal_conv1d 8.26% 153.696us 99.59% 1.854ms 1.854ms 0.000us 0.00% 5.504us 5.504us 1
- CausalConv1dFn 6.06% 112.844us 91.33% 1.700ms 566.616us 0.000us 0.00% 5.504us 1.835us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.41% 26.281us 81.37% 1.514ms 504.821us 4.064us 100.00% 5.504us 1.835us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.064us 100.00% 4.064us 1.355us 3
- Activity Buffer Request 77.27% 1.438ms 77.27% 1.438ms 1.438ms 1.440us 35.43% 1.440us 1.440us 1
- aten::empty_like 1.15% 21.339us 3.90% 72.543us 24.181us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 2.75% 51.204us 2.75% 51.204us 17.068us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 2.69% 50.001us 2.69% 50.001us 16.667us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.41% 7.700us 0.41% 7.700us 7.700us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 151.393us 3724.31% 151.393us 151.393us 1
+ hf_kernels_causal_conv1d 8.95% 166.324us 99.62% 1.852ms 1.852ms 0.000us 0.00% 5.505us 5.505us 1
+ CausalConv1dFn 6.05% 112.563us 90.67% 1.686ms 561.934us 0.000us 0.00% 5.505us 1.835us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.41% 26.172us 80.97% 1.505ms 501.826us 4.065us 100.00% 5.505us 1.835us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.065us 100.00% 4.065us 1.355us 3
+ Activity Buffer Request 77.14% 1.434ms 77.14% 1.434ms 1.434ms 1.440us 35.42% 1.440us 1.440us 1
+ aten::empty_like 1.03% 19.059us 3.64% 67.761us 22.587us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 2.62% 48.702us 2.62% 48.702us 16.234us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 2.42% 45.061us 2.42% 45.061us 15.020us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.38% 7.150us 0.38% 7.150us 7.150us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.861ms
-Self CUDA time total: 4.064us
+Self CPU time total: 1.859ms
+Self CUDA time total: 4.065us
@@ -3995,19 +3995,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 128.895us 3412.63% 128.895us 128.895us 1
- hf_kernels_causal_conv1d 5.00% 84.832us 99.68% 1.692ms 1.692ms 0.000us 0.00% 5.026us 5.026us 1
- CausalConv1dFn 4.43% 75.123us 94.68% 1.607ms 535.685us 0.000us 0.00% 5.026us 1.675us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.59% 27.059us 88.41% 1.501ms 500.224us 3.777us 100.00% 5.026us 1.675us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.777us 100.00% 3.777us 1.259us 3
- Activity Buffer Request 84.88% 1.441ms 84.88% 1.441ms 1.441ms 1.249us 33.07% 1.249us 1.249us 1
- aten::empty_like 0.54% 9.230us 1.84% 31.262us 10.421us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.30% 22.032us 1.30% 22.032us 7.344us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 1.94% 32.892us 1.94% 32.892us 10.964us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.32% 5.440us 0.32% 5.440us 5.440us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 129.439us 3456.32% 129.439us 129.439us 1
+ hf_kernels_causal_conv1d 5.79% 99.043us 99.68% 1.706ms 1.706ms 0.000us 0.00% 4.994us 4.994us 1
+ CausalConv1dFn 4.71% 80.562us 93.90% 1.607ms 535.793us 0.000us 0.00% 4.994us 1.665us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.47% 25.130us 87.50% 1.498ms 499.285us 3.745us 100.00% 4.994us 1.665us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.745us 100.00% 3.745us 1.248us 3
+ Activity Buffer Request 84.17% 1.441ms 84.17% 1.441ms 1.441ms 1.249us 33.35% 1.249us 1.249us 1
+ aten::empty_like 0.47% 7.980us 1.69% 28.961us 9.654us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.23% 20.981us 1.23% 20.981us 6.994us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.86% 31.821us 1.86% 31.821us 10.607us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.32% 5.430us 0.32% 5.430us 5.430us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.697ms
-Self CUDA time total: 3.777us
+Self CPU time total: 1.712ms
+Self CUDA time total: 3.745us
@@ -4017,19 +4017,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.670us 3273.90% 124.670us 124.670us 1
- hf_kernels_causal_conv1d 4.86% 81.824us 99.65% 1.679ms 1.679ms 0.000us 0.00% 5.056us 5.056us 1
- CausalConv1dFn 4.28% 72.081us 94.80% 1.598ms 532.512us 0.000us 0.00% 5.056us 1.685us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.53% 25.732us 88.63% 1.494ms 497.871us 3.808us 100.00% 5.056us 1.685us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.808us 100.00% 3.808us 1.269us 3
- Activity Buffer Request 85.15% 1.435ms 85.15% 1.435ms 1.435ms 1.248us 32.77% 1.248us 1.248us 1
- aten::empty_like 0.59% 9.910us 1.89% 31.841us 10.614us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.30% 21.931us 1.30% 21.931us 7.310us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 1.96% 32.960us 1.96% 32.960us 10.987us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.35% 5.830us 0.35% 5.830us 5.830us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.098us 3285.62% 124.098us 124.098us 1
+ hf_kernels_causal_conv1d 5.52% 95.683us 99.69% 1.728ms 1.728ms 0.000us 0.00% 5.057us 5.057us 1
+ CausalConv1dFn 4.48% 77.582us 94.17% 1.632ms 544.020us 0.000us 0.00% 5.057us 1.686us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.43% 24.830us 87.99% 1.525ms 508.322us 3.777us 100.00% 5.057us 1.686us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.777us 100.00% 3.777us 1.259us 3
+ Activity Buffer Request 84.76% 1.469ms 84.76% 1.469ms 1.469ms 1.280us 33.89% 1.280us 1.280us 1
+ aten::empty_like 0.46% 7.920us 1.70% 29.511us 9.837us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.25% 21.591us 1.25% 21.591us 7.197us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.80% 31.261us 1.80% 31.261us 10.420us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.31% 5.301us 0.31% 5.301us 5.301us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.685ms
-Self CUDA time total: 3.808us
+Self CPU time total: 1.733ms
+Self CUDA time total: 3.777us
@@ -4039,19 +4039,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 131.358us 3479.68% 131.358us 131.358us 1
- hf_kernels_causal_conv1d 4.44% 83.422us 99.71% 1.875ms 1.875ms 0.000us 0.00% 5.054us 5.054us 1
- CausalConv1dFn 4.02% 75.643us 95.28% 1.792ms 597.348us 0.000us 0.00% 5.054us 1.685us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.36% 25.501us 89.54% 1.684ms 561.363us 3.775us 100.00% 5.054us 1.685us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.775us 100.00% 3.775us 1.258us 3
- Activity Buffer Request 75.66% 1.423ms 75.66% 1.423ms 1.423ms 1.279us 33.88% 1.279us 1.279us 1
- aten::empty_like 0.55% 10.279us 1.72% 32.311us 10.770us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.17% 22.032us 1.17% 22.032us 7.344us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 12.52% 235.449us 12.52% 235.449us 78.483us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.29% 5.400us 0.29% 5.400us 5.400us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 129.729us 3378.36% 129.729us 129.729us 1
+ hf_kernels_causal_conv1d 5.03% 97.232us 99.72% 1.927ms 1.927ms 0.000us 0.00% 5.120us 5.120us 1
+ CausalConv1dFn 4.11% 79.452us 94.69% 1.830ms 610.049us 0.000us 0.00% 5.120us 1.707us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.27% 24.481us 89.03% 1.721ms 573.588us 3.840us 100.00% 5.120us 1.707us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.840us 100.00% 3.840us 1.280us 3
+ Activity Buffer Request 76.40% 1.477ms 76.40% 1.477ms 1.477ms 1.280us 33.33% 1.280us 1.280us 1
+ aten::empty_like 0.41% 7.951us 1.55% 29.931us 9.977us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.14% 21.980us 1.14% 21.980us 7.327us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 11.36% 219.575us 11.36% 219.575us 73.192us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.28% 5.490us 0.28% 5.490us 5.490us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.881ms
-Self CUDA time total: 3.775us
+Self CPU time total: 1.933ms
+Self CUDA time total: 3.840us
@@ -4061,19 +4061,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 129.694us 2701.96% 129.694us 129.694us 1
- hf_kernels_causal_conv1d 4.57% 82.923us 99.70% 1.809ms 1.809ms 0.000us 0.00% 6.432us 6.432us 1
- CausalConv1dFn 4.25% 77.065us 95.13% 1.727ms 575.517us 0.000us 0.00% 6.432us 2.144us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.43% 25.889us 89.13% 1.618ms 539.172us 4.800us 100.00% 6.432us 2.144us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.800us 100.00% 4.800us 1.600us 3
- Activity Buffer Request 78.67% 1.428ms 78.67% 1.428ms 1.428ms 1.632us 34.00% 1.632us 1.632us 1
- aten::empty_like 0.53% 9.690us 1.76% 31.970us 10.657us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.23% 22.280us 1.23% 22.280us 7.427us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 9.03% 163.837us 9.03% 163.837us 54.612us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.30% 5.391us 0.30% 5.391us 5.391us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 126.080us 2644.30% 126.080us 126.080us 1
+ hf_kernels_causal_conv1d 5.18% 102.863us 99.75% 1.979ms 1.979ms 0.000us 0.00% 6.368us 6.368us 1
+ CausalConv1dFn 3.95% 78.303us 94.57% 1.876ms 625.402us 0.000us 0.00% 6.368us 2.123us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.22% 24.140us 89.14% 1.768ms 589.491us 4.768us 100.00% 6.368us 2.123us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.768us 100.00% 4.768us 1.589us 3
+ Activity Buffer Request 79.49% 1.577ms 79.49% 1.577ms 1.577ms 1.600us 33.56% 1.600us 1.600us 1
+ aten::empty_like 0.40% 7.900us 1.48% 29.430us 9.810us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.09% 21.530us 1.09% 21.530us 7.177us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 8.43% 167.184us 8.43% 167.184us 55.728us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.25% 4.910us 0.25% 4.910us 4.910us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.815ms
-Self CUDA time total: 4.800us
+Self CPU time total: 1.984ms
+Self CUDA time total: 4.768us
@@ -4083,19 +4083,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 118.655us 2439.95% 118.655us 118.655us 1
- hf_kernels_causal_conv1d 15.62% 77.102us 98.87% 488.177us 488.177us 0.000us 0.00% 6.495us 6.495us 1
- CausalConv1dFn 14.62% 72.193us 83.25% 411.075us 137.025us 0.000us 0.00% 6.495us 2.165us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.27% 26.040us 62.53% 308.751us 102.917us 4.863us 100.00% 6.495us 2.165us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.863us 100.00% 4.863us 1.621us 3
- Activity Buffer Request 25.28% 124.815us 25.28% 124.815us 124.815us 1.632us 33.56% 1.632us 1.632us 1
- aten::empty_like 1.61% 7.949us 6.10% 30.131us 10.044us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.49% 22.182us 4.49% 22.182us 7.394us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 31.98% 157.896us 31.98% 157.896us 52.632us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.13% 5.580us 1.13% 5.580us 5.580us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 121.055us 2488.80% 121.055us 121.055us 1
+ hf_kernels_causal_conv1d 13.09% 78.123us 99.20% 592.205us 592.205us 0.000us 0.00% 6.528us 6.528us 1
+ CausalConv1dFn 13.01% 77.643us 86.11% 514.082us 171.361us 0.000us 0.00% 6.528us 2.176us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.18% 24.929us 68.36% 408.089us 136.030us 4.864us 100.00% 6.528us 2.176us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.864us 100.00% 4.864us 1.621us 3
+ Activity Buffer Request 36.63% 218.665us 36.63% 218.665us 218.665us 1.664us 34.21% 1.664us 1.664us 1
+ aten::empty_like 1.31% 7.839us 4.75% 28.350us 9.450us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.44% 20.511us 3.44% 20.511us 6.837us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 27.55% 164.495us 27.55% 164.495us 54.832us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.80% 4.790us 0.80% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 493.757us
-Self CUDA time total: 4.863us
+Self CPU time total: 596.995us
+Self CUDA time total: 4.864us
@@ -4105,19 +4105,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 126.463us 1179.69% 126.463us 126.463us 1
- hf_kernels_causal_conv1d 4.44% 79.793us 99.69% 1.793ms 1.793ms 0.000us 0.00% 14.304us 14.304us 1
- CausalConv1dFn 3.96% 71.252us 95.25% 1.713ms 571.037us 0.000us 0.00% 14.304us 4.768us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.37% 24.661us 89.51% 1.610ms 536.652us 10.720us 100.00% 14.304us 4.768us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.720us 100.00% 10.720us 3.573us 3
- Activity Buffer Request 79.30% 1.426ms 79.30% 1.426ms 1.426ms 3.584us 33.43% 3.584us 3.584us 1
- aten::empty_like 0.54% 9.750us 1.77% 31.901us 10.634us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.23% 22.151us 1.23% 22.151us 7.384us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 8.84% 159.036us 8.84% 159.036us 53.012us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.31% 5.660us 0.31% 5.660us 5.660us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 128.031us 1201.49% 128.031us 128.031us 1
+ hf_kernels_causal_conv1d 5.58% 105.873us 99.72% 1.893ms 1.893ms 0.000us 0.00% 14.208us 14.208us 1
+ CausalConv1dFn 4.13% 78.341us 94.14% 1.787ms 595.748us 0.000us 0.00% 14.208us 4.736us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.45% 27.570us 88.49% 1.680ms 559.957us 10.656us 100.00% 14.208us 4.736us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.656us 100.00% 10.656us 3.552us 3
+ Activity Buffer Request 77.94% 1.480ms 77.94% 1.480ms 1.480ms 3.552us 33.33% 3.552us 3.552us 1
+ aten::empty_like 0.41% 7.812us 1.53% 29.032us 9.677us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.12% 21.220us 1.12% 21.220us 7.073us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 9.09% 172.624us 9.09% 172.624us 57.541us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.28% 5.330us 0.28% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.799ms
-Self CUDA time total: 10.720us
+Self CPU time total: 1.898ms
+Self CUDA time total: 10.656us
@@ -4127,19 +4127,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.490us 1115.98% 122.490us 122.490us 1
- hf_kernels_causal_conv1d 17.58% 82.141us 98.94% 462.145us 462.145us 0.000us 0.00% 14.656us 14.656us 1
- CausalConv1dFn 15.46% 72.195us 81.35% 380.004us 126.668us 0.000us 0.00% 14.656us 4.885us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.51% 25.720us 59.56% 278.229us 92.743us 10.976us 100.00% 14.656us 4.885us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.976us 100.00% 10.976us 3.659us 3
- Activity Buffer Request 20.67% 96.553us 20.67% 96.553us 96.553us 3.680us 33.53% 3.680us 3.680us 1
- aten::empty_like 1.79% 8.340us 6.33% 29.580us 9.860us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.55% 21.240us 4.55% 21.240us 7.080us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 33.39% 155.956us 33.39% 155.956us 51.985us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.06% 4.970us 1.06% 4.970us 4.970us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.524us 1119.66% 122.524us 122.524us 1
+ hf_kernels_causal_conv1d 19.00% 100.263us 99.02% 522.563us 522.563us 0.000us 0.00% 14.623us 14.623us 1
+ CausalConv1dFn 14.56% 76.813us 80.02% 422.300us 140.767us 0.000us 0.00% 14.623us 4.874us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.04% 26.621us 60.06% 316.927us 105.642us 10.943us 100.00% 14.623us 4.874us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.943us 100.00% 10.943us 3.648us 3
+ Activity Buffer Request 24.63% 129.993us 24.63% 129.993us 129.993us 3.680us 33.63% 3.680us 3.680us 1
+ aten::empty_like 1.53% 8.070us 5.41% 28.560us 9.520us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.88% 20.490us 3.88% 20.490us 6.830us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 30.38% 160.313us 30.38% 160.313us 53.438us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.98% 5.160us 0.98% 5.160us 5.160us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 467.115us
-Self CUDA time total: 10.976us
+Self CPU time total: 527.723us
+Self CUDA time total: 10.943us
@@ -4149,18 +4149,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 128.671us 1165.50% 128.671us 128.671us 1
- hf_kernels_causal_conv1d 4.51% 81.351us 99.72% 1.798ms 1.798ms 0.000us 0.00% 14.784us 14.784us 1
- CausalConv1dFn 4.05% 73.093us 95.21% 1.717ms 572.174us 0.000us 0.00% 14.784us 4.928us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.34% 24.081us 89.39% 1.612ms 537.183us 11.040us 100.00% 14.784us 4.928us 3
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 130.879us 1185.50% 130.879us 130.879us 1
+ hf_kernels_causal_conv1d 6.10% 112.423us 99.71% 1.839ms 1.839ms 0.000us 0.00% 14.752us 14.752us 1
+ CausalConv1dFn 4.42% 81.553us 93.62% 1.726ms 575.457us 0.000us 0.00% 14.752us 4.917us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.34% 24.629us 87.45% 1.613ms 537.533us 11.040us 100.00% 14.752us 4.917us 3
void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 11.040us 100.00% 11.040us 3.680us 3
- Activity Buffer Request 79.34% 1.430ms 79.34% 1.430ms 1.430ms 3.744us 33.91% 3.744us 3.744us 1
- aten::empty_like 0.49% 8.921us 1.77% 31.881us 10.627us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.27% 22.960us 1.27% 22.960us 7.653us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 8.72% 157.177us 8.72% 157.177us 52.392us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.28% 4.970us 0.28% 4.970us 4.970us 0.000us 0.00% 0.000us 0.000us 1
+ Activity Buffer Request 77.44% 1.428ms 77.44% 1.428ms 1.428ms 3.712us 33.62% 3.712us 3.712us 1
+ aten::empty_like 0.46% 8.560us 1.75% 32.220us 10.740us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.28% 23.660us 1.28% 23.660us 7.887us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 8.67% 159.915us 8.67% 159.915us 53.305us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.29% 5.260us 0.29% 5.260us 5.260us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.803ms
+Self CPU time total: 1.844ms
Self CUDA time total: 11.040us
@@ -4171,19 +4171,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 125.762us 1085.65% 125.762us 125.762us 1
- hf_kernels_causal_conv1d 16.83% 79.002us 98.82% 463.887us 463.887us 0.000us 0.00% 15.360us 15.360us 1
- CausalConv1dFn 15.62% 73.323us 81.99% 384.885us 128.295us 0.000us 0.00% 15.360us 5.120us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.37% 25.230us 59.95% 281.430us 93.810us 11.584us 100.00% 15.360us 5.120us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 11.584us 100.00% 11.584us 3.861us 3
- Activity Buffer Request 20.79% 97.593us 20.79% 97.593us 97.593us 3.776us 32.60% 3.776us 3.776us 1
- aten::empty_like 1.82% 8.531us 6.42% 30.132us 10.044us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.60% 21.601us 4.60% 21.601us 7.200us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 33.79% 158.607us 33.79% 158.607us 52.869us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.18% 5.530us 1.18% 5.530us 5.530us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.988us 1097.16% 124.988us 124.988us 1
+ hf_kernels_causal_conv1d 14.68% 75.042us 98.95% 505.802us 505.802us 0.000us 0.00% 15.232us 15.232us 1
+ CausalConv1dFn 15.20% 77.712us 84.27% 430.760us 143.587us 0.000us 0.00% 15.232us 5.077us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.71% 24.091us 63.54% 324.777us 108.259us 11.392us 100.00% 15.232us 5.077us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 11.392us 100.00% 11.392us 3.797us 3
+ Activity Buffer Request 26.66% 136.263us 26.66% 136.263us 136.263us 3.840us 33.71% 3.840us 3.840us 1
+ aten::empty_like 1.46% 7.441us 5.53% 28.271us 9.424us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.08% 20.830us 4.08% 20.830us 6.943us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 32.17% 164.423us 32.17% 164.423us 54.808us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.05% 5.351us 1.05% 5.351us 5.351us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 469.417us
-Self CUDA time total: 11.584us
+Self CPU time total: 511.153us
+Self CUDA time total: 11.392us
@@ -4193,19 +4193,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 134.046us 264.80% 134.046us 134.046us 1
- hf_kernels_causal_conv1d 4.19% 76.942us 99.71% 1.832ms 1.832ms 0.000us 0.00% 84.285us 84.285us 1
- CausalConv1dFn 4.10% 75.381us 95.52% 1.755ms 585.044us 0.000us 0.00% 84.285us 28.095us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.30% 23.952us 89.70% 1.648ms 549.413us 50.622us 100.00% 84.285us 28.095us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 50.622us 100.00% 50.622us 16.874us 3
- Activity Buffer Request 78.71% 1.446ms 78.71% 1.446ms 1.446ms 33.663us 66.50% 33.663us 33.663us 1
- aten::empty_like 0.54% 9.991us 1.71% 31.512us 10.504us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.17% 21.521us 1.17% 21.521us 7.174us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 9.69% 177.966us 9.69% 177.966us 59.322us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.29% 5.380us 0.29% 5.380us 5.380us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 131.775us 262.12% 131.775us 131.775us 1
+ hf_kernels_causal_conv1d 8.81% 77.263us 99.39% 871.362us 871.362us 0.000us 0.00% 83.680us 83.680us 1
+ CausalConv1dFn 8.68% 76.121us 90.57% 794.099us 264.700us 0.000us 0.00% 83.680us 27.893us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 3.02% 26.501us 78.58% 688.947us 229.649us 50.272us 100.00% 83.680us 27.893us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 50.272us 100.00% 50.272us 16.757us 3
+ Activity Buffer Request 55.77% 488.972us 55.77% 488.972us 488.972us 33.408us 66.45% 33.408us 33.408us 1
+ aten::empty_like 0.92% 8.040us 3.31% 29.031us 9.677us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 2.39% 20.991us 2.39% 20.991us 6.997us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 19.79% 173.474us 19.79% 173.474us 57.825us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.61% 5.370us 0.61% 5.370us 5.370us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.837ms
-Self CUDA time total: 50.622us
+Self CPU time total: 876.732us
+Self CUDA time total: 50.272us
@@ -4215,19 +4215,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.639us 241.17% 124.639us 124.639us 1
- hf_kernels_causal_conv1d 12.15% 73.652us 99.08% 600.632us 600.632us 0.000us 0.00% 86.272us 86.272us 1
- CausalConv1dFn 11.76% 71.283us 86.93% 526.980us 175.660us 0.000us 0.00% 86.272us 28.757us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 4.05% 24.580us 70.27% 425.965us 141.988us 51.680us 100.00% 86.272us 28.757us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 51.680us 100.00% 51.680us 17.227us 3
- Activity Buffer Request 38.62% 234.139us 38.62% 234.139us 234.139us 34.592us 66.93% 34.592us 34.592us 1
- aten::empty_like 1.31% 7.952us 4.90% 29.732us 9.911us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 3.59% 21.780us 3.59% 21.780us 7.260us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 27.59% 167.246us 27.59% 167.246us 55.749us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.92% 5.560us 0.92% 5.560us 5.560us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 127.295us 247.23% 127.295us 127.295us 1
+ hf_kernels_causal_conv1d 15.09% 77.332us 99.04% 507.562us 507.562us 0.000us 0.00% 86.016us 86.016us 1
+ CausalConv1dFn 14.68% 75.241us 83.95% 430.230us 143.410us 0.000us 0.00% 86.016us 28.672us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.05% 25.861us 63.40% 324.927us 108.309us 51.488us 100.00% 86.016us 28.672us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 51.488us 100.00% 51.488us 17.163us 3
+ Activity Buffer Request 25.26% 129.463us 25.26% 129.463us 129.463us 34.528us 67.06% 34.528us 34.528us 1
+ aten::empty_like 1.67% 8.561us 5.87% 30.062us 10.021us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.20% 21.501us 4.20% 21.501us 7.167us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 33.09% 169.603us 33.09% 169.603us 56.534us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.96% 4.929us 0.96% 4.929us 4.929us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 606.192us
-Self CUDA time total: 51.680us
+Self CPU time total: 512.491us
+Self CUDA time total: 51.488us
@@ -4237,18 +4237,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 117.184us 3001.64% 117.184us 117.184us 1
- hf_kernels_causal_conv1d 11.99% 71.634us 99.07% 591.661us 591.661us 0.000us 0.00% 5.152us 5.152us 1
- CausalConv1dFn 11.65% 69.552us 87.08% 520.027us 173.342us 0.000us 0.00% 5.152us 1.717us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 4.09% 24.400us 70.30% 419.834us 139.945us 3.904us 100.00% 5.152us 1.717us 3
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 121.214us 3104.87% 121.214us 121.214us 1
+ hf_kernels_causal_conv1d 8.71% 75.123us 99.37% 856.672us 856.672us 0.000us 0.00% 5.184us 5.184us 1
+ CausalConv1dFn 8.55% 73.741us 90.66% 781.549us 260.516us 0.000us 0.00% 5.184us 1.728us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 2.92% 25.150us 78.63% 677.857us 225.952us 3.904us 100.00% 5.184us 1.728us 3
void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.904us 100.00% 3.904us 1.301us 3
- Activity Buffer Request 39.52% 236.029us 39.52% 236.029us 236.029us 1.248us 31.97% 1.248us 1.248us 1
- aten::empty_like 1.39% 8.281us 5.13% 30.641us 10.214us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 3.74% 22.360us 3.74% 22.360us 7.453us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 26.69% 159.405us 26.69% 159.405us 53.135us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.93% 5.550us 0.93% 5.550us 5.550us 0.000us 0.00% 0.000us 0.000us 1
+ Activity Buffer Request 56.24% 484.832us 56.24% 484.832us 484.832us 1.280us 32.79% 1.280us 1.280us 1
+ aten::empty_like 1.08% 9.311us 3.47% 29.951us 9.984us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 2.39% 20.640us 2.39% 20.640us 6.880us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 19.47% 167.875us 19.47% 167.875us 55.958us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.63% 5.440us 0.63% 5.440us 5.440us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 597.211us
+Self CPU time total: 862.112us
Self CUDA time total: 3.904us
@@ -4259,19 +4259,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 129.214us 3308.94% 129.214us 129.214us 1
- hf_kernels_causal_conv1d 14.44% 74.841us 98.93% 512.678us 512.678us 0.000us 0.00% 5.154us 5.154us 1
- CausalConv1dFn 14.14% 73.283us 84.49% 437.837us 145.946us 0.000us 0.00% 5.154us 1.718us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 6.57% 34.031us 64.55% 334.472us 111.491us 3.905us 100.00% 5.154us 1.718us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.905us 100.00% 3.905us 1.302us 3
- Activity Buffer Request 27.83% 144.225us 27.83% 144.225us 144.225us 1.249us 31.98% 1.249us 1.249us 1
- aten::empty_like 1.69% 8.750us 5.81% 30.082us 10.027us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.12% 21.332us 4.12% 21.332us 7.111us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 30.15% 156.216us 30.15% 156.216us 52.072us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.07% 5.520us 1.07% 5.520us 5.520us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 121.438us 3086.10% 121.438us 121.438us 1
+ hf_kernels_causal_conv1d 15.37% 74.422us 98.89% 478.921us 478.921us 0.000us 0.00% 5.183us 5.183us 1
+ CausalConv1dFn 15.69% 75.972us 83.52% 404.499us 134.833us 0.000us 0.00% 5.183us 1.728us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.44% 26.330us 61.72% 298.936us 99.645us 3.935us 100.00% 5.183us 1.728us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.935us 100.00% 3.935us 1.312us 3
+ Activity Buffer Request 23.74% 114.963us 23.74% 114.963us 114.963us 1.248us 31.72% 1.248us 1.248us 1
+ aten::empty_like 1.57% 7.609us 6.11% 29.591us 9.864us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.54% 21.982us 4.54% 21.982us 7.327us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 32.55% 157.643us 32.55% 157.643us 52.548us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.11% 5.391us 1.11% 5.391us 5.391us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 518.198us
-Self CUDA time total: 3.905us
+Self CPU time total: 484.312us
+Self CUDA time total: 3.935us
@@ -4281,19 +4281,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 118.525us 2939.61% 118.525us 118.525us 1
- hf_kernels_causal_conv1d 13.97% 75.404us 99.13% 534.960us 534.960us 0.000us 0.00% 5.376us 5.376us 1
- CausalConv1dFn 13.10% 70.683us 85.16% 459.556us 153.185us 0.000us 0.00% 5.376us 1.792us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 4.73% 25.549us 66.42% 358.442us 119.481us 4.032us 100.00% 5.376us 1.792us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.032us 100.00% 4.032us 1.344us 3
- Activity Buffer Request 32.81% 177.046us 32.81% 177.046us 177.046us 1.344us 33.33% 1.344us 1.344us 1
- aten::empty_like 1.62% 8.721us 5.64% 30.431us 10.144us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.02% 21.710us 4.02% 21.710us 7.237us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 28.88% 155.847us 28.88% 155.847us 51.949us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.87% 4.710us 0.87% 4.710us 4.710us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 152.157us 3744.94% 152.157us 152.157us 1
+ hf_kernels_causal_conv1d 10.88% 77.931us 99.21% 710.327us 710.327us 0.000us 0.00% 5.407us 5.407us 1
+ CausalConv1dFn 11.39% 81.522us 88.32% 632.396us 210.799us 0.000us 0.00% 5.407us 1.802us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 3.86% 27.639us 72.73% 520.742us 173.581us 4.063us 100.00% 5.407us 1.802us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3
+ Activity Buffer Request 44.05% 315.408us 44.05% 315.408us 315.408us 1.344us 33.08% 1.344us 1.344us 1
+ aten::empty_like 1.15% 8.200us 4.21% 30.132us 10.044us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.06% 21.932us 3.06% 21.932us 7.311us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 24.82% 177.695us 24.82% 177.695us 59.232us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.79% 5.681us 0.79% 5.681us 5.681us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 539.670us
-Self CUDA time total: 4.032us
+Self CPU time total: 716.008us
+Self CUDA time total: 4.063us
@@ -4303,19 +4303,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 115.905us 2852.70% 115.905us 115.905us 1
- hf_kernels_causal_conv1d 16.16% 74.143us 98.83% 453.315us 453.315us 0.000us 0.00% 5.407us 5.407us 1
- CausalConv1dFn 14.93% 68.471us 82.67% 379.172us 126.391us 0.000us 0.00% 5.407us 1.802us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.63% 25.811us 61.32% 281.280us 93.760us 4.063us 100.00% 5.407us 1.802us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3
- Activity Buffer Request 21.83% 100.113us 21.83% 100.113us 100.113us 1.344us 33.08% 1.344us 1.344us 1
- aten::empty_like 1.88% 8.641us 6.41% 29.421us 9.807us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.53% 20.780us 4.53% 20.780us 6.927us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 33.87% 155.356us 33.87% 155.356us 51.785us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.17% 5.370us 1.17% 5.370us 5.370us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 119.936us 2951.18% 119.936us 119.936us 1
+ hf_kernels_causal_conv1d 15.86% 75.552us 99.00% 471.672us 471.672us 0.000us 0.00% 5.440us 5.440us 1
+ CausalConv1dFn 16.03% 76.383us 83.14% 396.120us 132.040us 0.000us 0.00% 5.440us 1.813us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.35% 25.480us 61.26% 291.866us 97.289us 4.064us 100.00% 5.440us 1.813us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.064us 100.00% 4.064us 1.355us 3
+ Activity Buffer Request 23.14% 110.243us 23.14% 110.243us 110.243us 1.376us 33.86% 1.376us 1.376us 1
+ aten::empty_like 1.53% 7.269us 5.85% 27.871us 9.290us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.32% 20.602us 4.32% 20.602us 6.867us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 32.77% 156.143us 32.77% 156.143us 52.048us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.00% 4.760us 1.00% 4.760us 4.760us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 458.685us
-Self CUDA time total: 4.063us
+Self CPU time total: 476.432us
+Self CUDA time total: 4.064us
@@ -4325,19 +4325,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.141us 2271.97% 122.141us 122.141us 1
- hf_kernels_causal_conv1d 11.82% 75.911us 99.15% 636.712us 636.712us 0.000us 0.00% 7.200us 7.200us 1
- CausalConv1dFn 11.01% 70.722us 87.33% 560.801us 186.934us 0.000us 0.00% 7.200us 2.400us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 4.24% 27.210us 71.66% 460.136us 153.379us 5.376us 100.00% 7.200us 2.400us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.376us 100.00% 5.376us 1.792us 3
- Activity Buffer Request 43.06% 276.540us 43.06% 276.540us 276.540us 1.824us 33.93% 1.824us 1.824us 1
- aten::empty_like 1.25% 8.002us 4.66% 29.943us 9.981us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 3.42% 21.941us 3.42% 21.941us 7.314us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 24.35% 156.386us 24.35% 156.386us 52.129us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.85% 5.440us 0.85% 5.440us 5.440us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 129.888us 2401.78% 129.888us 129.888us 1
+ hf_kernels_causal_conv1d 13.50% 106.873us 99.32% 785.980us 785.980us 0.000us 0.00% 7.264us 7.264us 1
+ CausalConv1dFn 10.04% 79.422us 85.81% 679.107us 226.369us 0.000us 0.00% 7.264us 2.421us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 3.32% 26.310us 72.10% 570.564us 190.188us 5.408us 100.00% 7.264us 2.421us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.408us 100.00% 5.408us 1.803us 3
+ Activity Buffer Request 48.81% 386.260us 48.81% 386.260us 386.260us 1.856us 34.32% 1.856us 1.856us 1
+ aten::empty_like 1.01% 7.981us 3.68% 29.121us 9.707us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 2.67% 21.140us 2.67% 21.140us 7.047us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 19.96% 157.994us 19.96% 157.994us 52.665us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.68% 5.410us 0.68% 5.410us 5.410us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 642.152us
-Self CUDA time total: 5.376us
+Self CPU time total: 791.390us
+Self CUDA time total: 5.408us
@@ -4347,19 +4347,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 117.822us 2140.66% 117.822us 117.822us 1
- hf_kernels_causal_conv1d 16.30% 72.964us 98.80% 442.326us 442.326us 0.000us 0.00% 7.392us 7.392us 1
- CausalConv1dFn 16.19% 72.472us 82.50% 369.362us 123.121us 0.000us 0.00% 7.392us 2.464us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.63% 25.211us 59.71% 267.319us 89.106us 5.504us 100.00% 7.392us 2.464us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.504us 100.00% 5.504us 1.835us 3
- Activity Buffer Request 19.35% 86.632us 19.35% 86.632us 86.632us 1.888us 34.30% 1.888us 1.888us 1
- aten::empty_like 1.85% 8.281us 6.60% 29.571us 9.857us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.76% 21.290us 4.76% 21.290us 7.097us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 34.73% 155.476us 34.73% 155.476us 51.825us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.20% 5.391us 1.20% 5.391us 5.391us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 118.463us 2151.92% 118.463us 118.463us 1
+ hf_kernels_causal_conv1d 19.47% 96.181us 98.96% 488.812us 488.812us 0.000us 0.00% 7.393us 7.393us 1
+ CausalConv1dFn 15.19% 75.044us 79.49% 392.631us 130.877us 0.000us 0.00% 7.393us 2.464us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.31% 26.241us 58.39% 288.397us 96.132us 5.505us 100.00% 7.393us 2.464us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.505us 100.00% 5.505us 1.835us 3
+ Activity Buffer Request 21.50% 106.222us 21.50% 106.222us 106.222us 1.888us 34.30% 1.888us 1.888us 1
+ aten::empty_like 1.50% 7.390us 5.91% 29.190us 9.730us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.41% 21.800us 4.41% 21.800us 7.267us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 31.57% 155.934us 31.57% 155.934us 51.978us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.04% 5.140us 1.04% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 447.717us
-Self CUDA time total: 5.504us
+Self CPU time total: 493.952us
+Self CUDA time total: 5.505us
@@ -4369,19 +4369,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 125.728us 716.97% 125.728us 125.728us 1
- hf_kernels_causal_conv1d 11.80% 75.821us 99.14% 637.002us 637.002us 0.000us 0.00% 23.392us 23.392us 1
- CausalConv1dFn 11.24% 72.243us 87.34% 561.181us 187.060us 0.000us 0.00% 23.392us 7.797us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 4.08% 26.210us 71.24% 457.746us 152.582us 17.536us 100.00% 23.392us 7.797us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.536us 100.00% 17.536us 5.845us 3
- Activity Buffer Request 42.92% 275.770us 42.92% 275.770us 275.770us 5.856us 33.39% 5.856us 5.856us 1
- aten::empty_like 1.45% 9.311us 4.85% 31.192us 10.397us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 3.41% 21.881us 3.41% 21.881us 7.294us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 24.24% 155.766us 24.24% 155.766us 51.922us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.86% 5.550us 0.86% 5.550us 5.550us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 129.279us 741.28% 129.279us 129.279us 1
+ hf_kernels_causal_conv1d 5.08% 91.861us 99.73% 1.805ms 1.805ms 0.000us 0.00% 23.296us 23.296us 1
+ CausalConv1dFn 4.24% 76.815us 94.65% 1.713ms 571.078us 0.000us 0.00% 23.296us 7.765us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.42% 25.791us 88.76% 1.607ms 535.516us 17.440us 100.00% 23.296us 7.765us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.440us 100.00% 17.440us 5.813us 3
+ Activity Buffer Request 78.65% 1.424ms 78.65% 1.424ms 1.424ms 5.856us 33.58% 5.856us 5.856us 1
+ aten::empty_like 0.47% 8.500us 1.65% 29.870us 9.957us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.18% 21.370us 1.18% 21.370us 7.123us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 8.68% 157.163us 8.68% 157.163us 52.388us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.27% 4.911us 0.27% 4.911us 4.911us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 642.552us
-Self CUDA time total: 17.536us
+Self CPU time total: 1.810ms
+Self CUDA time total: 17.440us
@@ -4391,19 +4391,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 123.901us 690.22% 123.901us 123.901us 1
- hf_kernels_causal_conv1d 16.99% 75.711us 98.78% 440.245us 440.245us 0.000us 0.00% 23.967us 23.967us 1
- CausalConv1dFn 15.81% 70.471us 81.79% 364.534us 121.511us 0.000us 0.00% 23.967us 7.989us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.65% 25.192us 59.40% 264.751us 88.250us 17.951us 100.00% 23.967us 7.989us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.951us 100.00% 17.951us 5.984us 3
- Activity Buffer Request 18.53% 82.593us 18.53% 82.593us 82.593us 6.016us 33.51% 6.016us 6.016us 1
- aten::empty_like 1.75% 7.802us 6.58% 29.312us 9.771us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.83% 21.510us 4.83% 21.510us 7.170us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 35.22% 156.966us 35.22% 156.966us 52.322us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.22% 5.440us 1.22% 5.440us 5.440us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 139.324us 772.01% 139.324us 139.324us 1
+ hf_kernels_causal_conv1d 18.68% 93.362us 99.02% 494.883us 494.883us 0.000us 0.00% 24.095us 24.095us 1
+ CausalConv1dFn 17.38% 86.843us 80.34% 401.521us 133.840us 0.000us 0.00% 24.095us 8.032us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.36% 26.789us 57.15% 285.628us 95.209us 18.047us 100.00% 24.095us 8.032us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 18.047us 100.00% 18.047us 6.016us 3
+ Activity Buffer Request 20.49% 102.403us 20.49% 102.403us 102.403us 6.048us 33.51% 6.048us 6.048us 1
+ aten::empty_like 1.48% 7.399us 5.81% 29.050us 9.683us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.33% 21.651us 4.33% 21.651us 7.217us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 31.30% 156.436us 31.30% 156.436us 52.145us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.98% 4.890us 0.98% 4.890us 4.890us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 445.685us
-Self CUDA time total: 17.951us
+Self CPU time total: 499.773us
+Self CUDA time total: 18.047us
@@ -4413,19 +4413,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 131.804us 730.34% 131.804us 131.804us 1
- hf_kernels_causal_conv1d 11.57% 77.592us 99.18% 665.133us 665.133us 0.000us 0.00% 24.094us 24.094us 1
- CausalConv1dFn 10.93% 73.321us 87.61% 587.541us 195.847us 0.000us 0.00% 24.094us 8.031us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 3.40% 22.811us 71.94% 482.478us 160.826us 18.047us 100.00% 24.094us 8.031us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 18.047us 100.00% 18.047us 6.016us 3
- Activity Buffer Request 44.54% 298.731us 44.54% 298.731us 298.731us 6.047us 33.51% 6.047us 6.047us 1
- aten::empty_like 1.35% 9.049us 4.73% 31.742us 10.581us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 3.38% 22.693us 3.38% 22.693us 7.564us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 24.00% 160.936us 24.00% 160.936us 53.645us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.82% 5.510us 0.82% 5.510us 5.510us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 135.103us 748.58% 135.103us 135.103us 1
+ hf_kernels_causal_conv1d 5.37% 98.434us 99.69% 1.829ms 1.829ms 0.000us 0.00% 24.097us 24.097us 1
+ CausalConv1dFn 4.35% 79.821us 94.33% 1.730ms 576.697us 0.000us 0.00% 24.097us 8.032us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.36% 24.912us 88.33% 1.620ms 540.010us 18.048us 100.00% 24.097us 8.032us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 18.048us 100.00% 18.048us 6.016us 3
+ Activity Buffer Request 77.78% 1.427ms 77.78% 1.427ms 1.427ms 6.049us 33.52% 6.049us 6.049us 1
+ aten::empty_like 0.47% 8.550us 1.65% 30.240us 10.080us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.18% 21.690us 1.18% 21.690us 7.230us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 9.19% 168.514us 9.19% 168.514us 56.171us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.31% 5.620us 0.31% 5.620us 5.620us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 670.643us
-Self CUDA time total: 18.047us
+Self CPU time total: 1.834ms
+Self CUDA time total: 18.048us
@@ -4435,19 +4435,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.267us 637.87% 122.267us 122.267us 1
- hf_kernels_causal_conv1d 16.94% 75.003us 98.82% 437.665us 437.665us 0.000us 0.00% 25.632us 25.632us 1
- CausalConv1dFn 15.90% 70.409us 81.89% 362.662us 120.887us 0.000us 0.00% 25.632us 8.544us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.97% 26.462us 59.15% 261.981us 87.327us 19.168us 100.00% 25.632us 8.544us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 19.168us 100.00% 19.168us 6.389us 3
- Activity Buffer Request 18.04% 79.883us 18.04% 79.883us 79.883us 6.464us 33.72% 6.464us 6.464us 1
- aten::empty_like 2.06% 9.102us 6.84% 30.272us 10.091us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.78% 21.170us 4.78% 21.170us 7.057us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 35.14% 155.636us 35.14% 155.636us 51.879us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.18% 5.220us 1.18% 5.220us 5.220us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 130.684us 694.54% 130.684us 130.684us 1
+ hf_kernels_causal_conv1d 18.98% 97.223us 99.02% 507.183us 507.183us 0.000us 0.00% 25.120us 25.120us 1
+ CausalConv1dFn 14.58% 74.692us 80.04% 409.960us 136.653us 0.000us 0.00% 25.120us 8.373us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 6.51% 33.321us 59.71% 305.838us 101.946us 18.816us 100.00% 25.120us 8.373us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 18.816us 100.00% 18.816us 6.272us 3
+ Activity Buffer Request 22.33% 114.353us 22.33% 114.353us 114.353us 6.304us 33.50% 6.304us 6.304us 1
+ aten::empty_like 1.71% 8.769us 5.75% 29.430us 9.810us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.03% 20.661us 4.03% 20.661us 6.887us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 30.88% 158.164us 30.88% 158.164us 52.721us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.98% 5.010us 0.98% 5.010us 5.010us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 442.885us
-Self CUDA time total: 19.168us
+Self CPU time total: 512.193us
+Self CUDA time total: 18.816us
@@ -4457,19 +4457,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 4.25% 77.621us 99.69% 1.822ms 1.822ms 0.000us 0.00% 163.007us 163.007us 1
- CausalConv1dFn 4.18% 76.374us 95.44% 1.744ms 581.328us 0.000us 0.00% 163.007us 54.336us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.34% 24.550us 89.50% 1.636ms 545.169us 97.983us 100.00% 163.007us 54.336us 3
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 142.719us 145.66% 142.719us 142.719us 1
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 97.983us 100.00% 97.983us 32.661us 3
- Activity Buffer Request 79.33% 1.450ms 79.33% 1.450ms 1.450ms 65.024us 66.36% 65.024us 65.024us 1
- aten::empty_like 0.51% 9.271us 1.76% 32.102us 10.701us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.25% 22.831us 1.25% 22.831us 7.610us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 8.83% 161.275us 8.83% 161.275us 53.758us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.31% 5.740us 0.31% 5.740us 5.740us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 6.14% 112.394us 99.70% 1.825ms 1.825ms 0.000us 0.00% 162.754us 162.754us 1
+ CausalConv1dFn 4.41% 80.651us 93.56% 1.713ms 570.927us 0.000us 0.00% 162.754us 54.251us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.37% 25.010us 87.54% 1.603ms 534.193us 97.985us 100.00% 162.754us 54.251us 3
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 144.737us 147.71% 144.737us 144.737us 1
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 97.985us 100.00% 97.985us 32.662us 3
+ Activity Buffer Request 77.36% 1.416ms 77.36% 1.416ms 1.416ms 64.769us 66.10% 64.769us 64.769us 1
+ aten::empty_like 0.49% 8.901us 1.61% 29.551us 9.850us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.13% 20.650us 1.13% 20.650us 6.883us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 8.82% 161.445us 8.82% 161.445us 53.815us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.30% 5.480us 0.30% 5.480us 5.480us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.827ms
-Self CUDA time total: 97.983us
+Self CPU time total: 1.831ms
+Self CUDA time total: 97.985us
@@ -4479,19 +4479,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 17.00% 78.131us 98.89% 454.476us 454.476us 0.000us 0.00% 164.440us 164.440us 1
- CausalConv1dFn 15.89% 73.024us 81.89% 376.345us 125.448us 0.000us 0.00% 164.440us 54.813us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.76% 26.451us 59.63% 274.060us 91.353us 98.939us 100.00% 164.440us 54.813us 3
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 139.130us 140.62% 139.130us 139.130us 1
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 98.939us 100.00% 98.939us 32.980us 3
- Activity Buffer Request 18.20% 83.643us 18.20% 83.643us 83.643us 65.501us 66.20% 65.501us 65.501us 1
- aten::empty_like 1.75% 8.030us 6.37% 29.261us 9.754us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.62% 21.231us 4.62% 21.231us 7.077us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 35.68% 163.966us 35.68% 163.966us 54.655us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.11% 5.111us 1.11% 5.111us 5.111us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 19.17% 96.654us 98.90% 498.573us 498.573us 0.000us 0.00% 163.900us 163.900us 1
+ CausalConv1dFn 15.33% 77.291us 79.73% 401.919us 133.973us 0.000us 0.00% 163.900us 54.633us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.17% 26.053us 58.73% 296.088us 98.696us 98.813us 100.00% 163.900us 54.633us 3
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 133.981us 135.59% 133.981us 133.981us 1
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 98.813us 100.00% 98.813us 32.938us 3
+ Activity Buffer Request 22.39% 112.882us 22.39% 112.882us 112.882us 65.087us 65.87% 65.087us 65.087us 1
+ aten::empty_like 1.55% 7.820us 5.66% 28.540us 9.513us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.11% 20.720us 4.11% 20.720us 6.907us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 31.17% 157.153us 31.17% 157.153us 52.384us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.10% 5.550us 1.10% 5.550us 5.550us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 459.587us
-Self CUDA time total: 98.939us
+Self CPU time total: 504.123us
+Self CUDA time total: 98.813us
impl wl p50(ms) ok
@@ -4502,11 +4502,11 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.05 True
-hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True
-hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.06 True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.06 True
hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.05 True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.06 True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.06 True
hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2 0.05 True
hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4 0.05 True
hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True
@@ -4517,20 +4517,18 @@ hf_kernels_causal_conv1d cuda_B4_D64_S128_W2 0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S128_W4 0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
-hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.06 True
hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
-Downloading hf-xet (3.2MiB)
- Downloading hf-xet
-Installed 52 packages in 211ms
+Installed 15 packages in 14ms
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
-Fetching 11 files: 64%|██████▎ | 7/11 [00:02<00:01, 3.26it/s]
-Fetching 11 files: 100%|██████████| 11/11 [00:02<00:00, 5.12it/s]
+Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:01, 3.95it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 6.21it/s]
Artifacts:
causal_conv1d.jsonl
diff --git a/causal_conv1d/impls/torch_causal_conv1d.html b/causal_conv1d/impls/torch_causal_conv1d.html
index a14fe1d8732e839025c8dec1c927653b8a3a02ff..2dd29f110a68d2d6a2cb36ff92b20f1c54eab64b 100644
--- a/causal_conv1d/impls/torch_causal_conv1d.html
+++ b/causal_conv1d/impls/torch_causal_conv1d.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ output
▶ uv-logs
|
-Cell: nv | 0.21s
+Cell: nv | 0.24s
|
▶ run
Copy
Raw
@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
-
Tue Oct 28 14:08:09 2025
+Wed Oct 29 14:27:09 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A 28C P0 80W / 350W | 0MiB / 46068MiB | 19% Default |
+| N/A 33C P0 109W / 350W | 0MiB / 46068MiB | 100% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
@@ -3918,9 +3918,9 @@ Cell: nv | 0.21s
▼ code
▼ output
- ▶ uv-logs
+ ▶ uv-logs
|
-Cell: benchmark | 3.63s
+Cell: benchmark | 7.23s
| ▶ run
Copy
Raw
@@ -3982,29 +3982,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 448.254us 2311.66% 448.254us 448.254us 1
- torch_eager 10.53% 223.197us 99.60% 2.112ms 2.112ms 0.000us 0.00% 21.727us 21.727us 1
- aten::to 0.57% 12.032us 79.33% 1.682ms 280.390us 0.000us 0.00% 14.304us 2.384us 6
- aten::_to_copy 1.82% 38.532us 78.77% 1.670ms 278.384us 0.000us 0.00% 14.304us 2.384us 6
- aten::copy_ 2.94% 62.272us 74.35% 1.577ms 262.784us 11.968us 61.72% 14.304us 2.384us 6
- aten::conv1d 0.36% 7.640us 7.60% 161.165us 53.722us 0.000us 0.00% 7.423us 2.474us 3
- aten::convolution 0.68% 14.400us 7.24% 153.525us 51.175us 0.000us 0.00% 7.423us 2.474us 3
- aten::_convolution 1.64% 34.820us 6.56% 139.125us 46.375us 0.000us 0.00% 7.423us 2.474us 3
- aten::_conv_depthwise2d 1.64% 34.779us 4.03% 85.503us 28.501us 7.423us 38.28% 7.423us 2.474us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.423us 38.28% 7.423us 2.474us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.51% 6.304us 2.101us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.21% 5.664us 1.888us 3
- Activity Buffer Request 68.27% 1.448ms 68.27% 1.448ms 1.448ms 2.336us 12.05% 2.336us 2.336us 1
- aten::empty_strided 2.60% 55.071us 2.60% 55.071us 9.178us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 4.35% 92.254us 4.35% 92.254us 10.250us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 1.39% 29.522us 1.76% 37.262us 4.140us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.59% 12.410us 0.59% 12.410us 0.827us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.52% 10.960us 0.52% 10.960us 3.653us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.67% 14.291us 0.67% 14.291us 4.764us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.39% 8.321us 0.47% 9.881us 3.294us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 465.824us 2410.10% 465.824us 465.824us 1
+ torch_eager 10.38% 221.098us 99.69% 2.123ms 2.123ms 0.000us 0.00% 21.632us 21.632us 1
+ aten::to 0.54% 11.460us 78.80% 1.678ms 279.633us 0.000us 0.00% 14.304us 2.384us 6
+ aten::_to_copy 2.14% 45.672us 78.26% 1.666ms 277.723us 0.000us 0.00% 14.304us 2.384us 6
+ aten::copy_ 2.97% 63.201us 73.51% 1.565ms 260.883us 12.000us 62.09% 14.304us 2.384us 6
+ aten::conv1d 0.45% 9.560us 8.33% 177.314us 59.105us 0.000us 0.00% 7.328us 2.443us 3
+ aten::convolution 0.76% 16.270us 7.88% 167.754us 55.918us 0.000us 0.00% 7.328us 2.443us 3
+ aten::_convolution 1.63% 34.781us 7.11% 151.484us 50.495us 0.000us 0.00% 7.328us 2.443us 3
+ aten::_conv_depthwise2d 2.18% 46.460us 4.51% 96.001us 32.000us 7.328us 37.91% 7.328us 2.443us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 37.91% 7.328us 2.443us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 32.45% 6.272us 2.091us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.64% 5.728us 1.909us 3
+ Activity Buffer Request 67.39% 1.435ms 67.39% 1.435ms 1.435ms 2.304us 11.92% 2.304us 2.304us 1
+ aten::empty_strided 2.60% 55.371us 2.60% 55.371us 9.228us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 4.37% 93.031us 4.37% 93.031us 10.337us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.44% 30.589us 1.81% 38.620us 4.291us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.63% 13.371us 0.63% 13.371us 0.891us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.55% 11.811us 0.55% 11.811us 3.937us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.56% 11.940us 0.56% 11.940us 3.980us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.37% 7.972us 0.46% 9.712us 3.237us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.121ms
-Self CUDA time total: 19.391us
+Self CPU time total: 2.129ms
+Self CUDA time total: 19.328us
@@ -4014,29 +4014,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.942us 1707.32% 334.942us 334.942us 1
- torch_eager 7.85% 148.604us 99.72% 1.887ms 1.887ms 0.000us 0.00% 21.731us 21.731us 1
- aten::to 0.32% 6.111us 83.97% 1.589ms 264.793us 0.000us 0.00% 13.731us 2.288us 6
- aten::_to_copy 1.27% 24.112us 83.64% 1.583ms 263.774us 0.000us 0.00% 13.731us 2.288us 6
- aten::copy_ 2.68% 50.691us 80.81% 1.529ms 254.829us 11.618us 59.22% 13.731us 2.288us 6
- aten::conv1d 0.29% 5.540us 6.41% 121.373us 40.458us 0.000us 0.00% 8.000us 2.667us 3
- aten::convolution 0.50% 9.420us 6.12% 115.833us 38.611us 0.000us 0.00% 8.000us 2.667us 3
- aten::_convolution 1.30% 24.670us 5.62% 106.413us 35.471us 0.000us 0.00% 8.000us 2.667us 3
- aten::_conv_depthwise2d 1.20% 22.792us 3.44% 65.133us 21.711us 8.000us 40.78% 8.000us 2.667us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.000us 40.78% 8.000us 2.667us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.049us 30.83% 6.049us 2.016us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.569us 28.39% 5.569us 1.856us 3
- Activity Buffer Request 75.63% 1.431ms 75.63% 1.431ms 1.431ms 2.113us 10.77% 2.113us 2.113us 1
- aten::empty_strided 1.56% 29.560us 1.56% 29.560us 4.927us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 3.72% 70.343us 3.72% 70.343us 7.816us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.90% 17.091us 1.18% 22.301us 2.478us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.48% 9.090us 0.48% 9.090us 0.606us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.50% 9.490us 0.50% 9.490us 3.163us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.52% 9.830us 0.52% 9.830us 3.277us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.34% 6.400us 0.42% 8.020us 2.673us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.863us 1691.38% 332.863us 332.863us 1
+ torch_eager 6.60% 126.115us 99.71% 1.906ms 1.906ms 0.000us 0.00% 21.792us 21.792us 1
+ aten::to 0.31% 5.930us 85.54% 1.635ms 272.467us 0.000us 0.00% 13.760us 2.293us 6
+ aten::_to_copy 1.30% 24.791us 85.23% 1.629ms 271.478us 0.000us 0.00% 13.760us 2.293us 6
+ aten::copy_ 2.71% 51.809us 82.30% 1.573ms 262.158us 11.648us 59.19% 13.760us 2.293us 6
+ aten::conv1d 0.31% 5.929us 6.17% 117.852us 39.284us 0.000us 0.00% 8.032us 2.677us 3
+ aten::convolution 0.53% 10.111us 5.86% 111.923us 37.308us 0.000us 0.00% 8.032us 2.677us 3
+ aten::_convolution 1.20% 22.951us 5.33% 101.812us 33.937us 0.000us 0.00% 8.032us 2.677us 3
+ aten::_conv_depthwise2d 1.20% 22.860us 3.35% 64.021us 21.340us 8.032us 40.81% 8.032us 2.677us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.032us 40.81% 8.032us 2.677us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.080us 30.89% 6.080us 2.027us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 28.29% 5.568us 1.856us 3
+ Activity Buffer Request 77.00% 1.472ms 77.00% 1.472ms 1.472ms 2.112us 10.73% 2.112us 2.112us 1
+ aten::empty_strided 1.63% 31.132us 1.63% 31.132us 5.189us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.70% 70.762us 3.70% 70.762us 7.862us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.87% 16.659us 1.16% 22.190us 2.466us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.46% 8.781us 0.46% 8.781us 0.585us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.55% 10.521us 0.55% 10.521us 3.507us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.49% 9.390us 0.49% 9.390us 3.130us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.29% 5.540us 0.35% 6.670us 2.223us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.892ms
-Self CUDA time total: 19.618us
+Self CPU time total: 1.911ms
+Self CUDA time total: 19.680us
@@ -4046,29 +4046,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 333.691us 1797.81% 333.691us 333.691us 1
- torch_eager 7.79% 146.606us 99.69% 1.876ms 1.876ms 0.000us 0.00% 20.481us 20.481us 1
- aten::to 0.31% 5.760us 84.09% 1.582ms 263.706us 0.000us 0.00% 13.569us 2.262us 6
- aten::_to_copy 1.25% 23.550us 83.79% 1.576ms 262.746us 0.000us 0.00% 13.569us 2.262us 6
- aten::copy_ 2.67% 50.153us 80.95% 1.523ms 253.847us 11.649us 62.76% 13.569us 2.262us 6
- aten::conv1d 0.31% 5.780us 6.33% 119.033us 39.678us 0.000us 0.00% 6.912us 2.304us 3
- aten::convolution 0.52% 9.800us 6.02% 113.253us 37.751us 0.000us 0.00% 6.912us 2.304us 3
- aten::_convolution 1.28% 24.000us 5.50% 103.453us 34.484us 0.000us 0.00% 6.912us 2.304us 3
- aten::_conv_depthwise2d 1.15% 21.640us 3.37% 63.473us 21.158us 6.912us 37.24% 6.912us 2.304us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.912us 37.24% 6.912us 2.304us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.953us 32.07% 5.953us 1.984us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 30.69% 5.696us 1.899us 3
- Activity Buffer Request 75.77% 1.426ms 75.77% 1.426ms 1.426ms 1.920us 10.34% 1.920us 1.920us 1
- aten::empty_strided 1.59% 29.840us 1.59% 29.840us 4.973us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 3.79% 71.241us 3.79% 71.241us 7.916us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.92% 17.220us 1.19% 22.362us 2.485us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.47% 8.782us 0.47% 8.782us 0.585us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.49% 9.312us 0.49% 9.312us 3.104us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.46% 8.581us 0.46% 8.581us 2.860us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.33% 6.290us 0.41% 7.740us 2.580us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 355.199us 1913.89% 355.199us 355.199us 1
+ torch_eager 6.67% 125.171us 99.71% 1.872ms 1.872ms 0.000us 0.00% 20.511us 20.511us 1
+ aten::to 0.32% 6.091us 84.23% 1.581ms 263.570us 0.000us 0.00% 13.600us 2.267us 6
+ aten::_to_copy 1.32% 24.859us 83.90% 1.575ms 262.555us 0.000us 0.00% 13.600us 2.267us 6
+ aten::copy_ 2.70% 50.760us 80.88% 1.518ms 253.083us 11.648us 62.76% 13.600us 2.267us 6
+ aten::conv1d 0.30% 5.670us 7.37% 138.423us 46.141us 0.000us 0.00% 6.911us 2.304us 3
+ aten::convolution 0.52% 9.720us 7.07% 132.753us 44.251us 0.000us 0.00% 6.911us 2.304us 3
+ aten::_convolution 1.24% 23.210us 6.55% 123.033us 41.011us 0.000us 0.00% 6.911us 2.304us 3
+ aten::_conv_depthwise2d 1.26% 23.712us 4.48% 84.033us 28.011us 6.911us 37.24% 6.911us 2.304us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.911us 37.24% 6.911us 2.304us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 32.24% 5.984us 1.995us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 30.52% 5.664us 1.888us 3
+ Activity Buffer Request 75.59% 1.419ms 75.59% 1.419ms 1.419ms 1.952us 10.52% 1.952us 1.952us 1
+ aten::empty_strided 1.70% 31.973us 1.70% 31.973us 5.329us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.83% 72.002us 3.83% 72.002us 8.000us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.89% 16.661us 1.15% 21.682us 2.409us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.48% 8.941us 0.48% 8.941us 0.596us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.49% 28.041us 1.49% 28.041us 9.347us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.47% 8.840us 0.47% 8.840us 2.947us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.32% 5.960us 0.40% 7.470us 2.490us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.882ms
-Self CUDA time total: 18.561us
+Self CPU time total: 1.878ms
+Self CUDA time total: 18.559us
@@ -4078,29 +4078,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.628us 1741.67% 341.628us 341.628us 1
- torch_eager 6.79% 135.276us 99.76% 1.989ms 1.989ms 0.000us 0.00% 21.759us 21.759us 1
- aten::to 0.31% 6.091us 85.44% 1.703ms 283.911us 0.000us 0.00% 14.111us 2.352us 6
- aten::_to_copy 1.20% 23.892us 85.13% 1.697ms 282.896us 0.000us 0.00% 14.111us 2.352us 6
- aten::copy_ 2.47% 49.180us 82.37% 1.642ms 273.716us 11.967us 61.01% 14.111us 2.352us 6
- aten::conv1d 0.29% 5.740us 6.09% 121.414us 40.471us 0.000us 0.00% 7.648us 2.549us 3
- aten::convolution 0.55% 11.061us 5.80% 115.674us 38.558us 0.000us 0.00% 7.648us 2.549us 3
- aten::_convolution 1.19% 23.780us 5.25% 104.613us 34.871us 0.000us 0.00% 7.648us 2.549us 3
- aten::_conv_depthwise2d 1.14% 22.750us 3.26% 64.953us 21.651us 7.648us 38.99% 7.648us 2.549us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.648us 38.99% 7.648us 2.549us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.175us 31.48% 6.175us 2.058us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.792us 29.53% 5.792us 1.931us 3
- Activity Buffer Request 68.82% 1.372ms 68.82% 1.372ms 1.372ms 2.144us 10.93% 2.144us 2.144us 1
- aten::empty_strided 1.56% 31.190us 1.56% 31.190us 5.198us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 12.22% 243.619us 12.22% 243.619us 27.069us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.88% 17.629us 1.14% 22.660us 2.518us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.44% 8.782us 0.44% 8.782us 0.585us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.48% 9.630us 0.48% 9.630us 3.210us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.50% 9.941us 0.50% 9.941us 3.314us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.34% 6.720us 0.41% 8.110us 2.703us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 335.841us 1714.87% 335.841us 335.841us 1
+ torch_eager 6.09% 125.084us 99.75% 2.047ms 2.047ms 0.000us 0.00% 21.728us 21.728us 1
+ aten::to 0.29% 6.012us 86.59% 1.777ms 296.210us 0.000us 0.00% 14.049us 2.341us 6
+ aten::_to_copy 1.18% 24.318us 86.30% 1.771ms 295.209us 0.000us 0.00% 14.049us 2.341us 6
+ aten::copy_ 2.44% 50.170us 83.64% 1.717ms 286.105us 11.905us 60.79% 14.049us 2.341us 6
+ aten::conv1d 0.29% 5.981us 5.73% 117.633us 39.211us 0.000us 0.00% 7.679us 2.560us 3
+ aten::convolution 0.48% 9.909us 5.44% 111.652us 37.217us 0.000us 0.00% 7.679us 2.560us 3
+ aten::_convolution 1.11% 22.712us 4.96% 101.743us 33.914us 0.000us 0.00% 7.679us 2.560us 3
+ aten::_conv_depthwise2d 1.08% 22.231us 3.11% 63.781us 21.260us 7.679us 39.21% 7.679us 2.560us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.679us 39.21% 7.679us 2.560us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 31.54% 6.176us 2.059us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.729us 29.25% 5.729us 1.910us 3
+ Activity Buffer Request 70.17% 1.440ms 70.17% 1.440ms 1.440ms 2.144us 10.95% 2.144us 2.144us 1
+ aten::empty_strided 1.48% 30.301us 1.48% 30.301us 5.050us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 12.02% 246.676us 12.02% 246.676us 27.408us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.85% 17.450us 1.12% 22.930us 2.548us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.44% 8.940us 0.44% 8.940us 0.596us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.47% 9.630us 0.47% 9.630us 3.210us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.56% 11.490us 0.56% 11.490us 3.830us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.28% 5.710us 0.34% 6.930us 2.310us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.994ms
-Self CUDA time total: 19.615us
+Self CPU time total: 2.053ms
+Self CUDA time total: 19.584us
@@ -4110,29 +4110,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.213us 1403.01% 341.213us 341.213us 1
- torch_eager 7.36% 148.867us 99.73% 2.016ms 2.016ms 0.000us 0.00% 26.560us 26.560us 1
- aten::to 0.30% 6.030us 84.88% 1.716ms 285.962us 0.000us 0.00% 15.168us 2.528us 6
- aten::_to_copy 1.20% 24.229us 84.58% 1.710ms 284.956us 0.000us 0.00% 15.168us 2.528us 6
- aten::copy_ 2.44% 49.414us 81.85% 1.655ms 275.782us 12.928us 53.16% 15.168us 2.528us 6
- aten::conv1d 0.28% 5.730us 5.99% 121.174us 40.391us 0.000us 0.00% 11.392us 3.797us 3
- aten::convolution 0.47% 9.480us 5.71% 115.444us 38.481us 0.000us 0.00% 11.392us 3.797us 3
- aten::_convolution 1.14% 23.073us 5.24% 105.964us 35.321us 0.000us 0.00% 11.392us 3.797us 3
- aten::_conv_depthwise2d 1.05% 21.189us 3.24% 65.411us 21.804us 11.392us 46.84% 11.392us 3.797us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.392us 46.84% 11.392us 3.797us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 27.11% 6.592us 2.197us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 26.05% 6.336us 2.112us 3
- Activity Buffer Request 70.12% 1.417ms 70.12% 1.417ms 1.417ms 2.240us 9.21% 2.240us 2.240us 1
- aten::empty_strided 1.52% 30.820us 1.52% 30.820us 5.137us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 10.45% 211.347us 10.45% 211.347us 23.483us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.95% 19.208us 1.23% 24.829us 2.759us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.46% 9.241us 0.46% 9.241us 0.616us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.47% 9.482us 0.47% 9.482us 3.161us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.55% 11.190us 0.55% 11.190us 3.730us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.34% 6.961us 0.41% 8.361us 2.787us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 329.565us 1339.31% 329.565us 329.565us 1
+ torch_eager 6.13% 122.184us 99.75% 1.990ms 1.990ms 0.000us 0.00% 26.911us 26.911us 1
+ aten::to 0.30% 5.979us 86.40% 1.724ms 287.259us 0.000us 0.00% 15.359us 2.560us 6
+ aten::_to_copy 1.37% 27.300us 86.10% 1.718ms 286.262us 0.000us 0.00% 15.359us 2.560us 6
+ aten::copy_ 2.45% 48.801us 83.22% 1.660ms 276.655us 13.055us 53.05% 15.359us 2.560us 6
+ aten::conv1d 0.29% 5.841us 5.86% 116.932us 38.977us 0.000us 0.00% 11.552us 3.851us 3
+ aten::convolution 0.50% 9.929us 5.57% 111.091us 37.030us 0.000us 0.00% 11.552us 3.851us 3
+ aten::_convolution 1.16% 23.192us 5.07% 101.162us 33.721us 0.000us 0.00% 11.552us 3.851us 3
+ aten::_conv_depthwise2d 1.12% 22.341us 3.11% 62.030us 20.677us 11.552us 46.95% 11.552us 3.851us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 46.95% 11.552us 3.851us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.688us 27.18% 6.688us 2.229us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.367us 25.87% 6.367us 2.122us 3
+ Activity Buffer Request 71.71% 1.430ms 71.71% 1.430ms 1.430ms 2.304us 9.36% 2.304us 2.304us 1
+ aten::empty_strided 1.52% 30.342us 1.52% 30.342us 5.057us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.06% 200.744us 10.06% 200.744us 22.305us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.86% 17.251us 1.14% 22.681us 2.520us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 9.051us 0.45% 9.051us 0.603us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.48% 9.579us 0.48% 9.579us 3.193us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.50% 10.050us 0.50% 10.050us 3.350us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.30% 6.019us 0.36% 7.270us 2.423us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.022ms
-Self CUDA time total: 24.320us
+Self CPU time total: 1.995ms
+Self CUDA time total: 24.607us
@@ -4142,29 +4142,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.330us 1285.14% 334.330us 334.330us 1
- torch_eager 7.10% 143.875us 99.74% 2.020ms 2.020ms 0.000us 0.00% 28.255us 28.255us 1
- aten::to 0.28% 5.680us 85.25% 1.727ms 287.810us 0.000us 0.00% 15.232us 2.539us 6
- aten::_to_copy 1.18% 23.873us 84.97% 1.721ms 286.863us 0.000us 0.00% 15.232us 2.539us 6
- aten::copy_ 2.45% 49.640us 82.36% 1.668ms 278.038us 12.992us 49.94% 15.232us 2.539us 6
- aten::conv1d 0.29% 5.889us 5.94% 120.414us 40.138us 0.000us 0.00% 13.023us 4.341us 3
- aten::convolution 0.46% 9.401us 5.65% 114.525us 38.175us 0.000us 0.00% 13.023us 4.341us 3
- aten::_convolution 1.22% 24.611us 5.19% 105.124us 35.041us 0.000us 0.00% 13.023us 4.341us 3
- aten::_conv_depthwise2d 1.06% 21.480us 3.19% 64.562us 21.521us 13.023us 50.06% 13.023us 4.341us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.023us 50.06% 13.023us 4.341us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 25.46% 6.624us 2.208us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 24.48% 6.368us 2.123us 3
- Activity Buffer Request 71.17% 1.442ms 71.17% 1.442ms 1.442ms 2.240us 8.61% 2.240us 2.240us 1
- aten::empty_strided 1.44% 29.082us 1.44% 29.082us 4.847us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.85% 199.548us 9.85% 199.548us 22.172us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.91% 18.470us 1.17% 23.650us 2.628us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.44% 8.970us 0.44% 8.970us 0.598us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.51% 10.400us 0.51% 10.400us 3.467us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.50% 10.200us 0.50% 10.200us 3.400us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.30% 6.091us 0.38% 7.621us 2.540us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 358.812us 1379.20% 358.812us 358.812us 1
+ torch_eager 6.94% 139.423us 99.75% 2.005ms 2.005ms 0.000us 0.00% 28.256us 28.256us 1
+ aten::to 0.33% 6.550us 85.45% 1.717ms 286.205us 0.000us 0.00% 15.199us 2.533us 6
+ aten::_to_copy 1.20% 24.182us 85.13% 1.711ms 285.114us 0.000us 0.00% 15.199us 2.533us 6
+ aten::copy_ 2.59% 52.130us 82.30% 1.654ms 275.648us 12.959us 49.81% 15.199us 2.533us 6
+ aten::conv1d 0.30% 6.120us 5.97% 119.993us 39.998us 0.000us 0.00% 13.057us 4.352us 3
+ aten::convolution 0.48% 9.660us 5.67% 113.873us 37.958us 0.000us 0.00% 13.057us 4.352us 3
+ aten::_convolution 1.13% 22.802us 5.19% 104.213us 34.738us 0.000us 0.00% 13.057us 4.352us 3
+ aten::_conv_depthwise2d 1.09% 21.932us 3.25% 65.242us 21.747us 13.057us 50.19% 13.057us 4.352us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.057us 50.19% 13.057us 4.352us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.623us 25.46% 6.623us 2.208us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 24.35% 6.336us 2.112us 3
+ Activity Buffer Request 70.68% 1.420ms 70.68% 1.420ms 1.420ms 2.240us 8.61% 2.240us 2.240us 1
+ aten::empty_strided 1.62% 32.611us 1.62% 32.611us 5.435us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.17% 204.364us 10.17% 204.364us 22.707us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.88% 17.647us 1.15% 23.189us 2.577us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.47% 9.382us 0.47% 9.382us 0.625us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.58% 11.651us 0.58% 11.651us 3.884us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.44% 8.769us 0.44% 8.769us 2.923us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.32% 6.420us 0.39% 7.890us 2.630us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.026ms
-Self CUDA time total: 26.015us
+Self CPU time total: 2.010ms
+Self CUDA time total: 26.016us
@@ -4174,29 +4174,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.315us 888.50% 340.315us 340.315us 1
- torch_eager 7.29% 147.016us 99.74% 2.012ms 2.012ms 0.000us 0.00% 40.894us 40.894us 1
- aten::conv1d 0.29% 5.920us 5.91% 119.264us 39.755us 0.000us 0.00% 22.496us 7.499us 3
- aten::convolution 0.47% 9.411us 5.62% 113.344us 37.781us 0.000us 0.00% 22.496us 7.499us 3
- aten::_convolution 1.19% 23.960us 5.15% 103.933us 34.644us 0.000us 0.00% 22.496us 7.499us 3
- aten::_conv_depthwise2d 1.11% 22.310us 3.18% 64.143us 21.381us 22.496us 58.73% 22.496us 7.499us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.496us 58.73% 22.496us 7.499us 3
- aten::to 0.29% 5.851us 85.12% 1.717ms 286.238us 0.000us 0.00% 18.398us 3.066us 6
- aten::_to_copy 1.17% 23.549us 84.83% 1.712ms 285.263us 0.000us 0.00% 18.398us 3.066us 6
- aten::copy_ 2.43% 48.960us 82.11% 1.657ms 276.121us 15.806us 41.27% 18.398us 3.066us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.416us 21.97% 8.416us 2.805us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.390us 19.29% 7.390us 2.463us 3
- Activity Buffer Request 70.87% 1.430ms 70.87% 1.430ms 1.430ms 2.592us 6.77% 2.592us 2.592us 1
- aten::empty_strided 1.55% 31.301us 1.55% 31.301us 5.217us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.91% 199.938us 9.91% 199.938us 22.215us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.87% 17.540us 1.13% 22.711us 2.523us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.44% 8.912us 0.44% 8.912us 0.594us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.47% 9.390us 0.47% 9.390us 3.130us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.51% 10.361us 0.51% 10.361us 3.454us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.30% 6.100us 0.37% 7.550us 2.517us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.896us 853.65% 328.896us 328.896us 1
+ torch_eager 6.29% 121.493us 99.73% 1.928ms 1.928ms 0.000us 0.00% 41.088us 41.088us 1
+ aten::conv1d 0.31% 5.961us 6.00% 115.903us 38.634us 0.000us 0.00% 22.688us 7.563us 3
+ aten::convolution 0.50% 9.600us 5.69% 109.942us 36.647us 0.000us 0.00% 22.688us 7.563us 3
+ aten::_convolution 1.16% 22.510us 5.19% 100.342us 33.447us 0.000us 0.00% 22.688us 7.563us 3
+ aten::_conv_depthwise2d 1.17% 22.551us 3.25% 62.881us 20.960us 22.688us 58.89% 22.688us 7.563us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.688us 58.89% 22.688us 7.563us 3
+ aten::to 0.33% 6.421us 86.08% 1.664ms 277.308us 0.000us 0.00% 18.400us 3.067us 6
+ aten::_to_copy 1.25% 24.161us 85.75% 1.657ms 276.238us 0.000us 0.00% 18.400us 3.067us 6
+ aten::copy_ 2.57% 49.759us 82.93% 1.603ms 267.166us 15.840us 41.11% 18.400us 3.067us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 21.93% 8.448us 2.816us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 19.19% 7.392us 2.464us 3
+ Activity Buffer Request 71.07% 1.374ms 71.07% 1.374ms 1.374ms 2.560us 6.64% 2.560us 2.560us 1
+ aten::empty_strided 1.57% 30.271us 1.57% 30.271us 5.045us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.43% 201.525us 10.43% 201.525us 22.392us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.86% 16.701us 1.14% 22.001us 2.445us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 8.751us 0.45% 8.751us 0.583us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.48% 9.290us 0.48% 9.290us 3.097us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.47% 9.060us 0.47% 9.060us 3.020us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.28% 5.459us 0.35% 6.690us 2.230us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.018ms
-Self CUDA time total: 38.302us
+Self CPU time total: 1.933ms
+Self CUDA time total: 38.528us
@@ -4206,29 +4206,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 363.388us 882.35% 363.388us 363.388us 1
- torch_eager 8.20% 165.958us 99.73% 2.020ms 2.020ms 0.000us 0.00% 43.808us 43.808us 1
- aten::conv1d 0.32% 6.510us 6.06% 122.733us 40.911us 0.000us 0.00% 25.408us 8.469us 3
- aten::convolution 0.48% 9.730us 5.74% 116.223us 38.741us 0.000us 0.00% 25.408us 8.469us 3
- aten::_convolution 1.17% 23.611us 5.26% 106.493us 35.498us 0.000us 0.00% 25.408us 8.469us 3
- aten::_conv_depthwise2d 1.11% 22.549us 3.28% 66.422us 22.141us 25.408us 61.69% 25.408us 8.469us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.408us 61.69% 25.408us 8.469us 3
- aten::to 0.31% 6.220us 83.98% 1.701ms 283.450us 0.000us 0.00% 18.400us 3.067us 6
- aten::_to_copy 1.16% 23.591us 83.68% 1.694ms 282.413us 0.000us 0.00% 18.400us 3.067us 6
- aten::copy_ 2.51% 50.781us 81.00% 1.640ms 273.388us 15.776us 38.31% 18.400us 3.067us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.352us 20.28% 8.352us 2.784us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.424us 18.03% 7.424us 2.475us 3
- Activity Buffer Request 69.68% 1.411ms 69.68% 1.411ms 1.411ms 2.624us 6.37% 2.624us 2.624us 1
- aten::empty_strided 1.51% 30.560us 1.51% 30.560us 5.093us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.99% 202.397us 9.99% 202.397us 22.489us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.88% 17.759us 1.14% 23.000us 2.556us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.46% 9.250us 0.46% 9.250us 0.617us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.51% 10.382us 0.51% 10.382us 3.461us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.48% 9.651us 0.48% 9.651us 3.217us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.33% 6.630us 0.40% 8.160us 2.720us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.458us 810.83% 334.458us 334.458us 1
+ torch_eager 6.32% 125.394us 99.75% 1.978ms 1.978ms 0.000us 0.00% 43.841us 43.841us 1
+ aten::conv1d 0.30% 5.899us 5.88% 116.562us 38.854us 0.000us 0.00% 25.600us 8.533us 3
+ aten::convolution 0.49% 9.810us 5.58% 110.663us 36.888us 0.000us 0.00% 25.600us 8.533us 3
+ aten::_convolution 1.13% 22.411us 5.09% 100.853us 33.618us 0.000us 0.00% 25.600us 8.533us 3
+ aten::_conv_depthwise2d 1.14% 22.520us 3.20% 63.392us 21.131us 25.600us 62.06% 25.600us 8.533us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.600us 62.06% 25.600us 8.533us 3
+ aten::to 0.30% 5.959us 86.14% 1.708ms 284.675us 0.000us 0.00% 18.241us 3.040us 6
+ aten::_to_copy 1.33% 26.372us 85.84% 1.702ms 283.682us 0.000us 0.00% 18.241us 3.040us 6
+ aten::copy_ 2.49% 49.420us 83.02% 1.646ms 274.363us 15.649us 37.94% 18.241us 3.040us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.321us 20.17% 8.321us 2.774us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 17.77% 7.328us 2.443us 3
+ Activity Buffer Request 71.51% 1.418ms 71.51% 1.418ms 1.418ms 2.592us 6.28% 2.592us 2.592us 1
+ aten::empty_strided 1.49% 29.540us 1.49% 29.540us 4.923us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.06% 199.427us 10.06% 199.427us 22.159us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.92% 18.199us 1.18% 23.330us 2.592us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.44% 8.651us 0.44% 8.651us 0.577us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.54% 10.640us 0.54% 10.640us 3.547us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.48% 9.610us 0.48% 9.610us 3.203us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.28% 5.590us 0.34% 6.770us 2.257us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.025ms
-Self CUDA time total: 41.184us
+Self CPU time total: 1.983ms
+Self CUDA time total: 41.249us
@@ -4238,29 +4238,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 352.830us 343.38% 352.830us 352.830us 1
- torch_eager 7.15% 144.983us 99.76% 2.023ms 2.023ms 0.000us 0.00% 108.768us 108.768us 1
- aten::conv1d 0.29% 5.781us 5.92% 120.074us 40.025us 0.000us 0.00% 70.432us 23.477us 3
- aten::convolution 0.47% 9.599us 5.64% 114.293us 38.098us 0.000us 0.00% 70.432us 23.477us 3
- aten::_convolution 1.14% 23.149us 5.16% 104.694us 34.898us 0.000us 0.00% 70.432us 23.477us 3
- aten::_conv_depthwise2d 1.16% 23.581us 3.22% 65.212us 21.737us 70.432us 68.55% 70.432us 23.477us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.432us 68.55% 70.432us 23.477us 3
- aten::to 0.30% 6.111us 85.26% 1.729ms 288.085us 0.000us 0.00% 38.336us 6.389us 6
- aten::_to_copy 1.62% 32.820us 84.95% 1.722ms 287.067us 0.000us 0.00% 38.336us 6.389us 6
- aten::copy_ 2.46% 49.781us 81.90% 1.660ms 276.745us 32.320us 31.45% 38.336us 6.389us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.696us 17.22% 17.696us 5.899us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.624us 14.23% 14.624us 4.875us 3
- Activity Buffer Request 70.70% 1.433ms 70.70% 1.433ms 1.433ms 6.016us 5.85% 6.016us 6.016us 1
- aten::empty_strided 1.44% 29.111us 1.44% 29.111us 4.852us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.89% 200.449us 9.89% 200.449us 22.272us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.88% 17.943us 1.16% 23.512us 2.612us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.46% 9.330us 0.46% 9.330us 0.622us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.47% 9.471us 0.47% 9.471us 3.157us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.45% 9.050us 0.45% 9.050us 3.017us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.32% 6.391us 0.39% 7.911us 2.637us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.849us 326.92% 338.849us 338.849us 1
+ torch_eager 5.95% 117.585us 99.74% 1.970ms 1.970ms 0.000us 0.00% 109.697us 109.697us 1
+ aten::conv1d 0.30% 5.970us 6.05% 119.502us 39.834us 0.000us 0.00% 71.232us 23.744us 3
+ aten::convolution 0.49% 9.700us 5.75% 113.532us 37.844us 0.000us 0.00% 71.232us 23.744us 3
+ aten::_convolution 1.15% 22.781us 5.26% 103.832us 34.611us 0.000us 0.00% 71.232us 23.744us 3
+ aten::_conv_depthwise2d 1.18% 23.259us 3.31% 65.420us 21.807us 71.232us 68.72% 71.232us 23.744us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 71.232us 68.72% 71.232us 23.744us 3
+ aten::to 0.31% 6.199us 86.38% 1.706ms 284.313us 0.000us 0.00% 38.465us 6.411us 6
+ aten::_to_copy 1.31% 25.891us 86.06% 1.700ms 283.280us 0.000us 0.00% 38.465us 6.411us 6
+ aten::copy_ 2.57% 50.812us 83.17% 1.643ms 273.758us 32.417us 31.28% 38.465us 6.411us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.760us 17.13% 17.760us 5.920us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.657us 14.14% 14.657us 4.886us 3
+ Activity Buffer Request 71.61% 1.414ms 71.61% 1.414ms 1.414ms 6.048us 5.84% 6.048us 6.048us 1
+ aten::empty_strided 1.58% 31.240us 1.58% 31.240us 5.207us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.13% 200.155us 10.13% 200.155us 22.239us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.87% 17.181us 1.15% 22.621us 2.513us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 8.941us 0.45% 8.941us 0.596us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.51% 10.050us 0.51% 10.050us 3.350us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.47% 9.370us 0.47% 9.370us 3.123us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.28% 5.551us 0.35% 6.851us 2.284us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.027ms
-Self CUDA time total: 102.752us
+Self CPU time total: 1.975ms
+Self CUDA time total: 103.649us
@@ -4270,29 +4270,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.363us 292.97% 330.363us 330.363us 1
- torch_eager 16.10% 118.634us 99.31% 731.955us 731.955us 0.000us 0.00% 118.781us 118.781us 1
- aten::conv1d 0.80% 5.881us 15.92% 117.344us 39.115us 0.000us 0.00% 80.541us 26.847us 3
- aten::convolution 1.32% 9.760us 15.12% 111.463us 37.154us 0.000us 0.00% 80.541us 26.847us 3
- aten::_convolution 3.06% 22.540us 13.80% 101.703us 33.901us 0.000us 0.00% 80.541us 26.847us 3
- aten::_conv_depthwise2d 2.83% 20.841us 8.49% 62.593us 20.864us 80.541us 71.42% 80.541us 26.847us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.541us 71.42% 80.541us 26.847us 3
- aten::to 0.79% 5.790us 63.53% 468.255us 78.043us 0.000us 0.00% 38.240us 6.373us 6
- aten::_to_copy 3.21% 23.660us 62.75% 462.465us 77.078us 0.000us 0.00% 38.240us 6.373us 6
- aten::copy_ 6.76% 49.831us 55.55% 409.415us 68.236us 32.224us 28.58% 38.240us 6.373us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.728us 15.72% 17.728us 5.909us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.496us 12.86% 14.496us 4.832us 3
- Activity Buffer Request 25.24% 185.996us 25.24% 185.996us 185.996us 6.016us 5.33% 6.016us 6.016us 1
- aten::empty_strided 3.99% 29.390us 3.99% 29.390us 4.898us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 26.60% 196.028us 26.60% 196.028us 21.781us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.44% 17.960us 3.11% 22.951us 2.550us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.15% 8.461us 1.15% 8.461us 0.564us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.35% 9.931us 1.35% 9.931us 3.310us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.27% 9.381us 1.27% 9.381us 3.127us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.87% 6.430us 1.06% 7.840us 2.613us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 357.597us 314.53% 357.597us 357.597us 1
+ torch_eager 6.01% 120.196us 99.73% 1.995ms 1.995ms 0.000us 0.00% 119.645us 119.645us 1
+ aten::conv1d 0.28% 5.578us 6.85% 137.112us 45.704us 0.000us 0.00% 81.344us 27.115us 3
+ aten::convolution 0.47% 9.452us 6.58% 131.534us 43.845us 0.000us 0.00% 81.344us 27.115us 3
+ aten::_convolution 1.16% 23.298us 6.10% 122.082us 40.694us 0.000us 0.00% 81.344us 27.115us 3
+ aten::_conv_depthwise2d 1.16% 23.221us 4.15% 82.932us 27.644us 81.344us 71.55% 81.344us 27.115us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 81.344us 71.55% 81.344us 27.115us 3
+ aten::to 0.33% 6.509us 85.46% 1.710ms 284.935us 0.000us 0.00% 38.301us 6.383us 6
+ aten::_to_copy 1.29% 25.870us 85.14% 1.703ms 283.850us 0.000us 0.00% 38.301us 6.383us 6
+ aten::copy_ 2.58% 51.531us 82.27% 1.646ms 274.308us 32.350us 28.45% 38.301us 6.383us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.727us 15.59% 17.727us 5.909us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.623us 12.86% 14.623us 4.874us 3
+ Activity Buffer Request 70.95% 1.419ms 70.95% 1.419ms 1.419ms 5.951us 5.23% 5.951us 5.951us 1
+ aten::empty_strided 1.57% 31.380us 1.57% 31.380us 5.230us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.95% 199.044us 9.95% 199.044us 22.116us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.89% 17.740us 1.16% 23.191us 2.577us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.47% 9.433us 0.47% 9.433us 0.629us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.53% 10.531us 0.53% 10.531us 3.510us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.26% 25.130us 1.26% 25.130us 8.377us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.30% 6.010us 0.38% 7.612us 2.537us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 737.005us
-Self CUDA time total: 112.765us
+Self CPU time total: 2.000ms
+Self CUDA time total: 113.694us
@@ -4302,29 +4302,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 22.21% 170.695us 99.32% 763.366us 763.366us 0.000us 0.00% 430.770us 430.770us 1
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 416.723us 106.46% 416.723us 416.723us 1
- aten::conv1d 0.77% 5.951us 14.86% 114.225us 38.075us 0.000us 0.00% 251.288us 83.763us 3
- aten::convolution 1.24% 9.541us 14.09% 108.274us 36.091us 0.000us 0.00% 251.288us 83.763us 3
- aten::_convolution 2.83% 21.719us 12.85% 98.733us 32.911us 0.000us 0.00% 251.288us 83.763us 3
- aten::_conv_depthwise2d 2.74% 21.061us 7.99% 61.422us 20.474us 251.288us 64.20% 251.288us 83.763us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 251.288us 64.20% 251.288us 83.763us 3
- aten::to 0.75% 5.750us 58.89% 452.676us 75.446us 0.000us 0.00% 179.482us 29.914us 6
- aten::_to_copy 3.02% 23.182us 58.15% 446.926us 74.488us 0.000us 0.00% 179.482us 29.914us 6
- aten::copy_ 6.40% 49.211us 51.58% 396.473us 66.079us 140.155us 35.80% 179.482us 29.914us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 100.254us 25.61% 100.254us 33.418us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.901us 10.19% 39.901us 13.300us 3
- Activity Buffer Request 22.72% 174.636us 22.72% 174.636us 174.636us 39.327us 10.05% 39.327us 39.327us 1
- aten::empty_strided 3.55% 27.271us 3.55% 27.271us 4.545us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 25.36% 194.936us 25.36% 194.936us 21.660us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.13% 16.381us 2.81% 21.611us 2.401us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.16% 8.880us 1.16% 8.880us 0.592us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.18% 9.091us 1.18% 9.091us 3.030us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.17% 8.960us 1.17% 8.960us 2.987us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.75% 5.770us 0.94% 7.191us 2.397us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 5.97% 120.782us 97.66% 1.975ms 1.975ms 0.000us 0.00% 434.301us 434.301us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 421.021us 106.85% 421.021us 421.021us 1
+ aten::conv1d 0.30% 6.069us 5.79% 117.202us 39.067us 0.000us 0.00% 251.007us 83.669us 3
+ aten::convolution 0.47% 9.471us 5.49% 111.133us 37.044us 0.000us 0.00% 251.007us 83.669us 3
+ aten::_convolution 1.10% 22.180us 5.03% 101.662us 33.887us 0.000us 0.00% 251.007us 83.669us 3
+ aten::_conv_depthwise2d 1.13% 22.779us 3.17% 64.182us 21.394us 251.007us 63.71% 251.007us 83.669us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 251.007us 63.71% 251.007us 83.669us 3
+ aten::to 0.31% 6.200us 84.52% 1.710ms 284.917us 0.000us 0.00% 183.294us 30.549us 6
+ aten::_to_copy 1.19% 24.072us 84.22% 1.703ms 283.884us 0.000us 0.00% 183.294us 30.549us 6
+ aten::copy_ 2.45% 49.593us 81.56% 1.650ms 274.942us 143.007us 36.29% 183.294us 30.549us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 102.495us 26.01% 102.495us 34.165us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.512us 10.28% 40.512us 13.504us 3
+ Activity Buffer Request 70.36% 1.423ms 70.36% 1.423ms 1.423ms 40.287us 10.22% 40.287us 40.287us 1
+ aten::empty_strided 1.46% 29.579us 1.46% 29.579us 4.930us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.86% 199.474us 9.86% 199.474us 22.164us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.84% 17.021us 1.11% 22.432us 2.492us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 9.090us 0.45% 9.090us 0.606us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.48% 9.720us 0.48% 9.720us 3.240us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.45% 9.202us 0.45% 9.202us 3.067us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.28% 5.680us 0.35% 7.060us 2.353us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 768.616us
-Self CUDA time total: 391.443us
+Self CPU time total: 2.023ms
+Self CUDA time total: 394.014us
@@ -4334,29 +4334,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 13.26% 117.114us 87.73% 774.557us 774.557us 0.000us 0.00% 486.014us 486.014us 1
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 473.342us 105.98% 473.342us 473.342us 1
- aten::conv1d 0.63% 5.520us 13.02% 114.943us 38.314us 0.000us 0.00% 298.622us 99.541us 3
- aten::convolution 1.08% 9.570us 12.39% 109.423us 36.474us 0.000us 0.00% 298.622us 99.541us 3
- aten::_convolution 2.49% 22.001us 11.31% 99.853us 33.284us 0.000us 0.00% 298.622us 99.541us 3
- aten::_conv_depthwise2d 2.40% 21.190us 7.05% 62.252us 20.751us 298.622us 66.86% 298.622us 99.541us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 298.622us 66.86% 298.622us 99.541us 3
- aten::to 0.65% 5.781us 58.29% 514.667us 85.778us 0.000us 0.00% 187.392us 31.232us 6
- aten::_to_copy 2.57% 22.699us 57.64% 508.886us 84.814us 0.000us 0.00% 187.392us 31.232us 6
- aten::copy_ 5.62% 49.650us 51.80% 457.366us 76.228us 148.032us 33.14% 187.392us 31.232us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 108.256us 24.24% 108.256us 36.085us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.776us 8.91% 39.776us 13.259us 3
- Activity Buffer Request 26.78% 236.449us 26.78% 236.449us 236.449us 39.360us 8.81% 39.360us 39.360us 1
- aten::empty_strided 3.26% 28.821us 3.26% 28.821us 4.804us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 22.01% 194.327us 22.01% 194.327us 21.592us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.00% 17.701us 2.60% 22.912us 2.546us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.01% 8.901us 1.01% 8.901us 0.593us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.05% 9.311us 1.05% 9.311us 3.104us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.98% 8.691us 0.98% 8.691us 2.897us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.65% 5.750us 0.82% 7.230us 2.410us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 5.89% 122.072us 95.29% 1.975ms 1.975ms 0.000us 0.00% 486.458us 486.458us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 474.010us 106.16% 474.010us 474.010us 1
+ aten::conv1d 0.28% 5.830us 5.59% 115.853us 38.618us 0.000us 0.00% 299.291us 99.764us 3
+ aten::convolution 0.46% 9.610us 5.31% 110.023us 36.674us 0.000us 0.00% 299.291us 99.764us 3
+ aten::_convolution 1.08% 22.439us 4.85% 100.413us 33.471us 0.000us 0.00% 299.291us 99.764us 3
+ aten::_conv_depthwise2d 1.04% 21.490us 3.04% 62.983us 20.994us 299.291us 67.03% 299.291us 99.764us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 299.291us 67.03% 299.291us 99.764us 3
+ aten::to 0.31% 6.341us 82.51% 1.710ms 284.962us 0.000us 0.00% 187.167us 31.195us 6
+ aten::_to_copy 1.23% 25.592us 82.20% 1.703ms 283.906us 0.000us 0.00% 187.167us 31.195us 6
+ aten::copy_ 2.39% 49.481us 79.48% 1.647ms 274.512us 147.199us 32.97% 187.167us 31.195us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 106.911us 23.94% 106.911us 35.637us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.288us 9.02% 40.288us 13.429us 3
+ Activity Buffer Request 68.62% 1.422ms 68.62% 1.422ms 1.422ms 39.968us 8.95% 39.968us 39.968us 1
+ aten::empty_strided 1.48% 30.770us 1.48% 30.770us 5.128us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.53% 197.485us 9.53% 197.485us 21.943us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.81% 16.791us 1.08% 22.301us 2.478us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.44% 9.141us 0.44% 9.141us 0.609us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.47% 9.701us 0.47% 9.701us 3.234us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.48% 9.941us 0.48% 9.941us 3.314us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.27% 5.510us 0.33% 6.790us 2.263us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 882.890us
-Self CUDA time total: 446.654us
+Self CPU time total: 2.072ms
+Self CUDA time total: 446.490us
@@ -4366,29 +4366,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 324.122us 1734.57% 324.122us 324.122us 1
- torch_eager 15.60% 121.627us 99.38% 775.067us 775.067us 0.000us 0.00% 20.574us 20.574us 1
- aten::to 0.72% 5.589us 65.70% 512.356us 85.393us 0.000us 0.00% 13.343us 2.224us 6
- aten::_to_copy 2.88% 22.431us 64.98% 506.767us 84.461us 0.000us 0.00% 13.343us 2.224us 6
- aten::copy_ 6.46% 50.411us 58.51% 456.326us 76.054us 11.455us 61.30% 13.343us 2.224us 6
- aten::conv1d 0.72% 5.580us 14.59% 113.823us 37.941us 0.000us 0.00% 7.231us 2.410us 3
- aten::convolution 1.19% 9.260us 13.88% 108.243us 36.081us 0.000us 0.00% 7.231us 2.410us 3
- aten::_convolution 2.87% 22.359us 12.69% 98.983us 32.994us 0.000us 0.00% 7.231us 2.410us 3
- aten::_conv_depthwise2d 2.67% 20.840us 7.84% 61.153us 20.384us 7.231us 38.70% 7.231us 2.410us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.231us 38.70% 7.231us 2.410us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.856us 31.34% 5.856us 1.952us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.599us 29.96% 5.599us 1.866us 3
- Activity Buffer Request 30.21% 235.608us 30.21% 235.608us 235.608us 1.888us 10.10% 1.888us 1.888us 1
- aten::empty_strided 3.59% 28.010us 3.59% 28.010us 4.668us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 24.63% 192.088us 24.63% 192.088us 21.343us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.29% 17.871us 2.95% 23.001us 2.556us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.13% 8.820us 1.13% 8.820us 0.588us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.21% 9.401us 1.21% 9.401us 3.134us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.17% 9.131us 1.17% 9.131us 3.044us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.75% 5.851us 0.94% 7.321us 2.440us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 358.523us 1924.96% 358.523us 358.523us 1
+ torch_eager 17.94% 139.773us 99.33% 774.049us 774.049us 0.000us 0.00% 20.513us 20.513us 1
+ aten::to 0.94% 7.351us 62.88% 489.983us 81.664us 0.000us 0.00% 13.376us 2.229us 6
+ aten::_to_copy 3.20% 24.930us 61.93% 482.632us 80.439us 0.000us 0.00% 13.376us 2.229us 6
+ aten::copy_ 6.90% 53.742us 54.52% 424.881us 70.813us 11.488us 61.68% 13.376us 2.229us 6
+ aten::conv1d 0.75% 5.841us 15.01% 116.973us 38.991us 0.000us 0.00% 7.137us 2.379us 3
+ aten::convolution 1.33% 10.360us 14.26% 111.132us 37.044us 0.000us 0.00% 7.137us 2.379us 3
+ aten::_convolution 3.01% 23.430us 12.93% 100.772us 33.591us 0.000us 0.00% 7.137us 2.379us 3
+ aten::_conv_depthwise2d 2.81% 21.882us 7.98% 62.192us 20.731us 7.137us 38.32% 7.137us 2.379us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.137us 38.32% 7.137us 2.379us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 31.61% 5.888us 1.963us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.600us 30.07% 5.600us 1.867us 3
+ Activity Buffer Request 24.98% 194.695us 24.98% 194.695us 194.695us 1.888us 10.14% 1.888us 1.888us 1
+ aten::empty_strided 4.21% 32.821us 4.21% 32.821us 5.470us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.28% 197.004us 25.28% 197.004us 21.889us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.16% 16.850us 2.84% 22.160us 2.462us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.13% 8.821us 1.13% 8.821us 0.588us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.22% 9.521us 1.22% 9.521us 3.174us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.31% 10.229us 1.31% 10.229us 3.410us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.74% 5.740us 0.90% 7.020us 2.340us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 779.877us
-Self CUDA time total: 18.686us
+Self CPU time total: 779.258us
+Self CUDA time total: 18.625us
@@ -4398,29 +4398,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 316.346us 1628.63% 316.346us 316.346us 1
- torch_eager 14.51% 117.604us 99.38% 805.188us 805.188us 0.000us 0.00% 21.312us 21.312us 1
- aten::to 0.69% 5.621us 67.40% 546.068us 91.011us 0.000us 0.00% 13.376us 2.229us 6
- aten::_to_copy 2.81% 22.789us 66.70% 540.447us 90.075us 0.000us 0.00% 13.376us 2.229us 6
- aten::copy_ 5.89% 47.733us 60.20% 487.757us 81.293us 11.488us 59.14% 13.376us 2.229us 6
- aten::conv1d 0.69% 5.581us 14.11% 114.294us 38.098us 0.000us 0.00% 7.936us 2.645us 3
- aten::convolution 1.17% 9.520us 13.42% 108.713us 36.238us 0.000us 0.00% 7.936us 2.645us 3
- aten::_convolution 2.68% 21.682us 12.24% 99.193us 33.064us 0.000us 0.00% 7.936us 2.645us 3
- aten::_conv_depthwise2d 2.64% 21.391us 7.61% 61.682us 20.561us 7.936us 40.86% 7.936us 2.645us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 40.86% 7.936us 2.645us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.856us 30.15% 5.856us 1.952us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.632us 29.00% 5.632us 1.877us 3
- Activity Buffer Request 33.53% 271.649us 33.53% 271.649us 271.649us 1.888us 9.72% 1.888us 1.888us 1
- aten::empty_strided 3.69% 29.901us 3.69% 29.901us 4.984us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 23.39% 189.555us 23.39% 189.555us 21.062us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.18% 17.698us 2.81% 22.771us 2.530us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.07% 8.674us 1.07% 8.674us 0.578us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.14% 9.260us 1.14% 9.260us 3.087us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.22% 9.851us 1.22% 9.851us 3.284us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.76% 6.120us 0.93% 7.530us 2.510us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.763us 1698.07% 328.763us 328.763us 1
+ torch_eager 14.65% 115.015us 99.34% 779.670us 779.670us 0.000us 0.00% 21.248us 21.248us 1
+ aten::to 0.80% 6.290us 66.21% 519.631us 86.605us 0.000us 0.00% 13.406us 2.234us 6
+ aten::_to_copy 3.14% 24.649us 65.41% 513.341us 85.557us 0.000us 0.00% 13.406us 2.234us 6
+ aten::copy_ 6.80% 53.351us 58.20% 456.761us 76.127us 11.519us 59.50% 13.406us 2.234us 6
+ aten::conv1d 0.75% 5.880us 15.10% 118.484us 39.495us 0.000us 0.00% 7.842us 2.614us 3
+ aten::convolution 1.21% 9.513us 14.35% 112.604us 37.535us 0.000us 0.00% 7.842us 2.614us 3
+ aten::_convolution 2.83% 22.229us 13.14% 103.091us 34.364us 0.000us 0.00% 7.842us 2.614us 3
+ aten::_conv_depthwise2d 3.15% 24.720us 8.43% 66.141us 22.047us 7.842us 40.50% 7.842us 2.614us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.842us 40.50% 7.842us 2.614us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.887us 30.41% 5.887us 1.962us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.632us 29.09% 5.632us 1.877us 3
+ Activity Buffer Request 29.55% 231.946us 29.55% 231.946us 231.946us 1.887us 9.75% 1.887us 1.887us 1
+ aten::empty_strided 4.07% 31.931us 4.07% 31.931us 5.322us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 24.68% 193.684us 24.68% 193.684us 21.520us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.11% 16.541us 2.75% 21.581us 2.398us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.09% 8.568us 1.09% 8.568us 0.571us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.27% 9.951us 1.27% 9.951us 3.317us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.18% 9.250us 1.18% 9.250us 3.083us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.72% 5.642us 0.89% 6.980us 2.327us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 810.248us
-Self CUDA time total: 19.424us
+Self CPU time total: 784.850us
+Self CUDA time total: 19.361us
@@ -4430,29 +4430,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 322.590us 1658.13% 322.590us 322.590us 1
- torch_eager 6.77% 135.447us 99.76% 1.996ms 1.996ms 0.000us 0.00% 21.631us 21.631us 1
- aten::to 0.29% 5.801us 85.87% 1.718ms 286.282us 0.000us 0.00% 14.400us 2.400us 6
- aten::_to_copy 1.16% 23.150us 85.58% 1.712ms 285.315us 0.000us 0.00% 14.400us 2.400us 6
- aten::copy_ 2.46% 49.110us 82.93% 1.659ms 276.491us 12.224us 62.83% 14.400us 2.400us 6
- aten::conv1d 0.28% 5.690us 5.75% 114.953us 38.318us 0.000us 0.00% 7.231us 2.410us 3
- aten::convolution 0.48% 9.520us 5.46% 109.263us 36.421us 0.000us 0.00% 7.231us 2.410us 3
- aten::_convolution 1.10% 21.931us 4.99% 99.743us 33.248us 0.000us 0.00% 7.231us 2.410us 3
- aten::_conv_depthwise2d 1.06% 21.231us 3.12% 62.372us 20.791us 7.231us 37.17% 7.231us 2.410us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.231us 37.17% 7.231us 2.410us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.40% 6.304us 2.101us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 30.43% 5.920us 1.973us 3
- Activity Buffer Request 71.98% 1.440ms 71.98% 1.440ms 1.440ms 2.176us 11.18% 2.176us 2.176us 1
- aten::empty_strided 1.49% 29.791us 1.49% 29.791us 4.965us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.66% 193.277us 9.66% 193.277us 21.475us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.86% 17.278us 1.13% 22.539us 2.504us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.45% 9.001us 0.45% 9.001us 0.600us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.46% 9.281us 0.46% 9.281us 3.094us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.43% 8.570us 0.43% 8.570us 2.857us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.29% 5.760us 0.36% 7.200us 2.400us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.454us 1698.73% 330.454us 330.454us 1
+ torch_eager 14.50% 115.185us 99.38% 789.290us 789.290us 0.000us 0.00% 21.628us 21.628us 1
+ aten::to 0.75% 5.979us 66.62% 529.132us 88.189us 0.000us 0.00% 14.332us 2.389us 6
+ aten::_to_copy 3.11% 24.732us 65.87% 523.153us 87.192us 0.000us 0.00% 14.332us 2.389us 6
+ aten::copy_ 6.75% 53.590us 58.69% 466.101us 77.684us 12.157us 62.49% 14.332us 2.389us 6
+ aten::conv1d 0.72% 5.740us 14.75% 117.122us 39.041us 0.000us 0.00% 7.296us 2.432us 3
+ aten::convolution 1.18% 9.359us 14.02% 111.382us 37.127us 0.000us 0.00% 7.296us 2.432us 3
+ aten::_convolution 2.82% 22.362us 12.85% 102.023us 34.008us 0.000us 0.00% 7.296us 2.432us 3
+ aten::_conv_depthwise2d 2.86% 22.741us 8.10% 64.351us 21.450us 7.296us 37.51% 7.296us 2.432us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.296us 37.51% 7.296us 2.432us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.238us 32.07% 6.238us 2.079us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 30.43% 5.919us 1.973us 3
+ Activity Buffer Request 30.19% 239.746us 30.19% 239.746us 239.746us 2.175us 11.18% 2.175us 2.175us 1
+ aten::empty_strided 4.07% 32.320us 4.07% 32.320us 5.387us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 24.58% 195.235us 24.58% 195.235us 21.693us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.10% 16.713us 2.76% 21.891us 2.432us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.12% 8.919us 1.12% 8.919us 0.595us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.20% 9.570us 1.20% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.20% 9.570us 1.20% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.72% 5.709us 0.89% 7.030us 2.343us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.000ms
-Self CUDA time total: 19.455us
+Self CPU time total: 794.200us
+Self CUDA time total: 19.453us
@@ -4462,29 +4462,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 326.589us 1627.82% 326.589us 326.589us 1
- torch_eager 7.03% 140.275us 99.72% 1.991ms 1.991ms 0.000us 0.00% 22.207us 22.207us 1
- aten::to 0.30% 6.010us 85.45% 1.706ms 284.341us 0.000us 0.00% 14.304us 2.384us 6
- aten::_to_copy 1.18% 23.623us 85.15% 1.700ms 283.340us 0.000us 0.00% 14.304us 2.384us 6
- aten::copy_ 2.42% 48.261us 82.53% 1.648ms 274.613us 12.160us 60.61% 14.304us 2.384us 6
- aten::conv1d 0.34% 6.690us 5.89% 117.664us 39.221us 0.000us 0.00% 7.903us 2.634us 3
- aten::convolution 0.46% 9.260us 5.56% 110.974us 36.991us 0.000us 0.00% 7.903us 2.634us 3
- aten::_convolution 1.15% 23.009us 5.09% 101.714us 33.905us 0.000us 0.00% 7.903us 2.634us 3
- aten::_conv_depthwise2d 1.10% 21.970us 3.15% 62.812us 20.937us 7.903us 39.39% 7.903us 2.634us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.903us 39.39% 7.903us 2.634us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.240us 31.10% 6.240us 2.080us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 29.51% 5.920us 1.973us 3
- Activity Buffer Request 71.49% 1.427ms 71.49% 1.427ms 1.427ms 2.144us 10.69% 2.144us 2.144us 1
- aten::empty_strided 1.44% 28.740us 1.44% 28.740us 4.790us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.68% 193.308us 9.68% 193.308us 21.479us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.85% 16.982us 1.11% 22.224us 2.469us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.45% 8.892us 0.45% 8.892us 0.593us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.47% 9.420us 0.47% 9.420us 3.140us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.51% 10.100us 0.51% 10.100us 3.367us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.31% 6.130us 0.38% 7.650us 2.550us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 325.021us 1622.51% 325.021us 325.021us 1
+ torch_eager 14.95% 114.725us 99.33% 762.279us 762.279us 0.000us 0.00% 22.176us 22.176us 1
+ aten::to 0.78% 5.949us 65.87% 505.530us 84.255us 0.000us 0.00% 14.272us 2.379us 6
+ aten::_to_copy 3.19% 24.509us 65.10% 499.581us 83.264us 0.000us 0.00% 14.272us 2.379us 6
+ aten::copy_ 6.59% 50.599us 57.97% 444.890us 74.148us 12.128us 60.54% 14.272us 2.379us 6
+ aten::conv1d 0.79% 6.100us 15.11% 115.973us 38.658us 0.000us 0.00% 7.904us 2.635us 3
+ aten::convolution 1.34% 10.290us 14.32% 109.873us 36.624us 0.000us 0.00% 7.904us 2.635us 3
+ aten::_convolution 2.97% 22.812us 12.98% 99.583us 33.194us 0.000us 0.00% 7.904us 2.635us 3
+ aten::_conv_depthwise2d 2.93% 22.501us 8.10% 62.182us 20.727us 7.904us 39.46% 7.904us 2.635us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 39.46% 7.904us 2.635us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 30.99% 6.208us 2.069us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 29.55% 5.920us 1.973us 3
+ Activity Buffer Request 28.71% 220.306us 28.71% 220.306us 220.306us 2.144us 10.70% 2.144us 2.144us 1
+ aten::empty_strided 3.93% 30.182us 3.93% 30.182us 5.030us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.32% 194.286us 25.32% 194.286us 21.587us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.11% 16.159us 2.76% 21.209us 2.357us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.09% 8.360us 1.09% 8.360us 0.557us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.23% 9.450us 1.23% 9.450us 3.150us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.29% 9.930us 1.29% 9.930us 3.310us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.71% 5.470us 0.87% 6.670us 2.223us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.997ms
-Self CUDA time total: 20.063us
+Self CPU time total: 767.429us
+Self CUDA time total: 20.032us
@@ -4494,29 +4494,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 319.164us 887.36% 319.164us 319.164us 1
- torch_eager 15.26% 115.785us 99.38% 754.046us 754.046us 0.000us 0.00% 38.560us 38.560us 1
- aten::conv1d 0.72% 5.471us 14.90% 113.045us 37.682us 0.000us 0.00% 20.097us 6.699us 3
- aten::convolution 1.25% 9.510us 14.18% 107.574us 35.858us 0.000us 0.00% 20.097us 6.699us 3
- aten::_convolution 2.85% 21.590us 12.92% 98.064us 32.688us 0.000us 0.00% 20.097us 6.699us 3
- aten::_conv_depthwise2d 2.82% 21.412us 8.06% 61.133us 20.378us 20.097us 55.87% 20.097us 6.699us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.097us 55.87% 20.097us 6.699us 3
- aten::to 0.74% 5.628us 65.55% 497.346us 82.891us 0.000us 0.00% 18.463us 3.077us 6
- aten::_to_copy 3.02% 22.942us 64.80% 491.718us 81.953us 0.000us 0.00% 18.463us 3.077us 6
- aten::copy_ 6.50% 49.290us 57.91% 439.376us 73.229us 15.871us 44.13% 18.463us 3.077us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.447us 23.48% 8.447us 2.816us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.424us 20.64% 7.424us 2.475us 3
- Activity Buffer Request 28.99% 219.958us 28.99% 219.958us 219.958us 2.592us 7.21% 2.592us 2.592us 1
- aten::empty_strided 3.87% 29.400us 3.87% 29.400us 4.900us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 25.31% 192.058us 25.31% 192.058us 21.340us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.43% 18.410us 3.09% 23.410us 2.601us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.12% 8.490us 1.12% 8.490us 0.566us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.20% 9.081us 1.20% 9.081us 3.027us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.15% 8.710us 1.15% 8.710us 2.903us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.77% 5.871us 0.96% 7.301us 2.434us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 356.764us 983.15% 356.764us 356.764us 1
+ torch_eager 15.53% 123.844us 99.36% 792.350us 792.350us 0.000us 0.00% 38.944us 38.944us 1
+ aten::conv1d 0.79% 6.320us 15.33% 122.233us 40.744us 0.000us 0.00% 20.320us 6.773us 3
+ aten::convolution 1.24% 9.851us 14.54% 115.913us 38.638us 0.000us 0.00% 20.320us 6.773us 3
+ aten::_convolution 2.89% 23.052us 13.30% 106.062us 35.354us 0.000us 0.00% 20.320us 6.773us 3
+ aten::_conv_depthwise2d 2.97% 23.692us 8.39% 66.891us 22.297us 20.320us 56.00% 20.320us 6.773us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.320us 56.00% 20.320us 6.773us 3
+ aten::to 0.80% 6.349us 64.76% 516.391us 86.065us 0.000us 0.00% 18.624us 3.104us 6
+ aten::_to_copy 3.21% 25.572us 63.96% 510.042us 85.007us 0.000us 0.00% 18.624us 3.104us 6
+ aten::copy_ 6.54% 52.120us 56.52% 450.739us 75.123us 15.968us 44.00% 18.624us 3.104us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.607us 23.72% 8.607us 2.869us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 20.28% 7.361us 2.454us 3
+ Activity Buffer Request 27.46% 218.966us 27.46% 218.966us 218.966us 2.656us 7.32% 2.656us 2.656us 1
+ aten::empty_strided 4.23% 33.731us 4.23% 33.731us 5.622us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.38% 202.413us 25.38% 202.413us 22.490us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.20% 17.520us 2.88% 22.939us 2.549us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.21% 9.679us 1.21% 9.679us 0.645us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.40% 11.140us 1.40% 11.140us 3.713us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.17% 9.299us 1.17% 9.299us 3.100us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.75% 6.010us 0.93% 7.450us 2.483us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 758.766us
-Self CUDA time total: 35.968us
+Self CPU time total: 797.430us
+Self CUDA time total: 36.288us
@@ -4526,29 +4526,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 318.166us 839.07% 318.166us 318.166us 1
- torch_eager 15.61% 115.614us 99.23% 735.056us 735.056us 0.000us 0.00% 40.512us 40.512us 1
- aten::conv1d 0.77% 5.689us 15.23% 112.833us 37.611us 0.000us 0.00% 22.206us 7.402us 3
- aten::convolution 1.28% 9.450us 14.46% 107.144us 35.715us 0.000us 0.00% 22.206us 7.402us 3
- aten::_convolution 2.90% 21.450us 13.19% 97.694us 32.565us 0.000us 0.00% 22.206us 7.402us 3
- aten::_conv_depthwise2d 2.86% 21.190us 8.15% 60.352us 20.117us 22.206us 58.56% 22.206us 7.402us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.206us 58.56% 22.206us 7.402us 3
- aten::to 0.76% 5.621us 64.62% 478.657us 79.776us 0.000us 0.00% 18.306us 3.051us 6
- aten::_to_copy 3.14% 23.241us 63.86% 473.036us 78.839us 0.000us 0.00% 18.306us 3.051us 6
- aten::copy_ 6.66% 49.364us 56.82% 420.865us 70.144us 15.713us 41.44% 18.306us 3.051us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.385us 22.11% 8.385us 2.795us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 19.33% 7.328us 2.443us 3
- Activity Buffer Request 27.11% 200.816us 27.11% 200.816us 200.816us 2.593us 6.84% 2.593us 2.593us 1
- aten::empty_strided 3.91% 28.930us 3.91% 28.930us 4.822us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 25.94% 192.117us 25.94% 192.117us 21.346us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.42% 17.932us 3.14% 23.222us 2.580us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.19% 8.781us 1.19% 8.781us 0.585us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.25% 9.270us 1.25% 9.270us 3.090us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.14% 8.460us 1.14% 8.460us 2.820us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.85% 6.280us 1.02% 7.591us 2.530us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.353us 866.25% 332.353us 332.353us 1
+ torch_eager 6.20% 124.083us 99.73% 1.997ms 1.997ms 0.000us 0.00% 40.959us 40.959us 1
+ aten::conv1d 0.30% 6.071us 5.74% 115.013us 38.338us 0.000us 0.00% 22.592us 7.531us 3
+ aten::convolution 0.48% 9.660us 5.44% 108.942us 36.314us 0.000us 0.00% 22.592us 7.531us 3
+ aten::_convolution 1.09% 21.840us 4.96% 99.282us 33.094us 0.000us 0.00% 22.592us 7.531us 3
+ aten::_conv_depthwise2d 1.15% 22.991us 3.11% 62.342us 20.781us 22.592us 58.88% 22.592us 7.531us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.592us 58.88% 22.592us 7.531us 3
+ aten::to 0.32% 6.339us 86.44% 1.731ms 288.505us 0.000us 0.00% 18.367us 3.061us 6
+ aten::_to_copy 1.25% 24.980us 86.12% 1.725ms 287.449us 0.000us 0.00% 18.367us 3.061us 6
+ aten::copy_ 2.51% 50.252us 83.36% 1.669ms 278.222us 15.775us 41.12% 18.367us 3.061us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.416us 21.94% 8.416us 2.805us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.359us 19.18% 7.359us 2.453us 3
+ Activity Buffer Request 72.13% 1.445ms 72.13% 1.445ms 1.445ms 2.592us 6.76% 2.592us 2.592us 1
+ aten::empty_strided 1.52% 30.382us 1.52% 30.382us 5.064us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.74% 194.985us 9.74% 194.985us 21.665us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.87% 17.330us 1.13% 22.630us 2.514us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 8.941us 0.45% 8.941us 0.596us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.48% 9.610us 0.48% 9.610us 3.203us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.46% 9.250us 0.46% 9.250us 3.083us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.27% 5.490us 0.34% 6.780us 2.260us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 740.726us
-Self CUDA time total: 37.919us
+Self CPU time total: 2.003ms
+Self CUDA time total: 38.367us
@@ -4558,29 +4558,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 321.366us 502.64% 321.366us 321.366us 1
- torch_eager 15.27% 113.396us 99.28% 737.126us 737.126us 0.000us 0.00% 68.031us 68.031us 1
- aten::conv1d 0.76% 5.670us 15.56% 115.503us 38.501us 0.000us 0.00% 41.567us 13.856us 3
- aten::convolution 1.28% 9.489us 14.79% 109.833us 36.611us 0.000us 0.00% 41.567us 13.856us 3
- aten::_convolution 3.08% 22.850us 13.52% 100.344us 33.448us 0.000us 0.00% 41.567us 13.856us 3
- aten::_conv_depthwise2d 2.89% 21.483us 8.27% 61.383us 20.461us 41.567us 65.01% 41.567us 13.856us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.567us 65.01% 41.567us 13.856us 3
- aten::to 0.76% 5.660us 64.85% 481.506us 80.251us 0.000us 0.00% 26.464us 4.411us 6
- aten::_to_copy 3.08% 22.842us 64.09% 475.846us 79.308us 0.000us 0.00% 26.464us 4.411us 6
- aten::copy_ 6.57% 48.752us 57.01% 423.304us 70.551us 22.368us 34.99% 26.464us 4.411us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.968us 18.72% 11.968us 3.989us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 16.27% 10.400us 3.467us 3
- Activity Buffer Request 27.27% 202.487us 27.27% 202.487us 202.487us 4.096us 6.41% 4.096us 4.096us 1
- aten::empty_strided 4.00% 29.700us 4.00% 29.700us 4.950us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 26.15% 194.125us 26.15% 194.125us 21.569us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.30% 17.061us 2.99% 22.191us 2.466us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.19% 8.800us 1.19% 8.800us 0.587us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.25% 9.280us 1.25% 9.280us 3.093us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.15% 8.560us 1.15% 8.560us 2.853us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.77% 5.741us 0.96% 7.151us 2.384us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.952us 509.17% 328.952us 328.952us 1
+ torch_eager 15.31% 114.903us 99.32% 745.599us 745.599us 0.000us 0.00% 68.701us 68.701us 1
+ aten::conv1d 0.89% 6.660us 15.50% 116.373us 38.791us 0.000us 0.00% 42.238us 14.079us 3
+ aten::convolution 1.33% 9.952us 14.61% 109.713us 36.571us 0.000us 0.00% 42.238us 14.079us 3
+ aten::_convolution 2.95% 22.149us 13.29% 99.761us 33.254us 0.000us 0.00% 42.238us 14.079us 3
+ aten::_conv_depthwise2d 2.94% 22.090us 8.38% 62.891us 20.964us 42.238us 65.38% 42.238us 14.079us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 42.238us 65.38% 42.238us 14.079us 3
+ aten::to 0.80% 6.039us 65.05% 488.341us 81.390us 0.000us 0.00% 26.463us 4.410us 6
+ aten::_to_copy 3.23% 24.281us 64.25% 482.302us 80.384us 0.000us 0.00% 26.463us 4.410us 6
+ aten::copy_ 6.57% 49.302us 56.69% 425.561us 70.927us 22.367us 34.62% 26.463us 4.410us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.936us 18.48% 11.936us 3.979us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 16.15% 10.431us 3.477us 3
+ Activity Buffer Request 26.58% 199.565us 26.58% 199.565us 199.565us 4.096us 6.34% 4.096us 4.096us 1
+ aten::empty_strided 4.32% 32.460us 4.32% 32.460us 5.410us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 26.45% 198.565us 26.45% 198.565us 22.063us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.13% 16.001us 2.81% 21.091us 2.343us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.16% 8.690us 1.16% 8.690us 0.579us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.26% 9.490us 1.26% 9.490us 3.163us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.26% 9.440us 1.26% 9.440us 3.147us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.75% 5.611us 0.93% 6.981us 2.327us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 742.446us
-Self CUDA time total: 63.935us
+Self CPU time total: 750.709us
+Self CUDA time total: 64.605us
@@ -4590,29 +4590,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 326.264us 468.36% 326.264us 326.264us 1
- torch_eager 14.61% 117.663us 99.38% 800.347us 800.347us 0.000us 0.00% 73.789us 73.789us 1
- aten::conv1d 0.75% 6.020us 14.38% 115.844us 38.615us 0.000us 0.00% 47.230us 15.743us 3
- aten::convolution 1.16% 9.351us 13.64% 109.824us 36.608us 0.000us 0.00% 47.230us 15.743us 3
- aten::_convolution 2.76% 22.250us 12.48% 100.473us 33.491us 0.000us 0.00% 47.230us 15.743us 3
- aten::_conv_depthwise2d 2.71% 21.790us 7.76% 62.461us 20.820us 47.230us 67.80% 47.230us 15.743us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.230us 67.80% 47.230us 15.743us 3
- aten::to 0.71% 5.690us 66.94% 539.059us 89.843us 0.000us 0.00% 26.559us 4.426us 6
- aten::_to_copy 2.87% 23.082us 66.23% 533.369us 88.895us 0.000us 0.00% 26.559us 4.426us 6
- aten::copy_ 6.12% 49.260us 59.73% 480.976us 80.163us 22.431us 32.20% 26.559us 4.426us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 17.23% 12.000us 4.000us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 14.97% 10.431us 3.477us 3
- Activity Buffer Request 29.99% 241.509us 29.99% 241.509us 241.509us 4.128us 5.93% 4.128us 4.128us 1
- aten::empty_strided 3.64% 29.311us 3.64% 29.311us 4.885us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 26.37% 212.348us 26.37% 212.348us 23.594us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.22% 17.841us 2.86% 23.041us 2.560us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.09% 8.761us 1.09% 8.761us 0.584us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.16% 9.320us 1.16% 9.320us 3.107us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.14% 9.210us 1.14% 9.210us 3.070us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.77% 6.201us 0.95% 7.621us 2.540us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.798us 467.68% 328.798us 328.798us 1
+ torch_eager 14.69% 115.264us 99.37% 779.669us 779.669us 0.000us 0.00% 74.432us 74.432us 1
+ aten::conv1d 0.75% 5.869us 14.89% 116.853us 38.951us 0.000us 0.00% 47.840us 15.947us 3
+ aten::convolution 1.20% 9.412us 14.15% 110.984us 36.995us 0.000us 0.00% 47.840us 15.947us 3
+ aten::_convolution 2.99% 23.451us 12.95% 101.572us 33.857us 0.000us 0.00% 47.840us 15.947us 3
+ aten::_conv_depthwise2d 2.71% 21.281us 8.10% 63.532us 21.177us 47.840us 68.05% 47.840us 15.947us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.840us 68.05% 47.840us 15.947us 3
+ aten::to 0.74% 5.828us 66.46% 521.411us 86.902us 0.000us 0.00% 26.592us 4.432us 6
+ aten::_to_copy 3.27% 25.622us 65.71% 515.583us 85.931us 0.000us 0.00% 26.592us 4.432us 6
+ aten::copy_ 6.42% 50.382us 58.46% 458.651us 76.442us 22.464us 31.95% 26.592us 4.432us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.032us 17.11% 12.032us 4.011us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 14.84% 10.432us 3.477us 3
+ Activity Buffer Request 29.93% 234.846us 29.93% 234.846us 234.846us 4.128us 5.87% 4.128us 4.128us 1
+ aten::empty_strided 3.99% 31.310us 3.99% 31.310us 5.218us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 24.83% 194.803us 24.83% 194.803us 21.645us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.07% 16.243us 2.72% 21.332us 2.370us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.07% 8.401us 1.07% 8.401us 0.560us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.35% 10.581us 1.35% 10.581us 3.527us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.31% 10.290us 1.31% 10.290us 3.430us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.69% 5.406us 0.84% 6.568us 2.189us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 805.317us
-Self CUDA time total: 69.661us
+Self CPU time total: 784.589us
+Self CUDA time total: 70.304us
@@ -4622,29 +4622,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 372.509us 200.60% 372.509us 372.509us 1
- torch_eager 16.32% 136.903us 99.36% 833.418us 833.418us 0.000us 0.00% 195.711us 195.711us 1
- aten::conv1d 0.67% 5.580us 15.45% 129.615us 43.205us 0.000us 0.00% 133.247us 44.416us 3
- aten::convolution 1.13% 9.510us 14.79% 124.035us 41.345us 0.000us 0.00% 133.247us 44.416us 3
- aten::_convolution 3.89% 32.633us 13.65% 114.525us 38.175us 0.000us 0.00% 133.247us 44.416us 3
- aten::_conv_depthwise2d 2.50% 20.960us 7.87% 66.022us 22.007us 133.247us 71.76% 133.247us 44.416us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.247us 71.76% 133.247us 44.416us 3
- aten::to 0.72% 6.039us 64.27% 539.099us 89.850us 0.000us 0.00% 62.464us 10.411us 6
- aten::_to_copy 2.75% 23.094us 63.55% 533.060us 88.843us 0.000us 0.00% 62.464us 10.411us 6
- aten::copy_ 5.97% 50.071us 57.15% 479.385us 79.897us 52.448us 28.24% 62.464us 10.411us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.504us 15.89% 29.504us 9.835us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.944us 12.36% 22.944us 7.648us 3
- Activity Buffer Request 30.64% 256.969us 30.64% 256.969us 256.969us 10.016us 5.39% 10.016us 10.016us 1
- aten::empty_strided 3.65% 30.581us 3.65% 30.581us 5.097us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 23.59% 197.827us 23.59% 197.827us 21.981us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.16% 18.130us 2.81% 23.610us 2.623us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.09% 9.169us 1.09% 9.169us 0.611us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.19% 9.940us 1.19% 9.940us 3.313us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.15% 9.640us 1.15% 9.640us 3.213us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.72% 6.001us 0.89% 7.490us 2.497us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.882us 182.91% 341.882us 341.882us 1
+ torch_eager 15.14% 117.185us 99.33% 768.879us 768.879us 0.000us 0.00% 197.117us 197.117us 1
+ aten::conv1d 0.79% 6.110us 14.86% 114.993us 38.331us 0.000us 0.00% 134.270us 44.757us 3
+ aten::convolution 1.22% 9.451us 14.07% 108.883us 36.294us 0.000us 0.00% 134.270us 44.757us 3
+ aten::_convolution 2.87% 22.240us 12.85% 99.432us 33.144us 0.000us 0.00% 134.270us 44.757us 3
+ aten::_conv_depthwise2d 2.84% 21.991us 8.04% 62.222us 20.741us 134.270us 71.84% 134.270us 44.757us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 134.270us 71.84% 134.270us 44.757us 3
+ aten::to 0.77% 5.950us 65.77% 509.102us 84.850us 0.000us 0.00% 62.847us 10.474us 6
+ aten::_to_copy 3.29% 25.489us 65.00% 503.152us 83.859us 0.000us 0.00% 62.847us 10.474us 6
+ aten::copy_ 6.45% 49.889us 57.58% 445.721us 74.287us 52.639us 28.16% 62.847us 10.474us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.728us 15.91% 29.728us 9.909us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.911us 12.26% 22.911us 7.637us 3
+ Activity Buffer Request 28.61% 221.416us 28.61% 221.416us 221.416us 10.208us 5.46% 10.208us 10.208us 1
+ aten::empty_strided 4.13% 31.942us 4.13% 31.942us 5.324us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.24% 195.386us 25.24% 195.386us 21.710us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.14% 16.602us 2.90% 22.460us 2.496us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.19% 9.247us 1.19% 9.247us 0.616us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.23% 9.500us 1.23% 9.500us 3.167us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.26% 9.761us 1.26% 9.761us 3.254us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.71% 5.470us 0.87% 6.700us 2.233us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 838.778us
-Self CUDA time total: 185.695us
+Self CPU time total: 774.039us
+Self CUDA time total: 186.909us
@@ -4654,29 +4654,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 368.701us 175.32% 368.701us 368.701us 1
- torch_eager 16.38% 138.724us 99.39% 841.559us 841.559us 0.000us 0.00% 224.383us 224.383us 1
- aten::conv1d 0.69% 5.870us 14.05% 118.945us 39.648us 0.000us 0.00% 154.015us 51.338us 3
- aten::convolution 1.19% 10.050us 13.35% 113.075us 37.692us 0.000us 0.00% 154.015us 51.338us 3
- aten::_convolution 2.68% 22.669us 12.17% 103.025us 34.342us 0.000us 0.00% 154.015us 51.338us 3
- aten::_conv_depthwise2d 2.54% 21.472us 7.66% 64.883us 21.628us 154.015us 73.23% 154.015us 51.338us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 154.015us 73.23% 154.015us 51.338us 3
- aten::to 0.70% 5.911us 65.49% 554.540us 92.423us 0.000us 0.00% 70.368us 11.728us 6
- aten::_to_copy 2.70% 22.862us 64.79% 548.629us 91.438us 0.000us 0.00% 70.368us 11.728us 6
- aten::copy_ 5.97% 50.511us 58.49% 495.276us 82.546us 56.288us 26.77% 70.368us 11.728us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 33.248us 15.81% 33.248us 11.083us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.040us 10.96% 23.040us 7.680us 3
- Activity Buffer Request 32.21% 272.739us 32.21% 272.739us 272.739us 14.080us 6.70% 14.080us 14.080us 1
- aten::empty_strided 3.60% 30.491us 3.60% 30.491us 5.082us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 23.06% 195.277us 23.06% 195.277us 21.697us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.26% 19.134us 2.91% 24.623us 2.736us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.07% 9.019us 1.07% 9.019us 0.601us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.15% 9.700us 1.15% 9.700us 3.233us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.24% 10.460us 1.24% 10.460us 3.487us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.68% 5.760us 0.85% 7.180us 2.393us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 349.277us 165.88% 349.277us 349.277us 1
+ torch_eager 15.39% 117.165us 99.36% 756.609us 756.609us 0.000us 0.00% 224.029us 224.029us 1
+ aten::conv1d 0.74% 5.661us 15.33% 116.734us 38.911us 0.000us 0.00% 154.686us 51.562us 3
+ aten::convolution 1.20% 9.150us 14.59% 111.073us 37.024us 0.000us 0.00% 154.686us 51.562us 3
+ aten::_convolution 2.96% 22.532us 13.38% 101.923us 33.974us 0.000us 0.00% 154.686us 51.562us 3
+ aten::_conv_depthwise2d 2.86% 21.751us 8.47% 64.492us 21.497us 154.686us 73.47% 154.686us 51.562us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 154.686us 73.47% 154.686us 51.562us 3
+ aten::to 0.84% 6.379us 65.15% 496.150us 82.692us 0.000us 0.00% 69.343us 11.557us 6
+ aten::_to_copy 3.33% 25.371us 64.32% 489.771us 81.628us 0.000us 0.00% 69.343us 11.557us 6
+ aten::copy_ 6.44% 49.031us 56.76% 432.240us 72.040us 55.871us 26.53% 69.343us 11.557us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.831us 15.59% 32.831us 10.944us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.040us 10.94% 23.040us 7.680us 3
+ Activity Buffer Request 27.33% 208.145us 27.33% 208.145us 208.145us 13.472us 6.40% 13.472us 13.472us 1
+ aten::empty_strided 4.22% 32.160us 4.22% 32.160us 5.360us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.87% 197.025us 25.87% 197.025us 21.892us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.14% 16.329us 2.83% 21.520us 2.391us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.17% 8.932us 1.17% 8.932us 0.595us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.38% 10.500us 1.38% 10.500us 3.500us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.35% 10.280us 1.35% 10.280us 3.427us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.72% 5.468us 0.90% 6.839us 2.280us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 846.749us
-Self CUDA time total: 210.303us
+Self CPU time total: 761.499us
+Self CUDA time total: 210.557us
@@ -4686,29 +4686,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 6.86% 124.525us 53.03% 963.064us 963.064us 0.000us 0.00% 1.524ms 1.524ms 1
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.423ms 100.42% 1.423ms 1.423ms 1
- aten::to 0.37% 6.781us 38.11% 692.105us 115.351us 0.000us 0.00% 827.798us 137.966us 6
- aten::_to_copy 1.62% 29.329us 37.74% 685.324us 114.221us 0.000us 0.00% 827.798us 137.966us 6
- aten::copy_ 2.86% 52.014us 24.74% 449.228us 74.871us 721.111us 50.87% 827.798us 137.966us 6
- aten::conv1d 0.32% 5.800us 6.51% 118.154us 39.385us 0.000us 0.00% 696.313us 232.104us 3
- aten::convolution 0.55% 9.981us 6.19% 112.354us 37.451us 0.000us 0.00% 696.313us 232.104us 3
- aten::_convolution 1.25% 22.722us 5.64% 102.373us 34.124us 0.000us 0.00% 696.313us 232.104us 3
- aten::_conv_depthwise2d 1.22% 22.241us 3.54% 64.332us 21.444us 696.313us 49.13% 696.313us 232.104us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 696.313us 49.13% 696.313us 232.104us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 411.194us 29.01% 411.194us 137.065us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 309.917us 21.86% 309.917us 103.306us 3
- Activity Buffer Request 12.02% 218.207us 12.02% 218.207us 218.207us 106.687us 7.53% 106.687us 106.687us 1
- aten::empty_strided 1.97% 35.692us 11.39% 206.767us 34.461us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 11.11% 201.717us 11.11% 201.717us 22.413us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.96% 17.369us 1.26% 22.889us 2.543us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.51% 9.249us 0.51% 9.249us 0.617us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.50% 9.061us 0.50% 9.061us 3.020us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.57% 10.320us 0.57% 10.320us 3.440us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.33% 5.990us 0.41% 7.360us 2.453us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 6.72% 121.944us 52.58% 953.714us 953.714us 0.000us 0.00% 1.521ms 1.521ms 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.421ms 100.41% 1.421ms 1.421ms 1
+ aten::to 0.35% 6.300us 37.63% 682.555us 113.759us 0.000us 0.00% 824.097us 137.350us 6
+ aten::_to_copy 1.68% 30.549us 37.28% 676.255us 112.709us 0.000us 0.00% 824.097us 137.350us 6
+ aten::copy_ 2.98% 53.981us 24.83% 450.422us 75.070us 718.817us 50.79% 824.097us 137.350us 6
+ aten::conv1d 0.35% 6.281us 6.65% 120.554us 40.185us 0.000us 0.00% 696.543us 232.181us 3
+ aten::convolution 0.57% 10.251us 6.30% 114.273us 38.091us 0.000us 0.00% 696.543us 232.181us 3
+ aten::_convolution 1.27% 23.111us 5.73% 104.022us 34.674us 0.000us 0.00% 696.543us 232.181us 3
+ aten::_conv_depthwise2d 1.23% 22.359us 3.60% 65.321us 21.774us 696.543us 49.21% 696.543us 232.181us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 696.543us 49.21% 696.543us 232.181us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 409.920us 28.96% 409.920us 136.640us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 308.897us 21.82% 308.897us 102.966us 3
+ Activity Buffer Request 11.98% 217.246us 11.98% 217.246us 217.246us 105.280us 7.44% 105.280us 105.280us 1
+ aten::empty_strided 2.17% 39.370us 10.77% 195.284us 32.547us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 11.13% 201.976us 11.13% 201.976us 22.442us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.99% 18.030us 1.31% 23.761us 2.640us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.53% 9.620us 0.53% 9.620us 0.641us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.59% 10.751us 0.59% 10.751us 3.584us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.52% 9.430us 0.52% 9.430us 3.143us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.31% 5.670us 0.39% 7.030us 2.343us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.816ms
-Self CUDA time total: 1.417ms
+Self CPU time total: 1.814ms
+Self CUDA time total: 1.415ms
@@ -4718,33 +4718,33 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 6.33% 114.706us 41.01% 743.286us 743.286us 0.000us 0.00% 1.500ms 1.500ms 1
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.431ms 100.39% 1.431ms 1.431ms 1
- aten::to 0.32% 5.881us 26.81% 485.936us 80.989us 0.000us 0.00% 762.577us 127.096us 6
- aten::_to_copy 1.28% 23.109us 26.49% 480.055us 80.009us 0.000us 0.00% 762.577us 127.096us 6
- aten::copy_ 2.74% 49.733us 23.67% 429.056us 71.509us 687.698us 48.25% 762.577us 127.096us 6
- aten::conv1d 0.31% 5.590us 6.38% 115.623us 38.541us 0.000us 0.00% 737.523us 245.841us 3
- aten::convolution 0.55% 9.990us 6.07% 110.033us 36.678us 0.000us 0.00% 737.523us 245.841us 3
- aten::_convolution 1.21% 21.900us 5.52% 100.043us 33.348us 0.000us 0.00% 737.523us 245.841us 3
- aten::_conv_depthwise2d 1.16% 21.072us 3.45% 62.453us 20.818us 737.523us 51.75% 737.523us 245.841us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 737.523us 51.75% 737.523us 245.841us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 400.247us 28.08% 400.247us 133.416us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 287.451us 20.17% 287.451us 95.817us 3
- Activity Buffer Request 11.32% 205.227us 11.32% 205.227us 205.227us 74.879us 5.25% 74.879us 74.879us 1
- aten::empty_strided 1.54% 27.890us 1.54% 27.890us 4.648us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 10.89% 197.296us 10.89% 197.296us 21.922us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.95% 17.181us 1.23% 22.321us 2.480us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.49% 8.961us 0.49% 8.961us 0.597us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.50% 9.050us 0.50% 9.050us 3.017us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.50% 9.131us 0.50% 9.131us 3.044us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.32% 5.870us 0.41% 7.390us 2.463us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 4.05% 123.714us 65.96% 2.016ms 2.016ms 0.000us 0.00% 1.502ms 1.502ms 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.433ms 100.43% 1.433ms 1.433ms 1
+ aten::to 0.21% 6.507us 56.82% 1.737ms 289.475us 0.000us 0.00% 764.927us 127.488us 6
+ aten::_to_copy 0.85% 25.961us 56.61% 1.730ms 288.391us 0.000us 0.00% 764.927us 127.488us 6
+ aten::copy_ 1.76% 53.800us 54.73% 1.673ms 278.832us 689.887us 48.36% 764.927us 127.488us 6
+ aten::conv1d 0.20% 6.220us 4.18% 127.663us 42.554us 0.000us 0.00% 736.735us 245.578us 3
+ aten::convolution 0.34% 10.420us 3.97% 121.443us 40.481us 0.000us 0.00% 736.735us 245.578us 3
+ aten::_convolution 0.75% 22.860us 3.63% 111.023us 37.008us 0.000us 0.00% 736.735us 245.578us 3
+ aten::_conv_depthwise2d 0.96% 29.441us 2.37% 72.583us 24.194us 736.735us 51.64% 736.735us 245.578us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 736.735us 51.64% 736.735us 245.578us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 397.471us 27.86% 397.471us 132.490us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 292.416us 20.50% 292.416us 97.472us 3
+ Activity Buffer Request 47.26% 1.445ms 47.26% 1.445ms 1.445ms 75.040us 5.26% 75.040us 75.040us 1
+ aten::empty_strided 1.03% 31.391us 1.03% 31.391us 5.232us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 6.45% 197.169us 6.45% 197.169us 21.908us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.57% 17.300us 0.75% 22.850us 2.539us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.30% 9.200us 0.30% 9.200us 0.613us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.32% 9.780us 0.32% 9.780us 3.260us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.36% 10.870us 0.36% 10.870us 3.623us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.19% 5.770us 0.23% 7.180us 2.393us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.812ms
-Self CUDA time total: 1.425ms
+Self CPU time total: 3.057ms
+Self CUDA time total: 1.427ms
impl wl p50(ms) ok
-torch_eager cuda_B2_D2048_S128_W2 0.09 True
+torch_eager cuda_B2_D2048_S128_W2 0.08 True
torch_eager cuda_B2_D2048_S128_W4 0.08 True
torch_eager cuda_B2_D2048_S2048_W2 0.15 True
torch_eager cuda_B2_D2048_S2048_W4 0.16 True
@@ -4752,7 +4752,7 @@ torch_eager cuda_B2_D2048_S512_W2 0.08 True
torch_eager cuda_B2_D2048_S512_W4 0.08 True
torch_eager cuda_B2_D64_S128_W2 0.07 True
torch_eager cuda_B2_D64_S128_W4 0.09 True
-torch_eager cuda_B2_D64_S2048_W2 0.09 True
+torch_eager cuda_B2_D64_S2048_W2 0.08 True
torch_eager cuda_B2_D64_S2048_W4 0.08 True
torch_eager cuda_B2_D64_S512_W2 0.09 True
torch_eager cuda_B2_D64_S512_W4 0.08 True
@@ -4765,10 +4765,16 @@ torch_eager cuda_B4_D2048_S512_W4 0.10 True
torch_eager cuda_B4_D64_S128_W2 0.08 True
torch_eager cuda_B4_D64_S128_W4 0.08 True
torch_eager cuda_B4_D64_S2048_W2 0.08 True
-torch_eager cuda_B4_D64_S2048_W4 0.08 True
+torch_eager cuda_B4_D64_S2048_W4 0.09 True
torch_eager cuda_B4_D64_S512_W2 0.08 True
torch_eager cuda_B4_D64_S512_W4 0.08 True
+
+
+
+Installed 37 packages in 229ms
+
+
Artifacts:
causal_conv1d.jsonl
diff --git a/causal_conv1d/results/artifacts/combine/latency.svg b/causal_conv1d/results/artifacts/combine/latency.svg
index e336babd9e22036016f034e9655aa303d520c536..07cfbdf7d6b5520fa7d67c8819a8378d9bcd8cb5 100644
--- a/causal_conv1d/results/artifacts/combine/latency.svg
+++ b/causal_conv1d/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:a640783c4d5cb4dc1763b97fa9a3e0cf2d278599a3fc38ba2056846c760ec8fe
-size 35421
+oid sha256:3d92f3a3aa92e11f21958cf1c591a4e709fd40f7b0cccbd544c1e1a77b11bcd2
+size 35429
diff --git a/causal_conv1d/results/combined_results.html b/causal_conv1d/results/combined_results.html
index dcc52b58db96b72ed197292d2ffb66bacd9bf72c..45b22fabef9b9c6a15964465834db2598fd9e481 100644
--- a/causal_conv1d/results/combined_results.html
+++ b/causal_conv1d/results/combined_results.html
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
- 2025-10-28T14:09:26.231666
+ 2025-10-29T14:27:58.771179
image/svg+xml
@@ -4216,70 +4216,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
+
-
+
- 0.1
+ 0.1
-
+
-
+
- 0.2
+ 0.2
-
+
-
+
- 0.3
+ 0.3
-
+
-
+
- 0.4
+ 0.4
-
+
-
+
- 0.5
+ 0.5
@@ -4287,66 +4287,66 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -4405,7 +4405,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ output
▶ uv-logs
|
-Cell: combine | 4.38s
+Cell: combine | 4.32s
| ▶ run
Copy
Raw
@@ -4499,11 +4499,11 @@ hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.05 True
-hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True
-hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.06 True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.06 True
hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.05 True
-hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.06 True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.06 True
hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2 0.05 True
hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4 0.05 True
hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True
@@ -4514,9 +4514,9 @@ hf_kernels_causal_conv1d cuda_B4_D64_S128_W2 0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S128_W4 0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
-hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.06 True
hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
-torch_eager cuda_B2_D2048_S128_W2 0.09 True
+torch_eager cuda_B2_D2048_S128_W2 0.08 True
torch_eager cuda_B2_D2048_S128_W4 0.08 True
torch_eager cuda_B2_D2048_S2048_W2 0.15 True
torch_eager cuda_B2_D2048_S2048_W4 0.16 True
@@ -4524,7 +4524,7 @@ torch_eager cuda_B2_D2048_S512_W2 0.08 True
torch_eager cuda_B2_D2048_S512_W4 0.08 True
torch_eager cuda_B2_D64_S128_W2 0.07 True
torch_eager cuda_B2_D64_S128_W4 0.09 True
-torch_eager cuda_B2_D64_S2048_W2 0.09 True
+torch_eager cuda_B2_D64_S2048_W2 0.08 True
torch_eager cuda_B2_D64_S2048_W4 0.08 True
torch_eager cuda_B2_D64_S512_W2 0.09 True
torch_eager cuda_B2_D64_S512_W4 0.08 True
@@ -4537,7 +4537,7 @@ torch_eager cuda_B4_D2048_S512_W4 0.10 True
torch_eager cuda_B4_D64_S128_W2 0.08 True
torch_eager cuda_B4_D64_S128_W4 0.08 True
torch_eager cuda_B4_D64_S2048_W2 0.08 True
-torch_eager cuda_B4_D64_S2048_W4 0.08 True
+torch_eager cuda_B4_D64_S2048_W4 0.09 True
torch_eager cuda_B4_D64_S512_W2 0.08 True
torch_eager cuda_B4_D64_S512_W4 0.08 True
@@ -4559,7 +4559,7 @@ Implementations included:
-Installed 37 packages in 221ms
+Installed 37 packages in 214ms
@@ -4572,7 +4572,7 @@ Installed 37 packages in 221ms
- 2025-10-28T14:09:26.231666
+ 2025-10-29T14:27:58.771179
image/svg+xml
@@ -4916,70 +4916,70 @@ Installed 37 packages in 221ms
-
+
-
+
- 0.1
+ 0.1
-
+
-
+
- 0.2
+ 0.2
-
+
-
+
- 0.3
+ 0.3
-
+
-
+
- 0.4
+ 0.4
-
+
-
+
- 0.5
+ 0.5
@@ -4987,66 +4987,66 @@ Installed 37 packages in 221ms
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/flash_attn/impls/artifacts/benchmark/attention.jsonl b/flash_attn/impls/artifacts/benchmark/attention.jsonl
index c187be19ed971576faca83871bac5aeb9c24284a..dfaf0c99c533e861b9b0cf0a7d640e38745db1c9 100644
--- a/flash_attn/impls/artifacts/benchmark/attention.jsonl
+++ b/flash_attn/impls/artifacts/benchmark/attention.jsonl
@@ -1,6 +1,6 @@
-{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9106109999947876, "p50": 0.9171110000352201, "p90": 0.9204320000435473, "mean": 0.9179216000347878, "iqr": 0.005419999979494605, "raw_times": [0.9171110000352201, 0.9150120000640527, 0.9106109999947876, 0.9204320000435473, 0.9264420000363316], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9176309999929799, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9576329999845257, "p50": 0.960063999968952, "p90": 0.9623629999850891, "mean": 0.9611931999643275, "iqr": 0.0033900000744324643, "raw_times": [0.9589729999106567, 0.9576329999845257, 0.960063999968952, 0.9669329999724141, 0.9623629999850891], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9673530000782193, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0019650000003821, "p50": 1.0193159999971613, "p90": 1.0211459999709405, "mean": 1.015251600006195, "iqr": 0.01198099994326185, "raw_times": [1.0019650000003821, 1.0091650000276786, 1.024666000034813, 1.0193159999971613, 1.0211459999709405], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.009233999980097, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0176959999625979, "p50": 1.0199849999708022, "p90": 1.025695000066662, "mean": 1.0218714000075124, "iqr": 0.006820000066909415, "raw_times": [1.0271060000377474, 1.0176959999625979, 1.0188749999997526, 1.0199849999708022, 1.025695000066662], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.027405000058934, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1665810000067722, "p50": 1.1845809999613266, "p90": 1.185440999961429, "mean": 1.1787729999923613, "iqr": 0.01419000000169035, "raw_times": [1.1712509999597387, 1.1665810000067722, 1.18601100007254, 1.1845809999613266, 1.185440999961429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1787800000320203, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1722899999995207, "p50": 1.1832310000272628, "p90": 1.1854509999693619, "mean": 1.181276799979969, "iqr": 0.008630000024822948, "raw_times": [1.1885909999591604, 1.1854509999693619, 1.176820999944539, 1.1832310000272628, 1.1722899999995207], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1782799999764393, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:40Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.98791400000664, "p50": 0.995113999977093, "p90": 1.0003840000081254, "mean": 0.9967803999984426, "iqr": 0.00634899998885885, "raw_times": [0.98791400000664, 0.9940350000192666, 1.006454999981088, 1.0003840000081254, 0.995113999977093], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0150049999992916, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0233649999804584, "p50": 1.0321449999537435, "p90": 1.0348449999924014, "mean": 1.032277399974646, "iqr": 0.009739000006447895, "raw_times": [1.045925999960673, 1.0251059999859535, 1.0321449999537435, 1.0233649999804584, 1.0348449999924014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.035865999995167, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0737370000128976, "p50": 1.084086999981082, "p90": 1.088675999994848, "mean": 1.0826705999988917, "iqr": 0.0148400000057336, "raw_times": [1.0738359999891145, 1.0930170000165162, 1.084086999981082, 1.0737370000128976, 1.088675999994848], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0889670000437945, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0887770000067576, "p50": 1.0916359999555425, "p90": 1.096396999969329, "mean": 1.0932085999797891, "iqr": 0.005600999998023326, "raw_times": [1.0887770000067576, 1.0907959999713057, 1.0916359999555425, 1.0984369999960109, 1.096396999969329], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1031370000296192, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2465009999687027, "p50": 1.2523310000460697, "p90": 1.2523909999799798, "mean": 1.2538410000047406, "iqr": 0.005290999979479238, "raw_times": [1.2523310000460697, 1.2523909999799798, 1.2471000000005006, 1.2465009999687027, 1.2708820000284504], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2551809999763464, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T14:27:41Z", "run": "f6f1662cc0064e52be2f08072a7254af", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2355700000057368, "p50": 1.241141000036805, "p90": 1.2576200000466997, "mean": 1.2477664000130062, "iqr": 0.02047000003813082, "raw_times": [1.2355700000057368, 1.241141000036805, 1.237150000008569, 1.2576200000466997, 1.2673509999672206], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2579809999806457, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py
index 15f02e2ed444e10eba9708f3f69247414b6c962b..04ae262009c3d6e33aaa3e392d28c903f24c287c 100644
--- a/flash_attn/impls/cells/benchmark.py
+++ b/flash_attn/impls/cells/benchmark.py
@@ -4,7 +4,7 @@
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
-# "kernels",
+# "xformers",
# ]
#
# [tool.uv.sources]
@@ -13,19 +13,18 @@
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-from kernels import get_kernel
+import xformers.ops as xops
-# Load the flash attention 3 kernel
-hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
-
-def hf_flash_attention3(query, key, value):
- return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
+def xformers_attention(q, k, v):
+ """xFormers memory efficient attention"""
+ # xFormers expects [batch, seq_len, heads, head_dim]
+ return xops.memory_efficient_attention(q, k, v)
run_benchmark(
kernel_type=KernelTypeEnum.ATTENTION,
- impl_name="hf_kernels_flash_attn3",
- impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
- impl_func=hf_flash_attention3,
+ impl_name="xformers_meff",
+ impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
+ impl_func=xformers_attention,
)
\ No newline at end of file
diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html
index b4834aa56614f91a384d067a2ab29e14d8abc5f4..a6e50f4eba46389d1f17c35d67cbb770dc3d8952 100644
--- a/flash_attn/impls/flash_attention.html
+++ b/flash_attn/impls/flash_attention.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ output
▶ uv-logs
|
-Cell: nv | 0.26s
+Cell: nv | 0.28s
| ▶ run
Copy
Raw
@@ -3888,7 +3888,7 @@ Cell: nv | 0.26s
-
Tue Oct 28 14:08:39 2025
+Wed Oct 29 14:25:53 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
@@ -3897,7 +3897,7 @@ Cell: nv | 0.26s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A 32C P0 153W / 350W | 0MiB / 46068MiB | 26% Default |
+| N/A 27C P8 21W / 350W | 0MiB / 46068MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
@@ -3919,9 +3919,9 @@ Cell: nv | 0.26s
▼ code
▼ output
- ▶ uv-logs
+ ▶ uv-logs
|
-Cell: benchmark | 3.83s
+Cell: benchmark | 32.77s
| ▶ run
Copy
Raw
@@ -3972,29 +3972,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.585ms 101.47% 3.585ms 3.585ms 1
- torch_flash_ma 6.34% 327.656us 45.53% 2.352ms 2.352ms 0.000us 0.00% 3.573ms 3.573ms 1
- aten::scaled_dot_product_attention 0.82% 42.312us 4.12% 213.057us 71.019us 0.000us 0.00% 2.820ms 940.062us 3
- aten::_scaled_dot_product_flash_attention 0.51% 26.321us 3.31% 170.745us 56.915us 0.000us 0.00% 2.820ms 940.062us 3
- aten::_flash_attention_forward 0.73% 37.527us 2.40% 124.015us 41.338us 2.820ms 79.83% 2.820ms 940.062us 3
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.820ms 79.83% 2.820ms 940.062us 3
- aten::contiguous 0.27% 14.121us 33.79% 1.745ms 145.446us 0.000us 0.00% 752.928us 62.744us 12
- aten::clone 0.72% 37.329us 33.52% 1.731ms 144.269us 0.000us 0.00% 752.928us 62.744us 12
- aten::copy_ 1.68% 87.013us 31.25% 1.614ms 134.513us 712.672us 20.17% 752.928us 62.744us 12
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 712.672us 20.17% 712.672us 59.389us 12
- Activity Buffer Request 27.64% 1.428ms 27.64% 1.428ms 1.428ms 40.256us 1.14% 40.256us 40.256us 1
- aten::transpose 1.24% 64.087us 1.67% 86.009us 3.584us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.42% 21.922us 0.42% 21.922us 0.913us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.48% 24.711us 1.99% 102.775us 6.852us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.74% 89.843us 1.74% 89.843us 3.743us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 2.38% 122.771us 2.38% 122.771us 8.185us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_strided 0.34% 17.310us 0.34% 17.310us 5.770us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceGetAttribute 0.04% 2.229us 0.04% 2.229us 0.372us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.17% 8.900us 0.17% 8.900us 2.967us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 54.47% 2.814ms 54.47% 2.814ms 2.814ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.644ms 102.02% 3.644ms 3.644ms 1
+ torch_flash_ma 6.80% 356.846us 47.04% 2.468ms 2.468ms 0.000us 0.00% 3.612ms 3.612ms 1
+ aten::scaled_dot_product_attention 0.82% 43.042us 4.47% 234.776us 78.259us 0.000us 0.00% 2.857ms 952.201us 3
+ aten::_scaled_dot_product_flash_attention 0.56% 29.330us 3.65% 191.734us 63.911us 0.000us 0.00% 2.857ms 952.201us 3
+ aten::_flash_attention_forward 0.75% 39.581us 2.59% 135.674us 45.225us 2.857ms 79.97% 2.857ms 952.201us 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.857ms 79.97% 2.857ms 952.201us 3
+ aten::contiguous 0.27% 14.180us 34.32% 1.801ms 150.051us 0.000us 0.00% 755.680us 62.973us 12
+ aten::clone 0.74% 38.791us 34.04% 1.786ms 148.870us 0.000us 0.00% 755.680us 62.973us 12
+ aten::copy_ 1.85% 97.030us 31.43% 1.649ms 137.429us 715.456us 20.03% 755.680us 62.973us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 715.456us 20.03% 715.456us 59.621us 12
+ Activity Buffer Request 27.38% 1.437ms 27.38% 1.437ms 1.437ms 40.224us 1.13% 40.224us 40.224us 1
+ aten::transpose 1.47% 77.273us 1.96% 102.714us 4.280us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.48% 25.441us 0.48% 25.441us 1.060us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.70% 36.821us 2.35% 123.326us 8.222us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.93% 101.493us 1.93% 101.493us 4.229us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 2.70% 141.775us 2.70% 141.775us 9.452us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.35% 18.402us 0.35% 18.402us 6.134us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.05% 2.540us 0.05% 2.540us 0.423us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.17% 8.890us 0.17% 8.890us 2.963us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 52.96% 2.779ms 52.96% 2.779ms 2.779ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.165ms
-Self CUDA time total: 3.533ms
+Self CPU time total: 5.247ms
+Self CUDA time total: 3.572ms
@@ -4004,29 +4004,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_flash_ma 4.84% 255.079us 41.49% 2.188ms 2.188ms 0.000us 0.00% 3.787ms 3.787ms 1
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.743ms 100.29% 3.743ms 3.743ms 1
- aten::scaled_dot_product_attention 0.47% 24.640us 3.42% 180.356us 60.119us 0.000us 0.00% 2.967ms 989.106us 3
- aten::_scaled_dot_product_flash_attention 0.36% 19.241us 2.95% 155.716us 51.905us 0.000us 0.00% 2.967ms 989.106us 3
- aten::_flash_attention_forward 0.73% 38.683us 2.19% 115.525us 38.508us 2.967ms 79.51% 2.967ms 989.106us 3
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.967ms 79.51% 2.967ms 989.106us 3
- aten::contiguous 0.17% 8.802us 32.41% 1.709ms 142.425us 0.000us 0.00% 819.868us 68.322us 12
- aten::clone 0.52% 27.349us 32.24% 1.700ms 141.692us 0.000us 0.00% 819.868us 68.322us 12
- aten::copy_ 1.56% 82.061us 30.60% 1.614ms 134.473us 764.892us 20.49% 819.868us 68.322us 12
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 764.892us 20.49% 764.892us 63.741us 12
- Activity Buffer Request 27.50% 1.450ms 27.50% 1.450ms 1.450ms 54.976us 1.47% 54.976us 54.976us 1
- aten::transpose 0.91% 47.959us 1.22% 64.512us 2.688us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.31% 16.553us 0.31% 16.553us 0.690us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.39% 20.732us 1.52% 80.304us 5.354us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.38% 72.972us 1.38% 72.972us 3.040us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 1.96% 103.146us 1.96% 103.146us 6.876us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_strided 0.28% 14.880us 0.28% 14.880us 4.960us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceGetAttribute 0.03% 1.800us 0.03% 1.800us 0.300us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.07% 3.830us 0.07% 3.830us 1.277us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 58.51% 3.085ms 58.51% 3.085ms 3.085ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_flash_ma 4.70% 246.528us 41.73% 2.189ms 2.189ms 0.000us 0.00% 3.817ms 3.817ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.772ms 100.28% 3.772ms 3.772ms 1
+ aten::scaled_dot_product_attention 0.51% 26.610us 3.43% 180.143us 60.048us 0.000us 0.00% 2.999ms 999.573us 3
+ aten::_scaled_dot_product_flash_attention 0.37% 19.600us 2.93% 153.533us 51.178us 0.000us 0.00% 2.999ms 999.573us 3
+ aten::_flash_attention_forward 0.63% 32.980us 2.12% 111.443us 37.148us 2.999ms 79.71% 2.999ms 999.573us 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.999ms 79.71% 2.999ms 999.573us 3
+ aten::contiguous 0.19% 10.030us 32.68% 1.715ms 142.893us 0.000us 0.00% 818.210us 68.184us 12
+ aten::clone 0.55% 29.002us 32.49% 1.705ms 142.057us 0.000us 0.00% 818.210us 68.184us 12
+ aten::copy_ 2.09% 109.441us 30.74% 1.613ms 134.399us 763.297us 20.29% 818.210us 68.184us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 763.297us 20.29% 763.297us 63.608us 12
+ Activity Buffer Request 26.94% 1.413ms 26.94% 1.413ms 1.413ms 54.913us 1.46% 54.913us 54.913us 1
+ aten::transpose 1.00% 52.652us 1.34% 70.433us 2.935us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.34% 17.781us 0.34% 17.781us 0.741us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.38% 19.980us 1.61% 84.581us 5.639us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.45% 76.201us 1.45% 76.201us 3.175us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 2.16% 113.102us 2.16% 113.102us 7.540us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.31% 16.430us 0.31% 16.430us 5.477us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.03% 1.751us 0.03% 1.751us 0.292us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.07% 3.771us 0.07% 3.771us 1.257us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 58.27% 3.058ms 58.27% 3.058ms 3.058ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.273ms
-Self CUDA time total: 3.732ms
+Self CPU time total: 5.247ms
+Self CUDA time total: 3.762ms
@@ -4036,29 +4036,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_flash_ma 4.77% 251.162us 41.45% 2.184ms 2.184ms 0.000us 0.00% 3.786ms 3.786ms 1
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.738ms 100.28% 3.738ms 3.738ms 1
- aten::scaled_dot_product_attention 0.46% 24.280us 3.42% 180.086us 60.029us 0.000us 0.00% 2.949ms 982.872us 3
- aten::_scaled_dot_product_flash_attention 0.34% 18.160us 2.96% 155.806us 51.935us 0.000us 0.00% 2.949ms 982.872us 3
- aten::_flash_attention_forward 0.73% 38.599us 2.20% 115.865us 38.622us 2.949ms 79.09% 2.949ms 982.872us 3
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.949ms 79.09% 2.949ms 982.872us 3
- aten::contiguous 0.17% 8.991us 32.44% 1.710ms 142.465us 0.000us 0.00% 837.719us 69.810us 12
- aten::clone 0.53% 27.728us 32.27% 1.701ms 141.715us 0.000us 0.00% 837.719us 69.810us 12
- aten::copy_ 1.52% 79.873us 30.57% 1.611ms 134.242us 779.480us 20.91% 837.719us 69.810us 12
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 779.480us 20.91% 779.480us 64.957us 12
- Activity Buffer Request 27.50% 1.449ms 27.50% 1.449ms 1.449ms 58.239us 1.56% 58.239us 58.239us 1
- aten::transpose 0.92% 48.219us 1.24% 65.252us 2.719us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.32% 17.033us 0.32% 17.033us 0.710us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.37% 19.303us 1.55% 81.795us 5.453us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.44% 76.031us 1.44% 76.031us 3.168us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 1.98% 104.564us 1.98% 104.564us 6.971us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_strided 0.28% 14.492us 0.28% 14.492us 4.831us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceGetAttribute 0.04% 1.860us 0.04% 1.860us 0.310us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.10% 5.030us 0.10% 5.030us 1.677us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 58.55% 3.085ms 58.55% 3.085ms 3.085ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_flash_ma 4.50% 237.986us 41.18% 2.178ms 2.178ms 0.000us 0.00% 3.833ms 3.833ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.785ms 100.29% 3.785ms 3.785ms 1
+ aten::scaled_dot_product_attention 0.46% 24.381us 3.40% 179.915us 59.972us 0.000us 0.00% 2.998ms 999.221us 3
+ aten::_scaled_dot_product_flash_attention 0.36% 19.171us 2.94% 155.534us 51.845us 0.000us 0.00% 2.998ms 999.221us 3
+ aten::_flash_attention_forward 0.65% 34.259us 2.15% 113.691us 37.897us 2.998ms 79.44% 2.998ms 999.221us 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.998ms 79.44% 2.998ms 999.221us 3
+ aten::contiguous 0.19% 9.800us 32.38% 1.712ms 142.708us 0.000us 0.00% 835.263us 69.605us 12
+ aten::clone 0.53% 28.211us 32.20% 1.703ms 141.891us 0.000us 0.00% 835.263us 69.605us 12
+ aten::copy_ 1.60% 84.650us 30.46% 1.611ms 134.247us 776.063us 20.56% 835.263us 69.605us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 776.063us 20.56% 776.063us 64.672us 12
+ Activity Buffer Request 27.18% 1.437ms 27.18% 1.437ms 1.437ms 59.200us 1.57% 59.200us 59.200us 1
+ aten::transpose 0.99% 52.225us 1.33% 70.125us 2.922us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.34% 17.900us 0.34% 17.900us 0.746us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.37% 19.782us 1.60% 84.803us 5.654us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.45% 76.431us 1.45% 76.431us 3.185us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 2.16% 114.204us 2.16% 114.204us 7.614us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.30% 16.100us 0.30% 16.100us 5.367us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.03% 1.730us 0.03% 1.730us 0.288us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.07% 3.730us 0.07% 3.730us 1.243us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 58.82% 3.110ms 58.82% 3.110ms 3.110ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.269ms
-Self CUDA time total: 3.728ms
+Self CPU time total: 5.288ms
+Self CUDA time total: 3.774ms
@@ -4068,29 +4068,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_flash_ma 5.01% 280.573us 44.17% 2.475ms 2.475ms 0.000us 0.00% 3.878ms 3.878ms 1
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.831ms 100.27% 3.831ms 3.831ms 1
- aten::scaled_dot_product_attention 0.48% 26.630us 3.39% 189.956us 63.319us 0.000us 0.00% 3.032ms 1.011ms 3
- aten::_scaled_dot_product_flash_attention 0.34% 19.101us 2.91% 163.326us 54.442us 0.000us 0.00% 3.032ms 1.011ms 3
- aten::_flash_attention_forward 0.70% 39.063us 2.15% 120.325us 40.108us 3.032ms 79.37% 3.032ms 1.011ms 3
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.032ms 79.37% 3.032ms 1.011ms 3
- aten::contiguous 0.17% 9.271us 34.98% 1.960ms 163.354us 0.000us 0.00% 845.820us 70.485us 12
- aten::clone 0.52% 28.974us 34.82% 1.951ms 162.581us 0.000us 0.00% 845.820us 70.485us 12
- aten::copy_ 1.48% 83.180us 33.17% 1.859ms 154.908us 788.284us 20.63% 845.820us 70.485us 12
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 788.284us 20.63% 788.284us 65.690us 12
- Activity Buffer Request 26.18% 1.467ms 26.18% 1.467ms 1.467ms 57.536us 1.51% 57.536us 57.536us 1
- aten::transpose 0.89% 50.110us 1.21% 67.952us 2.831us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.32% 17.842us 0.32% 17.842us 0.743us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.36% 19.969us 1.53% 85.492us 5.699us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.37% 76.982us 1.37% 76.982us 3.208us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 5.95% 333.480us 5.95% 333.480us 22.232us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_strided 0.30% 17.041us 0.30% 17.041us 5.680us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceGetAttribute 0.03% 1.700us 0.03% 1.700us 0.283us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 55.83% 3.129ms 55.83% 3.129ms 3.129ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_flash_ma 4.36% 241.837us 43.33% 2.405ms 2.405ms 0.000us 0.00% 3.884ms 3.884ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.837ms 100.27% 3.837ms 3.837ms 1
+ aten::scaled_dot_product_attention 0.48% 26.802us 3.27% 181.715us 60.572us 0.000us 0.00% 3.042ms 1.014ms 3
+ aten::_scaled_dot_product_flash_attention 0.35% 19.308us 2.79% 154.913us 51.638us 0.000us 0.00% 3.042ms 1.014ms 3
+ aten::_flash_attention_forward 0.60% 33.361us 2.03% 112.712us 37.571us 3.042ms 79.50% 3.042ms 1.014ms 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.042ms 79.50% 3.042ms 1.014ms 3
+ aten::contiguous 0.17% 9.659us 34.84% 1.934ms 161.162us 0.000us 0.00% 841.829us 70.152us 12
+ aten::clone 0.50% 27.830us 34.67% 1.924ms 160.357us 0.000us 0.00% 841.829us 70.152us 12
+ aten::copy_ 1.56% 86.702us 32.55% 1.807ms 150.547us 784.548us 20.50% 841.829us 70.152us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 784.548us 20.50% 784.548us 65.379us 12
+ Activity Buffer Request 25.45% 1.413ms 25.45% 1.413ms 1.413ms 57.281us 1.50% 57.281us 57.281us 1
+ aten::transpose 0.95% 52.620us 1.27% 70.404us 2.933us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.32% 17.784us 0.32% 17.784us 0.741us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.78% 43.221us 2.00% 111.194us 7.413us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.45% 80.673us 1.45% 80.673us 3.361us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 5.96% 331.078us 5.96% 331.078us 22.072us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.28% 15.800us 0.28% 15.800us 5.267us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.03% 1.730us 0.03% 1.730us 0.288us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.07% 3.850us 0.07% 3.850us 1.283us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 56.67% 3.146ms 56.67% 3.146ms 3.146ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.603ms
-Self CUDA time total: 3.820ms
+Self CPU time total: 5.551ms
+Self CUDA time total: 3.827ms
@@ -4100,29 +4100,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_flash_ma 5.07% 303.893us 39.93% 2.395ms 2.395ms 0.000us 0.00% 4.370ms 4.370ms 1
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.320ms 100.25% 4.320ms 4.320ms 1
- aten::scaled_dot_product_attention 0.41% 24.650us 3.07% 184.006us 61.335us 0.000us 0.00% 3.503ms 1.168ms 3
- aten::_scaled_dot_product_flash_attention 0.32% 19.311us 2.66% 159.356us 53.119us 0.000us 0.00% 3.503ms 1.168ms 3
- aten::_flash_attention_forward 0.68% 40.911us 1.97% 118.205us 39.402us 3.503ms 81.28% 3.503ms 1.168ms 3
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.503ms 81.28% 3.503ms 1.168ms 3
- aten::contiguous 0.15% 8.977us 31.04% 1.862ms 155.201us 0.000us 0.00% 867.581us 72.298us 12
- aten::clone 0.47% 28.114us 30.89% 1.853ms 154.453us 0.000us 0.00% 867.581us 72.298us 12
- aten::copy_ 1.36% 81.500us 29.40% 1.764ms 146.991us 806.749us 18.72% 867.581us 72.298us 12
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 806.749us 18.72% 806.749us 67.229us 12
- Activity Buffer Request 23.82% 1.429ms 23.82% 1.429ms 1.429ms 60.832us 1.41% 60.832us 60.832us 1
- aten::transpose 0.82% 49.363us 1.11% 66.863us 2.786us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.29% 17.500us 0.29% 17.500us 0.729us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.33% 20.081us 1.37% 82.424us 5.495us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.26% 75.593us 1.26% 75.593us 3.150us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 4.60% 275.759us 4.60% 275.759us 18.384us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_strided 0.25% 15.251us 0.25% 15.251us 5.084us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceGetAttribute 0.03% 1.740us 0.03% 1.740us 0.290us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.06% 3.680us 0.06% 3.680us 1.227us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 60.07% 3.604ms 60.07% 3.604ms 3.604ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_flash_ma 4.46% 268.165us 40.09% 2.413ms 2.413ms 0.000us 0.00% 4.405ms 4.405ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.355ms 100.25% 4.355ms 4.355ms 1
+ aten::scaled_dot_product_attention 0.46% 27.642us 3.64% 218.806us 72.935us 0.000us 0.00% 3.540ms 1.180ms 3
+ aten::_scaled_dot_product_flash_attention 0.75% 45.250us 3.18% 191.164us 63.721us 0.000us 0.00% 3.540ms 1.180ms 3
+ aten::_flash_attention_forward 0.61% 36.651us 2.01% 120.923us 40.308us 3.540ms 81.48% 3.540ms 1.180ms 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.540ms 81.48% 3.540ms 1.180ms 3
+ aten::contiguous 0.18% 10.862us 31.11% 1.873ms 156.050us 0.000us 0.00% 865.606us 72.134us 12
+ aten::clone 0.51% 30.490us 30.93% 1.862ms 155.145us 0.000us 0.00% 865.606us 72.134us 12
+ aten::copy_ 1.51% 90.931us 29.34% 1.766ms 147.155us 804.645us 18.52% 865.606us 72.134us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 804.645us 18.52% 804.645us 67.054us 12
+ Activity Buffer Request 21.61% 1.300ms 21.61% 1.300ms 1.300ms 60.961us 1.40% 60.961us 60.961us 1
+ aten::transpose 0.99% 59.753us 1.30% 78.501us 3.271us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.31% 18.748us 0.31% 18.748us 0.781us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.35% 20.935us 1.45% 87.165us 5.811us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.32% 79.690us 1.32% 79.690us 3.320us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 6.67% 401.680us 6.67% 401.680us 26.779us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.27% 16.081us 0.27% 16.081us 5.360us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.03% 2.030us 0.03% 2.030us 0.338us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.06% 3.810us 0.06% 3.810us 1.270us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 59.91% 3.605ms 59.91% 3.605ms 3.605ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.999ms
-Self CUDA time total: 4.309ms
+Self CPU time total: 6.018ms
+Self CUDA time total: 4.344ms
@@ -4132,39 +4132,91 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_flash_ma 3.83% 232.270us 37.82% 2.296ms 2.296ms 0.000us 0.00% 4.474ms 4.474ms 1
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.423ms 100.25% 4.423ms 4.423ms 1
- aten::scaled_dot_product_attention 0.41% 24.850us 2.85% 172.746us 57.582us 0.000us 0.00% 3.595ms 1.198ms 3
- aten::_scaled_dot_product_flash_attention 0.30% 18.250us 2.44% 147.896us 49.299us 0.000us 0.00% 3.595ms 1.198ms 3
- aten::_flash_attention_forward 0.54% 32.692us 1.77% 107.224us 35.741us 3.595ms 81.48% 3.595ms 1.198ms 3
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.595ms 81.48% 3.595ms 1.198ms 3
- aten::contiguous 0.14% 8.610us 30.41% 1.846ms 153.859us 0.000us 0.00% 878.139us 73.178us 12
- aten::clone 0.45% 27.368us 30.27% 1.838ms 153.142us 0.000us 0.00% 878.139us 73.178us 12
- aten::copy_ 1.35% 81.917us 28.83% 1.750ms 145.831us 817.083us 18.52% 878.139us 73.178us 12
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 817.083us 18.52% 817.083us 68.090us 12
- Activity Buffer Request 23.72% 1.440ms 23.72% 1.440ms 1.440ms 61.056us 1.38% 61.056us 61.056us 1
- aten::transpose 0.82% 50.064us 1.10% 66.792us 2.783us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.28% 16.728us 0.28% 16.728us 0.697us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.32% 19.431us 1.31% 79.591us 5.306us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.21% 73.220us 1.21% 73.220us 3.051us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 4.12% 249.950us 4.12% 249.950us 16.663us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_strided 0.24% 14.270us 0.24% 14.270us 4.757us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceGetAttribute 0.03% 1.680us 0.03% 1.680us 0.280us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.07% 4.380us 0.07% 4.380us 1.460us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 62.18% 3.775ms 62.18% 3.775ms 3.775ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_flash_ma 4.01% 246.839us 39.75% 2.447ms 2.447ms 0.000us 0.00% 4.458ms 4.458ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.407ms 100.23% 4.407ms 4.407ms 1
+ aten::scaled_dot_product_attention 0.40% 24.621us 2.95% 181.474us 60.491us 0.000us 0.00% 3.579ms 1.193ms 3
+ aten::_scaled_dot_product_flash_attention 0.34% 20.980us 2.55% 156.853us 52.284us 0.000us 0.00% 3.579ms 1.193ms 3
+ aten::_flash_attention_forward 0.58% 35.588us 1.84% 113.003us 37.668us 3.579ms 81.40% 3.579ms 1.193ms 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.579ms 81.40% 3.579ms 1.193ms 3
+ aten::contiguous 0.16% 10.061us 32.01% 1.971ms 164.244us 0.000us 0.00% 878.818us 73.235us 12
+ aten::clone 0.50% 30.903us 31.85% 1.961ms 163.406us 0.000us 0.00% 878.818us 73.235us 12
+ aten::copy_ 1.35% 82.841us 30.27% 1.864ms 155.305us 817.634us 18.60% 878.818us 73.235us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 817.634us 18.60% 817.634us 68.136us 12
+ Activity Buffer Request 23.50% 1.447ms 23.50% 1.447ms 1.447ms 61.184us 1.39% 61.184us 61.184us 1
+ aten::transpose 0.85% 52.630us 1.15% 70.790us 2.950us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.29% 18.160us 0.29% 18.160us 0.757us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.33% 20.456us 1.41% 86.700us 5.780us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.28% 78.794us 1.28% 78.794us 3.283us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 5.81% 357.919us 5.81% 357.919us 23.861us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.25% 15.401us 0.25% 15.401us 5.134us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.03% 1.632us 0.03% 1.632us 0.272us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.06% 3.720us 0.06% 3.720us 1.240us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 60.25% 3.709ms 60.25% 3.709ms 3.709ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 6.071ms
-Self CUDA time total: 4.413ms
+Self CPU time total: 6.156ms
+Self CUDA time total: 4.397ms
impl wl p50(ms) ok
torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
-torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
-torch_flash_ma cuda_attn_L320_bfloat16 1.28 True
-torch_flash_ma cuda_attn_L384_bfloat16 1.31 True
+torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
+torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
+torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
+
+
+
+ Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading matplotlib (8.3MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading numpy (16.2MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading sympy (6.0MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading setuptools (1.1MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading triton (148.3MiB)
+Downloading pillow (6.7MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading torch (846.9MiB)
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading setuptools
+ Downloading networkx
+ Downloading fonttools
+ Downloading pillow
+ Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading numpy
+ Downloading sympy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+Installed 37 packages in 212ms
+
+
Artifacts:
attention.jsonl
diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html
index 6414b268459e56cf2a96ef4b229b35fde2e104fa..7d03567858952d02de89e25ce04873ef34373a75 100644
--- a/flash_attn/impls/hf_kernels_flash_attn.html
+++ b/flash_attn/impls/hf_kernels_flash_attn.html
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ code
▼ output
- ▶ uv-logs
+ ▶ uv-logs
|
-Cell: benchmark | 6.08s
+Cell: benchmark | 5.58s
|
▶ run
Copy
Raw
@@ -3926,21 +3926,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 3.64% 160.058us 41.50% 1.823ms 1.823ms 0.000us 0.00% 3.744ms 3.744ms 1
- _flash_attn_9e27194::fwd 1.78% 78.347us 37.86% 1.663ms 554.208us 2.792ms 100.00% 3.744ms 1.248ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.794ms 100.05% 2.794ms 2.794ms 1
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.792ms 100.00% 2.792ms 930.800us 3
- Activity Buffer Request 33.00% 1.449ms 33.00% 1.449ms 1.449ms 951.685us 34.08% 951.685us 951.685us 1
- cudaDeviceGetAttribute 0.13% 5.638us 0.13% 5.638us 0.376us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.40% 17.551us 1.19% 52.122us 17.374us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.79% 34.571us 0.79% 34.571us 11.524us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.57% 24.890us 0.57% 24.890us 2.766us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.28% 12.210us 0.28% 12.210us 4.070us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.92% 40.292us 0.92% 40.292us 13.431us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 58.50% 2.569ms 58.50% 2.569ms 2.569ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn 3.55% 156.153us 41.08% 1.807ms 1.807ms 0.000us 0.00% 3.775ms 3.775ms 1
+ _flash_attn_9e27194::fwd 1.65% 72.542us 37.53% 1.651ms 550.240us 2.812ms 100.00% 3.775ms 1.258ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.814ms 100.05% 2.814ms 2.814ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.812ms 100.00% 2.812ms 937.398us 3
+ Activity Buffer Request 32.22% 1.417ms 32.22% 1.417ms 1.417ms 962.880us 34.24% 962.880us 962.880us 1
+ cudaDeviceGetAttribute 0.13% 5.500us 0.13% 5.500us 0.367us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.43% 19.110us 1.25% 54.882us 18.294us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.81% 35.772us 0.81% 35.772us 11.924us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.57% 25.101us 0.57% 25.101us 2.789us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.30% 13.270us 0.30% 13.270us 4.423us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.42% 62.402us 1.42% 62.402us 20.801us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 58.92% 2.591ms 58.92% 2.591ms 2.591ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.392ms
-Self CUDA time total: 2.792ms
+Self CPU time total: 4.398ms
+Self CUDA time total: 2.812ms
@@ -3950,21 +3950,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.22% 99.144us 37.48% 1.673ms 1.673ms 0.000us 0.00% 3.949ms 3.949ms 1
- _flash_attn_9e27194::fwd 1.20% 53.462us 35.26% 1.574ms 524.654us 2.953ms 100.00% 3.949ms 1.316ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.955ms 100.05% 2.955ms 2.955ms 1
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.953ms 100.00% 2.953ms 984.436us 3
- Activity Buffer Request 32.23% 1.439ms 32.23% 1.439ms 1.439ms 995.807us 33.72% 995.807us 995.807us 1
- cudaDeviceGetAttribute 0.10% 4.621us 0.10% 4.621us 0.308us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.17% 7.710us 0.56% 24.861us 8.287us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.38% 17.151us 0.38% 17.151us 5.717us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.47% 21.122us 0.47% 21.122us 2.347us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.08% 3.791us 0.08% 3.791us 1.264us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.61% 27.380us 0.61% 27.380us 9.127us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 62.52% 2.791ms 62.52% 2.791ms 2.791ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn 2.04% 91.192us 36.62% 1.634ms 1.634ms 0.000us 0.00% 3.983ms 3.983ms 1
+ _flash_attn_9e27194::fwd 1.11% 49.718us 34.57% 1.543ms 514.203us 2.978ms 100.00% 3.983ms 1.328ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.980ms 100.05% 2.980ms 2.980ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.978ms 100.00% 2.978ms 992.707us 3
+ Activity Buffer Request 31.74% 1.416ms 31.74% 1.416ms 1.416ms 1.004ms 33.73% 1.004ms 1.004ms 1
+ cudaDeviceGetAttribute 0.08% 3.711us 0.08% 3.711us 0.247us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.17% 7.481us 0.51% 22.841us 7.614us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.34% 15.360us 0.34% 15.360us 5.120us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.46% 20.620us 0.46% 20.620us 2.291us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.08% 3.741us 0.08% 3.741us 1.247us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.58% 25.842us 0.58% 25.842us 8.614us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 63.38% 2.828ms 63.38% 2.828ms 2.828ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.464ms
-Self CUDA time total: 2.953ms
+Self CPU time total: 4.462ms
+Self CUDA time total: 2.978ms
@@ -3974,21 +3974,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.58% 116.955us 37.54% 1.702ms 1.702ms 0.000us 0.00% 4.041ms 4.041ms 1
- _flash_attn_9e27194::fwd 1.53% 69.255us 34.96% 1.585ms 528.314us 3.010ms 100.00% 4.041ms 1.347ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.012ms 100.05% 3.012ms 3.012ms 1
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.010ms 100.00% 3.010ms 1.003ms 3
- Activity Buffer Request 31.53% 1.430ms 31.53% 1.430ms 1.430ms 1.031ms 34.26% 1.031ms 1.031ms 1
- cudaDeviceGetAttribute 0.10% 4.450us 0.10% 4.450us 0.297us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.18% 8.151us 0.57% 25.801us 8.600us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.39% 17.650us 0.39% 17.650us 5.883us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.48% 21.771us 0.48% 21.771us 2.419us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.10% 4.360us 0.10% 4.360us 1.453us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.66% 29.790us 0.66% 29.790us 9.930us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 62.46% 2.832ms 62.46% 2.832ms 2.832ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn 2.28% 105.284us 36.17% 1.673ms 1.673ms 0.000us 0.00% 4.145ms 4.145ms 1
+ _flash_attn_9e27194::fwd 1.09% 50.271us 33.89% 1.567ms 522.459us 3.096ms 100.00% 4.145ms 1.382ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.098ms 100.05% 3.098ms 3.098ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.096ms 100.00% 3.096ms 1.032ms 3
+ Activity Buffer Request 31.08% 1.437ms 31.08% 1.437ms 1.437ms 1.049ms 33.87% 1.049ms 1.049ms 1
+ cudaDeviceGetAttribute 0.08% 3.850us 0.08% 3.850us 0.257us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.15% 7.061us 0.49% 22.631us 7.544us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.34% 15.570us 0.34% 15.570us 5.190us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.47% 21.760us 0.47% 21.760us 2.418us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.08% 3.689us 0.08% 3.689us 1.230us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.61% 27.992us 0.61% 27.992us 9.331us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 63.83% 2.952ms 63.83% 2.952ms 2.952ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.534ms
-Self CUDA time total: 3.010ms
+Self CPU time total: 4.625ms
+Self CUDA time total: 3.096ms
@@ -3998,21 +3998,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.39% 114.805us 40.03% 1.925ms 1.925ms 0.000us 0.00% 4.094ms 4.094ms 1
- _flash_attn_9e27194::fwd 1.09% 52.653us 37.65% 1.810ms 603.407us 3.063ms 100.00% 4.094ms 1.365ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.065ms 100.05% 3.065ms 3.065ms 1
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.063ms 100.00% 3.063ms 1.021ms 3
- Activity Buffer Request 29.78% 1.432ms 29.78% 1.432ms 1.432ms 1.031ms 33.65% 1.031ms 1.031ms 1
- cudaDeviceGetAttribute 0.10% 4.861us 0.10% 4.861us 0.324us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.16% 7.720us 0.55% 26.331us 8.777us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.39% 18.611us 0.39% 18.611us 6.204us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.45% 21.731us 0.45% 21.731us 2.415us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.08% 3.728us 0.08% 3.728us 1.243us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 5.59% 268.862us 5.59% 268.862us 89.621us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 59.97% 2.884ms 59.97% 2.884ms 2.884ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn 2.30% 110.882us 38.29% 1.842ms 1.842ms 0.000us 0.00% 4.161ms 4.161ms 1
+ _flash_attn_9e27194::fwd 1.05% 50.321us 35.98% 1.731ms 577.014us 3.117ms 100.00% 4.161ms 1.387ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.118ms 100.05% 3.118ms 3.118ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.117ms 100.00% 3.117ms 1.039ms 3
+ Activity Buffer Request 29.64% 1.426ms 29.64% 1.426ms 1.426ms 1.044ms 33.50% 1.044ms 1.044ms 1
+ cudaDeviceGetAttribute 0.08% 3.780us 0.08% 3.780us 0.252us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.15% 7.259us 0.50% 24.240us 8.080us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.35% 16.981us 0.35% 16.981us 5.660us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.45% 21.602us 0.45% 21.602us 2.400us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.08% 3.770us 0.08% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 4.18% 201.205us 4.18% 201.205us 67.068us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 61.71% 2.969ms 61.71% 2.969ms 2.969ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.809ms
-Self CUDA time total: 3.063ms
+Self CPU time total: 4.811ms
+Self CUDA time total: 3.117ms
@@ -4022,21 +4022,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.13% 113.755us 35.84% 1.918ms 1.918ms 0.000us 0.00% 4.786ms 4.786ms 1
- _flash_attn_9e27194::fwd 1.02% 54.483us 33.71% 1.804ms 601.364us 3.588ms 100.00% 4.786ms 1.595ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.590ms 100.04% 3.590ms 3.590ms 1
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.588ms 100.00% 3.588ms 1.196ms 3
- Activity Buffer Request 26.99% 1.445ms 26.99% 1.445ms 1.445ms 1.198ms 33.38% 1.198ms 1.198ms 1
- cudaDeviceGetAttribute 0.08% 4.270us 0.08% 4.270us 0.285us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.15% 8.039us 0.48% 25.640us 8.547us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.33% 17.601us 0.33% 17.601us 5.867us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.40% 21.582us 0.40% 21.582us 2.398us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.07% 3.700us 0.07% 3.700us 1.233us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 4.67% 249.891us 4.67% 249.891us 83.297us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 64.16% 3.434ms 64.16% 3.434ms 3.434ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn 2.05% 108.443us 34.64% 1.832ms 1.832ms 0.000us 0.00% 4.810ms 4.810ms 1
+ _flash_attn_9e27194::fwd 0.96% 50.812us 32.59% 1.723ms 574.364us 3.602ms 100.00% 4.810ms 1.603ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.603ms 100.04% 3.603ms 3.603ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.602ms 100.00% 3.602ms 1.201ms 3
+ Activity Buffer Request 27.53% 1.455ms 27.53% 1.455ms 1.455ms 1.209ms 33.55% 1.209ms 1.209ms 1
+ cudaDeviceGetAttribute 0.08% 4.070us 0.08% 4.070us 0.271us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.14% 7.390us 0.45% 23.900us 7.967us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.31% 16.510us 0.31% 16.510us 5.503us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.40% 21.151us 0.40% 21.151us 2.350us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.07% 3.770us 0.07% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.10% 164.023us 3.10% 164.023us 54.674us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 65.36% 3.455ms 65.36% 3.455ms 3.455ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.351ms
-Self CUDA time total: 3.588ms
+Self CPU time total: 5.287ms
+Self CUDA time total: 3.602ms
@@ -4046,41 +4046,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.08% 111.044us 35.25% 1.879ms 1.879ms 0.000us 0.00% 4.816ms 4.816ms 1
- _flash_attn_9e27194::fwd 0.99% 52.834us 33.17% 1.768ms 589.427us 3.606ms 100.00% 4.816ms 1.605ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.607ms 100.05% 3.607ms 3.607ms 1
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.606ms 100.00% 3.606ms 1.202ms 3
- Activity Buffer Request 26.56% 1.416ms 26.56% 1.416ms 1.416ms 1.210ms 33.55% 1.210ms 1.210ms 1
- cudaDeviceGetAttribute 0.08% 4.460us 0.08% 4.460us 0.297us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.14% 7.500us 0.49% 26.051us 8.684us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.35% 18.551us 0.35% 18.551us 6.184us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.41% 21.960us 0.41% 21.960us 2.440us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.08% 4.009us 0.08% 4.009us 1.336us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 4.55% 242.792us 4.55% 242.792us 80.931us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 64.75% 3.452ms 64.75% 3.452ms 3.452ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn 1.95% 105.103us 34.11% 1.836ms 1.836ms 0.000us 0.00% 4.931ms 4.931ms 1
+ _flash_attn_9e27194::fwd 1.08% 58.141us 32.16% 1.731ms 577.087us 3.693ms 100.00% 4.931ms 1.644ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.695ms 100.04% 3.695ms 3.695ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.693ms 100.00% 3.693ms 1.231ms 3
+ Activity Buffer Request 26.71% 1.438ms 26.71% 1.438ms 1.438ms 1.238ms 33.53% 1.238ms 1.238ms 1
+ cudaDeviceGetAttribute 0.08% 4.380us 0.08% 4.380us 0.292us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.15% 8.230us 0.50% 26.750us 8.917us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.34% 18.520us 0.34% 18.520us 6.173us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.48% 25.961us 0.48% 25.961us 2.885us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.08% 4.220us 0.08% 4.220us 1.407us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.23% 173.714us 3.23% 173.714us 57.905us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 65.89% 3.548ms 65.89% 3.548ms 3.548ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.332ms
-Self CUDA time total: 3.606ms
+Self CPU time total: 5.384ms
+Self CUDA time total: 3.693ms
impl wl p50(ms) ok
-hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
+hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.97 True
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
-hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True
-hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
-hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True
+hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.09 True
+hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True
+hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
-
-
-
-Installed 15 packages in 13ms
+
+Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
+Fetching 20 files: 10%|█ | 2/20 [00:01<00:13, 1.34it/s]
+Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 13.40it/s]
-
-
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
-Fetching 20 files: 5%|▌ | 1/20 [00:00<00:04, 4.26it/s]
-Fetching 20 files: 10%|█ | 2/20 [00:01<00:17, 1.03it/s]
-Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 11.64it/s]
Artifacts:
attention.jsonl
diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html
index 3b19dae40ca718ad81f3050d7a0de99c655bf943..889bda3eb9ecfa28e1bd79f67d85d1acc88d58a0 100644
--- a/flash_attn/impls/hf_kernels_flash_attn3.html
+++ b/flash_attn/impls/hf_kernels_flash_attn3.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ output
▶ uv-logs
|
-Cell: benchmark | 5.68s
+Cell: benchmark | 5.52s
|
▶ run
Copy
Raw
@@ -3925,19 +3925,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 3.89% 167.076us 44.49% 1.911ms 1.911ms 0.000us 0.00% 3.576ms 3.576ms 1
- FlashAttnFunc 3.00% 128.934us 40.60% 1.744ms 581.290us 0.000us 0.00% 3.576ms 1.192ms 3
- _flash_attn3_48fe103_dirty::fwd 1.82% 78.184us 37.60% 1.615ms 538.312us 2.688ms 100.00% 3.576ms 1.192ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.690ms 100.05% 2.690ms 2.690ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.688ms 100.00% 2.688ms 896.117us 3
- Activity Buffer Request 33.29% 1.430ms 33.29% 1.430ms 1.430ms 887.327us 33.01% 887.327us 887.327us 1
- aten::empty 1.08% 46.281us 1.08% 46.281us 7.714us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.37% 15.900us 0.37% 15.900us 5.300us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 1.04% 44.671us 1.04% 44.671us 14.890us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 55.51% 2.384ms 55.51% 2.384ms 2.384ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn3 3.72% 161.222us 44.67% 1.935ms 1.935ms 0.000us 0.00% 3.599ms 3.599ms 1
+ FlashAttnFunc 2.81% 121.834us 40.95% 1.774ms 591.218us 0.000us 0.00% 3.599ms 1.200ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.85% 79.992us 38.14% 1.652ms 550.607us 2.693ms 100.00% 3.599ms 1.200ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.695ms 100.05% 2.695ms 2.695ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.693ms 100.00% 2.693ms 897.759us 3
+ Activity Buffer Request 33.93% 1.470ms 33.93% 1.470ms 1.470ms 905.439us 33.62% 905.439us 905.439us 1
+ aten::empty 1.00% 43.311us 1.00% 43.311us 7.219us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.32% 13.891us 0.32% 13.891us 4.630us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.04% 45.121us 1.04% 45.121us 15.040us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 55.33% 2.396ms 55.33% 2.396ms 2.396ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.295ms
-Self CUDA time total: 2.688ms
+Self CPU time total: 4.331ms
+Self CUDA time total: 2.693ms
@@ -3947,19 +3947,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 3.06% 130.754us 41.10% 1.758ms 1.758ms 0.000us 0.00% 3.668ms 3.668ms 1
- FlashAttnFunc 2.23% 95.572us 38.05% 1.627ms 542.455us 0.000us 0.00% 3.668ms 1.223ms 3
- _flash_attn3_48fe103_dirty::fwd 1.23% 52.754us 35.81% 1.532ms 510.598us 2.747ms 100.00% 3.668ms 1.223ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.748ms 100.05% 2.748ms 2.748ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.747ms 100.00% 2.747ms 915.501us 3
- Activity Buffer Request 33.10% 1.416ms 33.10% 1.416ms 1.416ms 921.272us 33.54% 921.272us 921.272us 1
- aten::empty 0.63% 26.890us 0.63% 26.890us 4.482us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 4.970us 0.12% 4.970us 1.657us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.73% 31.351us 0.73% 31.351us 10.450us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 58.90% 2.519ms 58.90% 2.519ms 2.519ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn3 2.17% 96.772us 39.76% 1.770ms 1.770ms 0.000us 0.00% 3.876ms 3.876ms 1
+ FlashAttnFunc 2.04% 90.694us 37.59% 1.674ms 557.834us 0.000us 0.00% 3.876ms 1.292ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.15% 51.142us 35.55% 1.583ms 527.603us 2.896ms 100.00% 3.876ms 1.292ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.898ms 100.05% 2.898ms 2.898ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.896ms 100.00% 2.896ms 965.387us 3
+ Activity Buffer Request 33.04% 1.471ms 33.04% 1.471ms 1.471ms 979.809us 33.83% 979.809us 979.809us 1
+ aten::empty 0.58% 25.610us 0.58% 25.610us 4.268us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.240us 0.12% 5.240us 1.747us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.67% 29.750us 0.67% 29.750us 9.917us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 60.24% 2.682ms 60.24% 2.682ms 2.682ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.277ms
-Self CUDA time total: 2.747ms
+Self CPU time total: 4.452ms
+Self CUDA time total: 2.896ms
@@ -3969,19 +3969,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.33% 101.653us 39.53% 1.727ms 1.727ms 0.000us 0.00% 3.829ms 3.829ms 1
- FlashAttnFunc 2.05% 89.593us 37.20% 1.625ms 541.619us 0.000us 0.00% 3.829ms 1.276ms 3
- _flash_attn3_48fe103_dirty::fwd 1.17% 51.051us 35.15% 1.535ms 511.754us 2.856ms 100.00% 3.829ms 1.276ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.858ms 100.06% 2.858ms 2.858ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.856ms 100.00% 2.856ms 952.136us 3
- Activity Buffer Request 32.54% 1.421ms 32.54% 1.421ms 1.421ms 972.574us 34.05% 972.574us 972.574us 1
- aten::empty 0.62% 27.231us 0.62% 27.231us 4.538us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 5.411us 0.12% 5.411us 1.804us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.69% 30.341us 0.69% 30.341us 10.114us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 60.47% 2.642ms 60.47% 2.642ms 2.642ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn3 2.19% 98.331us 39.82% 1.786ms 1.786ms 0.000us 0.00% 3.885ms 3.885ms 1
+ FlashAttnFunc 1.99% 89.333us 37.63% 1.688ms 562.551us 0.000us 0.00% 3.885ms 1.295ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.08% 48.311us 35.64% 1.598ms 532.773us 2.912ms 100.00% 3.885ms 1.295ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.914ms 100.05% 2.914ms 2.914ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.912ms 100.00% 2.912ms 970.802us 3
+ Activity Buffer Request 33.18% 1.488ms 33.18% 1.488ms 1.488ms 972.637us 33.40% 972.637us 972.637us 1
+ aten::empty 0.57% 25.370us 0.57% 25.370us 4.228us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.13% 5.730us 0.13% 5.730us 1.910us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.69% 30.861us 0.69% 30.861us 10.287us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 60.18% 2.699ms 60.18% 2.699ms 2.699ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.368ms
-Self CUDA time total: 2.856ms
+Self CPU time total: 4.485ms
+Self CUDA time total: 2.912ms
@@ -3991,19 +3991,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.61% 122.474us 42.62% 2.001ms 2.001ms 0.000us 0.00% 3.906ms 3.906ms 1
- FlashAttnFunc 1.99% 93.683us 40.01% 1.879ms 626.332us 0.000us 0.00% 3.906ms 1.302ms 3
- _flash_attn3_48fe103_dirty::fwd 1.17% 54.872us 38.02% 1.785ms 595.104us 2.915ms 100.00% 3.906ms 1.302ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.917ms 100.05% 2.917ms 2.917ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.915ms 100.00% 2.915ms 971.727us 3
- Activity Buffer Request 31.11% 1.461ms 31.11% 1.461ms 1.461ms 991.129us 34.00% 991.129us 991.129us 1
- aten::empty 0.59% 27.622us 0.59% 27.622us 4.604us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 5.820us 0.12% 5.820us 1.940us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 5.03% 236.178us 5.03% 236.178us 78.726us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 57.38% 2.695ms 57.38% 2.695ms 2.695ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn3 2.51% 118.553us 41.81% 1.973ms 1.973ms 0.000us 0.00% 3.964ms 3.964ms 1
+ FlashAttnFunc 1.94% 91.662us 39.30% 1.855ms 618.205us 0.000us 0.00% 3.964ms 1.321ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.07% 50.373us 37.36% 1.763ms 587.651us 2.962ms 100.00% 3.964ms 1.321ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.964ms 100.05% 2.964ms 2.964ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.962ms 100.00% 2.962ms 987.401us 3
+ Activity Buffer Request 30.92% 1.459ms 30.92% 1.459ms 1.459ms 1.002ms 33.82% 1.002ms 1.002ms 1
+ aten::empty 0.56% 26.451us 0.56% 26.451us 4.408us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 5.270us 0.11% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 4.70% 221.845us 4.70% 221.845us 73.948us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 58.19% 2.746ms 58.19% 2.746ms 2.746ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.696ms
-Self CUDA time total: 2.915ms
+Self CPU time total: 4.719ms
+Self CUDA time total: 2.962ms
@@ -4013,19 +4013,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.45% 124.235us 37.18% 1.882ms 1.882ms 0.000us 0.00% 4.537ms 4.537ms 1
- FlashAttnFunc 1.83% 92.522us 34.73% 1.758ms 585.897us 0.000us 0.00% 4.537ms 1.512ms 3
- _flash_attn3_48fe103_dirty::fwd 1.03% 52.313us 32.90% 1.665ms 555.056us 3.398ms 100.00% 4.537ms 1.512ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.399ms 100.05% 3.399ms 3.399ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.398ms 100.00% 3.398ms 1.133ms 3
- Activity Buffer Request 27.82% 1.408ms 27.82% 1.408ms 1.408ms 1.139ms 33.52% 1.139ms 1.139ms 1
- aten::empty 0.54% 27.441us 0.54% 27.441us 4.573us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 5.839us 0.12% 5.839us 1.946us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.39% 171.646us 3.39% 171.646us 57.215us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 62.82% 3.179ms 62.82% 3.179ms 3.179ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn3 2.19% 114.453us 37.34% 1.953ms 1.953ms 0.000us 0.00% 4.662ms 4.662ms 1
+ FlashAttnFunc 1.73% 90.401us 35.15% 1.838ms 612.822us 0.000us 0.00% 4.662ms 1.554ms 3
+ _flash_attn3_48fe103_dirty::fwd 0.97% 50.643us 33.42% 1.748ms 582.688us 3.490ms 100.00% 4.662ms 1.554ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.492ms 100.04% 3.492ms 3.492ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.490ms 100.00% 3.490ms 1.163ms 3
+ Activity Buffer Request 28.44% 1.487ms 28.44% 1.487ms 1.487ms 1.171ms 33.56% 1.171ms 1.171ms 1
+ aten::empty 0.52% 27.271us 0.52% 27.271us 4.545us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.09% 4.950us 0.09% 4.950us 1.650us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.40% 178.024us 3.40% 178.024us 59.341us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 62.66% 3.277ms 62.66% 3.277ms 3.277ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.061ms
-Self CUDA time total: 3.398ms
+Self CPU time total: 5.230ms
+Self CUDA time total: 3.490ms
@@ -4035,33 +4035,33 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.74% 138.223us 36.95% 1.864ms 1.864ms 0.000us 0.00% 4.557ms 4.557ms 1
- FlashAttnFunc 1.84% 92.725us 34.21% 1.726ms 575.197us 0.000us 0.00% 4.557ms 1.519ms 3
- _flash_attn3_48fe103_dirty::fwd 1.03% 52.171us 32.37% 1.633ms 544.289us 3.424ms 100.00% 4.557ms 1.519ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.425ms 100.04% 3.425ms 3.425ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.424ms 100.00% 3.424ms 1.141ms 3
- Activity Buffer Request 27.34% 1.379ms 27.34% 1.379ms 1.379ms 1.133ms 33.10% 1.133ms 1.133ms 1
- aten::empty 0.57% 28.661us 0.57% 28.661us 4.777us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.10% 5.240us 0.10% 5.240us 1.747us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.33% 167.776us 3.33% 167.776us 55.925us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 63.05% 3.181ms 63.05% 3.181ms 3.181ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn3 2.26% 115.663us 36.27% 1.854ms 1.854ms 0.000us 0.00% 4.679ms 4.679ms 1
+ FlashAttnFunc 2.25% 114.773us 34.01% 1.738ms 579.364us 0.000us 0.00% 4.679ms 1.560ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.02% 51.933us 31.76% 1.623ms 541.107us 3.499ms 100.00% 4.679ms 1.560ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.500ms 100.04% 3.500ms 3.500ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.499ms 100.00% 3.499ms 1.166ms 3
+ Activity Buffer Request 26.80% 1.370ms 26.80% 1.370ms 1.370ms 1.181ms 33.75% 1.181ms 1.181ms 1
+ aten::empty 0.54% 27.681us 0.54% 27.681us 4.613us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.10% 5.079us 0.10% 5.079us 1.693us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.30% 168.813us 3.30% 168.813us 56.271us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 63.73% 3.257ms 63.73% 3.257ms 3.257ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.045ms
-Self CUDA time total: 3.424ms
+Self CPU time total: 5.111ms
+Self CUDA time total: 3.499ms
impl wl p50(ms) ok
-hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
-hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
-hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True
-hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
-hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
+hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True
+hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
+hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.04 True
+hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.05 True
+hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
-Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.27it/s]
-Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.55it/s]
+Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.38it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.75it/s]
Artifacts:
diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html
index 9e30082f387ab5511025b216cc2dd03e743dccac..f6ab4e24cf377304db7fbbedb7a4571918177b17 100644
--- a/flash_attn/impls/mem_efficient_attention.html
+++ b/flash_attn/impls/mem_efficient_attention.html
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ code
▼ output
- ▶ uv-logs
+ ▶ uv-logs
|
-Cell: benchmark | 32.68s
+Cell: benchmark | 3.92s
|
▶ run
Copy
Raw
@@ -3924,28 +3924,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff 4.77% 340.490us 32.91% 2.350ms 2.350ms 0.000us 0.00% 5.530ms 5.530ms 1
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.523ms 100.81% 5.523ms 5.523ms 1
- aten::scaled_dot_product_attention 0.44% 31.421us 2.67% 190.938us 63.646us 0.000us 0.00% 4.861ms 1.620ms 3
- aten::_scaled_dot_product_efficient_attention 0.35% 24.771us 2.23% 159.517us 53.172us 0.000us 0.00% 4.861ms 1.620ms 3
- aten::_efficient_attention_forward 0.51% 36.163us 1.50% 107.413us 35.804us 4.861ms 88.73% 4.861ms 1.620ms 3
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.861ms 88.73% 4.861ms 1.620ms 3
- aten::contiguous 0.17% 12.232us 24.52% 1.751ms 194.525us 0.000us 0.00% 668.128us 74.236us 9
- aten::clone 0.48% 34.579us 24.35% 1.738ms 193.165us 0.000us 0.00% 668.128us 74.236us 9
- aten::copy_ 1.16% 82.494us 22.79% 1.628ms 180.845us 617.312us 11.27% 668.128us 74.236us 9
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 617.312us 11.27% 617.312us 68.590us 9
- Activity Buffer Request 20.35% 1.453ms 20.35% 1.453ms 1.453ms 50.816us 0.93% 50.816us 50.816us 1
- aten::transpose 1.00% 71.754us 1.33% 95.065us 3.961us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.33% 23.311us 0.33% 23.311us 0.971us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.27% 19.481us 1.07% 76.301us 8.478us 0.000us 0.00% 0.000us 0.000us 9
- aten::empty 1.26% 89.759us 1.26% 89.759us 4.274us 0.000us 0.00% 0.000us 0.000us 21
- cudaLaunchKernel 1.62% 115.656us 1.62% 115.656us 9.638us 0.000us 0.00% 0.000us 0.000us 12
- cudaStreamIsCapturing 0.04% 2.980us 0.04% 2.980us 0.993us 0.000us 0.00% 0.000us 0.000us 3
- cudaFuncSetAttribute 0.16% 11.490us 0.16% 11.490us 3.830us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 67.09% 4.790ms 67.09% 4.790ms 4.790ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_mem_eff 4.77% 333.269us 32.71% 2.284ms 2.284ms 0.000us 0.00% 5.420ms 5.420ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.402ms 100.61% 5.402ms 5.402ms 1
+ aten::scaled_dot_product_attention 0.44% 30.450us 2.54% 177.435us 59.145us 0.000us 0.00% 4.753ms 1.584ms 3
+ aten::_scaled_dot_product_efficient_attention 0.33% 22.722us 2.10% 146.985us 48.995us 0.000us 0.00% 4.753ms 1.584ms 3
+ aten::_efficient_attention_forward 0.51% 35.382us 1.42% 99.273us 33.091us 4.753ms 88.51% 4.753ms 1.584ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.753ms 88.51% 4.753ms 1.584ms 3
+ aten::contiguous 0.17% 11.660us 24.51% 1.712ms 190.185us 0.000us 0.00% 667.266us 74.141us 9
+ aten::clone 0.46% 31.810us 24.34% 1.700ms 188.889us 0.000us 0.00% 667.266us 74.141us 9
+ aten::copy_ 1.01% 70.871us 22.86% 1.597ms 177.404us 616.738us 11.49% 667.266us 74.141us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 616.738us 11.49% 616.738us 68.526us 9
+ Activity Buffer Request 20.64% 1.441ms 20.64% 1.441ms 1.441ms 50.528us 0.94% 50.528us 50.528us 1
+ aten::transpose 0.91% 63.619us 1.25% 87.011us 3.625us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.33% 23.392us 0.33% 23.392us 0.975us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.24% 16.972us 1.02% 71.553us 7.950us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 1.18% 82.691us 1.18% 82.691us 3.938us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 1.55% 108.383us 1.55% 108.383us 9.032us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.05% 3.260us 0.05% 3.260us 1.087us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.12% 8.450us 0.12% 8.450us 2.817us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 67.29% 4.700ms 67.29% 4.700ms 4.700ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 7.140ms
-Self CUDA time total: 5.479ms
+Self CPU time total: 6.984ms
+Self CUDA time total: 5.369ms
@@ -3955,28 +3955,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff 3.38% 251.986us 27.98% 2.086ms 2.086ms 0.000us 0.00% 6.014ms 6.014ms 1
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.969ms 100.15% 5.969ms 5.969ms 1
- aten::scaled_dot_product_attention 0.27% 19.962us 1.97% 146.646us 48.882us 0.000us 0.00% 5.323ms 1.774ms 3
- aten::_scaled_dot_product_efficient_attention 0.26% 19.141us 1.70% 126.684us 42.228us 0.000us 0.00% 5.323ms 1.774ms 3
- aten::_efficient_attention_forward 0.39% 29.281us 1.12% 83.514us 27.838us 5.323ms 89.32% 5.323ms 1.774ms 3
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.323ms 89.32% 5.323ms 1.774ms 3
- aten::contiguous 0.10% 7.510us 22.05% 1.644ms 182.655us 0.000us 0.00% 690.909us 76.768us 9
- aten::clone 0.31% 23.251us 21.95% 1.636ms 181.821us 0.000us 0.00% 690.909us 76.768us 9
- aten::copy_ 0.91% 68.131us 20.95% 1.562ms 173.540us 636.478us 10.68% 690.909us 76.768us 9
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.478us 10.68% 636.478us 70.720us 9
- Activity Buffer Request 19.09% 1.423ms 19.09% 1.423ms 1.423ms 54.431us 0.91% 54.431us 54.431us 1
- aten::transpose 0.68% 50.542us 0.90% 67.292us 2.804us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.22% 16.750us 0.22% 16.750us 0.698us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.17% 12.371us 0.69% 51.272us 5.697us 0.000us 0.00% 0.000us 0.000us 9
- aten::empty 0.87% 64.771us 0.87% 64.771us 3.084us 0.000us 0.00% 0.000us 0.000us 21
- cudaLaunchKernel 1.25% 93.466us 1.25% 93.466us 7.789us 0.000us 0.00% 0.000us 0.000us 12
- cudaStreamIsCapturing 0.03% 2.400us 0.03% 2.400us 0.800us 0.000us 0.00% 0.000us 0.000us 3
- cudaFuncSetAttribute 0.05% 3.371us 0.05% 3.371us 1.124us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 72.02% 5.368ms 72.02% 5.368ms 5.368ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_mem_eff 3.53% 251.015us 29.52% 2.098ms 2.098ms 0.000us 0.00% 5.633ms 5.633ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.587ms 100.15% 5.587ms 5.587ms 1
+ aten::scaled_dot_product_attention 0.25% 17.630us 2.05% 145.594us 48.531us 0.000us 0.00% 4.943ms 1.648ms 3
+ aten::_scaled_dot_product_efficient_attention 0.28% 19.810us 1.80% 127.964us 42.655us 0.000us 0.00% 4.943ms 1.648ms 3
+ aten::_efficient_attention_forward 0.42% 29.862us 1.18% 83.512us 27.837us 4.943ms 88.61% 4.943ms 1.648ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.943ms 88.61% 4.943ms 1.648ms 3
+ aten::contiguous 0.10% 7.191us 23.30% 1.656ms 184.002us 0.000us 0.00% 689.540us 76.616us 9
+ aten::clone 0.33% 23.318us 23.20% 1.649ms 183.203us 0.000us 0.00% 689.540us 76.616us 9
+ aten::copy_ 0.92% 65.725us 22.12% 1.572ms 174.717us 635.140us 11.39% 689.540us 76.616us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 635.140us 11.39% 635.140us 70.571us 9
+ Activity Buffer Request 20.24% 1.439ms 20.24% 1.439ms 1.439ms 54.400us 0.98% 54.400us 54.400us 1
+ aten::transpose 0.71% 50.494us 0.99% 70.123us 2.922us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.28% 19.629us 0.28% 19.629us 0.818us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.18% 12.608us 0.75% 53.061us 5.896us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.94% 66.903us 0.94% 66.903us 3.186us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 1.25% 89.012us 1.25% 89.012us 7.418us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.220us 0.03% 2.220us 0.740us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.05% 3.880us 0.05% 3.880us 1.293us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 70.48% 5.009ms 70.48% 5.009ms 5.009ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 7.454ms
-Self CUDA time total: 5.959ms
+Self CPU time total: 7.107ms
+Self CUDA time total: 5.578ms
@@ -3986,28 +3986,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff 3.08% 235.490us 27.25% 2.083ms 2.083ms 0.000us 0.00% 6.182ms 6.182ms 1
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.132ms 100.15% 6.132ms 6.132ms 1
- aten::scaled_dot_product_attention 0.24% 18.220us 1.86% 142.046us 47.349us 0.000us 0.00% 5.466ms 1.822ms 3
- aten::_scaled_dot_product_efficient_attention 0.24% 18.131us 1.62% 123.826us 41.275us 0.000us 0.00% 5.466ms 1.822ms 3
- aten::_efficient_attention_forward 0.37% 27.940us 1.08% 82.291us 27.430us 5.466ms 89.28% 5.466ms 1.822ms 3
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.466ms 89.28% 5.466ms 1.822ms 3
- aten::contiguous 0.10% 7.272us 21.47% 1.642ms 182.409us 0.000us 0.00% 715.197us 79.466us 9
- aten::clone 0.29% 22.290us 21.38% 1.634ms 181.601us 0.000us 0.00% 715.197us 79.466us 9
- aten::copy_ 0.83% 63.251us 20.39% 1.559ms 173.182us 656.318us 10.72% 715.197us 79.466us 9
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 656.318us 10.72% 656.318us 72.924us 9
- Activity Buffer Request 18.70% 1.430ms 18.70% 1.430ms 1.430ms 58.879us 0.96% 58.879us 58.879us 1
- aten::transpose 0.93% 71.209us 1.15% 87.625us 3.651us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.21% 16.416us 0.21% 16.416us 0.684us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.15% 11.741us 0.70% 53.481us 5.942us 0.000us 0.00% 0.000us 0.000us 9
- aten::empty 0.89% 67.840us 0.89% 67.840us 3.230us 0.000us 0.00% 0.000us 0.000us 21
- cudaLaunchKernel 1.15% 88.022us 1.15% 88.022us 7.335us 0.000us 0.00% 0.000us 0.000us 12
- cudaStreamIsCapturing 0.03% 2.651us 0.03% 2.651us 0.884us 0.000us 0.00% 0.000us 0.000us 3
- cudaFuncSetAttribute 0.04% 3.370us 0.04% 3.370us 1.123us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 72.75% 5.562ms 72.75% 5.562ms 5.562ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_mem_eff 3.28% 246.598us 28.54% 2.146ms 2.146ms 0.000us 0.00% 6.014ms 6.014ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.967ms 100.18% 5.967ms 5.967ms 1
+ aten::scaled_dot_product_attention 0.24% 18.181us 1.92% 144.583us 48.194us 0.000us 0.00% 5.302ms 1.767ms 3
+ aten::_scaled_dot_product_efficient_attention 0.27% 19.980us 1.68% 126.402us 42.134us 0.000us 0.00% 5.302ms 1.767ms 3
+ aten::_efficient_attention_forward 0.38% 28.571us 1.10% 82.521us 27.507us 5.302ms 89.01% 5.302ms 1.767ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.302ms 89.01% 5.302ms 1.767ms 3
+ aten::contiguous 0.09% 6.930us 22.70% 1.707ms 189.666us 0.000us 0.00% 712.547us 79.172us 9
+ aten::clone 0.30% 22.691us 22.61% 1.700ms 188.896us 0.000us 0.00% 712.547us 79.172us 9
+ aten::copy_ 1.08% 81.024us 21.57% 1.622ms 180.228us 654.403us 10.99% 712.547us 79.172us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.403us 10.99% 654.403us 72.711us 9
+ Activity Buffer Request 19.57% 1.471ms 19.57% 1.471ms 1.471ms 58.144us 0.98% 58.144us 58.144us 1
+ aten::transpose 0.68% 51.431us 0.95% 71.351us 2.973us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.26% 19.920us 0.26% 19.920us 0.830us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.16% 11.979us 0.74% 55.320us 6.147us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.93% 69.561us 0.93% 69.561us 3.312us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 1.22% 91.652us 1.22% 91.652us 7.638us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.359us 0.03% 2.359us 0.786us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.05% 3.430us 0.05% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 71.46% 5.373ms 71.46% 5.373ms 5.373ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 7.646ms
-Self CUDA time total: 6.123ms
+Self CPU time total: 7.519ms
+Self CUDA time total: 5.956ms
@@ -4017,28 +4017,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff 2.84% 224.838us 29.78% 2.354ms 2.354ms 0.000us 0.00% 6.170ms 6.170ms 1
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.121ms 100.15% 6.121ms 6.121ms 1
- aten::scaled_dot_product_attention 0.24% 18.891us 1.82% 143.646us 47.882us 0.000us 0.00% 5.458ms 1.819ms 3
- aten::_scaled_dot_product_efficient_attention 0.24% 19.093us 1.58% 124.755us 41.585us 0.000us 0.00% 5.458ms 1.819ms 3
- aten::_efficient_attention_forward 0.36% 28.140us 1.04% 82.213us 27.404us 5.458ms 89.30% 5.458ms 1.819ms 3
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.458ms 89.30% 5.458ms 1.819ms 3
- aten::contiguous 0.10% 7.739us 24.57% 1.942ms 215.806us 0.000us 0.00% 711.998us 79.111us 9
- aten::clone 0.31% 24.450us 24.47% 1.935ms 214.946us 0.000us 0.00% 711.998us 79.111us 9
- aten::copy_ 0.86% 68.064us 23.51% 1.859ms 206.523us 653.982us 10.70% 711.998us 79.111us 9
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 653.982us 10.70% 653.982us 72.665us 9
- Activity Buffer Request 18.84% 1.489ms 18.84% 1.489ms 1.489ms 58.016us 0.95% 58.016us 58.016us 1
- aten::transpose 0.62% 49.288us 0.84% 66.489us 2.770us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.22% 17.201us 0.22% 17.201us 0.717us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.15% 12.041us 0.65% 51.362us 5.707us 0.000us 0.00% 0.000us 0.000us 9
- aten::empty 0.83% 65.351us 0.83% 65.351us 3.112us 0.000us 0.00% 0.000us 0.000us 21
- cudaLaunchKernel 4.09% 323.234us 4.09% 323.234us 26.936us 0.000us 0.00% 0.000us 0.000us 12
- cudaStreamIsCapturing 0.03% 2.670us 0.03% 2.670us 0.890us 0.000us 0.00% 0.000us 0.000us 3
- cudaFuncSetAttribute 0.04% 3.430us 0.04% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 70.22% 5.551ms 70.22% 5.551ms 5.551ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_mem_eff 3.21% 251.576us 29.97% 2.347ms 2.347ms 0.000us 0.00% 6.116ms 6.116ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.068ms 100.14% 6.068ms 6.068ms 1
+ aten::scaled_dot_product_attention 0.24% 18.800us 1.87% 146.693us 48.898us 0.000us 0.00% 5.408ms 1.803ms 3
+ aten::_scaled_dot_product_efficient_attention 0.25% 19.900us 1.63% 127.893us 42.631us 0.000us 0.00% 5.408ms 1.803ms 3
+ aten::_efficient_attention_forward 0.38% 29.372us 1.07% 83.903us 27.968us 5.408ms 89.25% 5.408ms 1.803ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.408ms 89.25% 5.408ms 1.803ms 3
+ aten::contiguous 0.10% 7.511us 24.29% 1.902ms 211.340us 0.000us 0.00% 708.735us 78.748us 9
+ aten::clone 0.28% 21.872us 24.19% 1.895ms 210.505us 0.000us 0.00% 708.735us 78.748us 9
+ aten::copy_ 0.85% 66.540us 23.20% 1.817ms 201.834us 651.551us 10.75% 708.735us 78.748us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 651.551us 10.75% 651.551us 72.395us 9
+ Activity Buffer Request 18.68% 1.462ms 18.68% 1.462ms 1.462ms 57.184us 0.94% 57.184us 57.184us 1
+ aten::transpose 0.65% 50.781us 0.90% 70.402us 2.933us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.25% 19.621us 0.25% 19.621us 0.818us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.15% 11.809us 0.72% 56.170us 6.241us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.90% 70.242us 0.90% 70.242us 3.345us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 3.97% 310.797us 3.97% 310.797us 25.900us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.250us 0.03% 2.250us 0.750us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.04% 3.220us 0.04% 3.220us 1.073us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 70.03% 5.484ms 70.03% 5.484ms 5.484ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 7.905ms
-Self CUDA time total: 6.112ms
+Self CPU time total: 7.830ms
+Self CUDA time total: 6.059ms
@@ -4048,28 +4048,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff 2.78% 220.799us 28.42% 2.258ms 2.258ms 0.000us 0.00% 6.296ms 6.296ms 1
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.245ms 100.15% 6.245ms 6.245ms 1
- aten::scaled_dot_product_attention 0.24% 19.311us 1.79% 142.116us 47.372us 0.000us 0.00% 5.574ms 1.858ms 3
- aten::_scaled_dot_product_efficient_attention 0.23% 17.909us 1.55% 122.805us 40.935us 0.000us 0.00% 5.574ms 1.858ms 3
- aten::_efficient_attention_forward 0.36% 28.682us 1.03% 82.073us 27.358us 5.574ms 89.39% 5.574ms 1.858ms 3
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.574ms 89.39% 5.574ms 1.858ms 3
- aten::contiguous 0.09% 7.009us 23.32% 1.852ms 205.811us 0.000us 0.00% 721.599us 80.178us 9
- aten::clone 0.28% 22.450us 23.23% 1.845ms 205.033us 0.000us 0.00% 721.599us 80.178us 9
- aten::copy_ 0.87% 68.713us 22.33% 1.774ms 197.096us 661.695us 10.61% 721.599us 80.178us 9
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 661.695us 10.61% 661.695us 73.522us 9
- Activity Buffer Request 17.91% 1.422ms 17.91% 1.422ms 1.422ms 59.904us 0.96% 59.904us 59.904us 1
- aten::transpose 0.61% 48.435us 0.82% 65.304us 2.721us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.21% 16.869us 0.21% 16.869us 0.703us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.14% 11.511us 0.62% 48.982us 5.442us 0.000us 0.00% 0.000us 0.000us 9
- aten::empty 0.78% 61.691us 0.78% 61.691us 2.938us 0.000us 0.00% 0.000us 0.000us 21
- cudaLaunchKernel 3.85% 305.580us 3.85% 305.580us 25.465us 0.000us 0.00% 0.000us 0.000us 12
- cudaStreamIsCapturing 0.03% 2.440us 0.03% 2.440us 0.813us 0.000us 0.00% 0.000us 0.000us 3
- cudaFuncSetAttribute 0.05% 3.920us 0.05% 3.920us 1.307us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 71.58% 5.685ms 71.58% 5.685ms 5.685ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_mem_eff 3.15% 250.575us 28.50% 2.270ms 2.270ms 0.000us 0.00% 6.322ms 6.322ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.270ms 100.14% 6.270ms 6.270ms 1
+ aten::scaled_dot_product_attention 0.22% 17.572us 1.82% 145.084us 48.361us 0.000us 0.00% 5.598ms 1.866ms 3
+ aten::_scaled_dot_product_efficient_attention 0.24% 19.250us 1.60% 127.512us 42.504us 0.000us 0.00% 5.598ms 1.866ms 3
+ aten::_efficient_attention_forward 0.36% 28.812us 1.05% 83.962us 27.987us 5.598ms 89.40% 5.598ms 1.866ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.598ms 89.40% 5.598ms 1.866ms 3
+ aten::contiguous 0.09% 6.912us 22.94% 1.827ms 203.045us 0.000us 0.00% 724.000us 80.444us 9
+ aten::clone 0.28% 21.949us 22.86% 1.820ms 202.277us 0.000us 0.00% 724.000us 80.444us 9
+ aten::copy_ 0.82% 65.091us 21.89% 1.744ms 193.745us 664.032us 10.60% 724.000us 80.444us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 664.032us 10.60% 664.032us 73.781us 9
+ Activity Buffer Request 18.02% 1.435ms 18.02% 1.435ms 1.435ms 59.968us 0.96% 59.968us 59.968us 1
+ aten::transpose 0.64% 50.930us 0.89% 70.859us 2.952us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.25% 19.929us 0.25% 19.929us 0.830us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.15% 12.022us 0.69% 54.843us 6.094us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.87% 69.430us 0.87% 69.430us 3.306us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 3.34% 266.388us 3.34% 266.388us 22.199us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.320us 0.03% 2.320us 0.773us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.04% 3.120us 0.04% 3.120us 1.040us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 71.50% 5.695ms 71.50% 5.695ms 5.695ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 7.943ms
-Self CUDA time total: 6.236ms
+Self CPU time total: 7.965ms
+Self CUDA time total: 6.262ms
@@ -4079,90 +4079,38 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff 3.27% 267.711us 29.30% 2.401ms 2.401ms 0.000us 0.00% 6.459ms 6.459ms 1
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.406ms 100.13% 6.406ms 6.406ms 1
- aten::scaled_dot_product_attention 0.24% 19.643us 1.85% 151.176us 50.392us 0.000us 0.00% 5.726ms 1.909ms 3
- aten::_scaled_dot_product_efficient_attention 0.26% 20.920us 1.61% 131.533us 43.844us 0.000us 0.00% 5.726ms 1.909ms 3
- aten::_efficient_attention_forward 0.37% 30.563us 1.03% 84.603us 28.201us 5.726ms 89.50% 5.726ms 1.909ms 3
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.726ms 89.50% 5.726ms 1.909ms 3
- aten::contiguous 0.09% 7.670us 23.58% 1.932ms 214.647us 0.000us 0.00% 733.247us 81.472us 9
- aten::clone 0.31% 25.042us 23.48% 1.924ms 213.795us 0.000us 0.00% 733.247us 81.472us 9
- aten::copy_ 0.88% 72.162us 22.52% 1.845ms 205.052us 671.711us 10.50% 733.247us 81.472us 9
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 671.711us 10.50% 671.711us 74.635us 9
- Activity Buffer Request 17.78% 1.456ms 17.78% 1.456ms 1.456ms 61.536us 0.96% 61.536us 61.536us 1
- aten::transpose 0.71% 58.110us 0.93% 75.842us 3.160us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.22% 17.732us 0.22% 17.732us 0.739us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.15% 12.319us 0.65% 53.641us 5.960us 0.000us 0.00% 0.000us 0.000us 9
- aten::empty 0.81% 66.513us 0.81% 66.513us 3.167us 0.000us 0.00% 0.000us 0.000us 21
- cudaLaunchKernel 4.14% 339.159us 4.14% 339.159us 28.263us 0.000us 0.00% 0.000us 0.000us 12
- cudaStreamIsCapturing 0.03% 2.379us 0.03% 2.379us 0.793us 0.000us 0.00% 0.000us 0.000us 3
- cudaFuncSetAttribute 0.05% 4.230us 0.05% 4.230us 1.410us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 70.70% 5.793ms 70.70% 5.793ms 5.793ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_mem_eff 3.00% 248.403us 26.98% 2.232ms 2.232ms 0.000us 0.00% 6.668ms 6.668ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.616ms 100.13% 6.616ms 6.616ms 1
+ aten::scaled_dot_product_attention 0.21% 17.221us 1.72% 142.654us 47.551us 0.000us 0.00% 5.939ms 1.980ms 3
+ aten::_scaled_dot_product_efficient_attention 0.23% 18.779us 1.52% 125.433us 41.811us 0.000us 0.00% 5.939ms 1.980ms 3
+ aten::_efficient_attention_forward 0.34% 28.440us 0.99% 81.712us 27.237us 5.939ms 89.88% 5.939ms 1.980ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.939ms 89.88% 5.939ms 1.980ms 3
+ aten::contiguous 0.08% 6.861us 21.66% 1.792ms 199.142us 0.000us 0.00% 729.440us 81.049us 9
+ aten::clone 0.26% 21.352us 21.58% 1.785ms 198.379us 0.000us 0.00% 729.440us 81.049us 9
+ aten::copy_ 0.83% 69.012us 20.65% 1.709ms 189.858us 668.928us 10.12% 729.440us 81.049us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 668.928us 10.12% 668.928us 74.325us 9
+ Activity Buffer Request 17.29% 1.430ms 17.29% 1.430ms 1.430ms 60.512us 0.92% 60.512us 60.512us 1
+ aten::transpose 0.63% 51.780us 0.89% 73.784us 3.074us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.27% 22.004us 0.27% 22.004us 0.917us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.14% 11.870us 0.67% 55.340us 6.149us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.84% 69.312us 0.84% 69.312us 3.301us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 2.79% 231.145us 2.79% 231.145us 19.262us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.280us 0.03% 2.280us 0.760us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.04% 3.570us 0.04% 3.570us 1.190us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 73.02% 6.041ms 73.02% 6.041ms 6.041ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 8.193ms
-Self CUDA time total: 6.398ms
+Self CPU time total: 8.273ms
+Self CUDA time total: 6.608ms
impl wl p50(ms) ok
-torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
-torch_mem_eff cuda_attn_L256_bfloat16 1.97 True
-torch_mem_eff cuda_attn_L320_bfloat16 2.04 True
-torch_mem_eff cuda_attn_L384_bfloat16 2.06 True
-torch_mem_eff cuda_attn_L448_bfloat16 2.03 True
+torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
+torch_mem_eff cuda_attn_L256_bfloat16 1.89 True
+torch_mem_eff cuda_attn_L320_bfloat16 2.00 True
+torch_mem_eff cuda_attn_L384_bfloat16 1.97 True
+torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
-
-
-
- Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
-Downloading networkx (1.9MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading sympy (6.0MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading numpy (16.2MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading setuptools (1.1MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading pillow (6.7MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading triton (148.3MiB)
-Downloading torch (846.9MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
- Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
- Downloading nvidia-cuda-cupti-cu12
- Downloading matplotlib
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 37 packages in 216ms
-
-
Artifacts:
attention.jsonl
diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html
index 573b17feaed54b9320d6ff8e360dfa03da8f3be9..9d07a2ce157ec6414ddbe4c27bea52ef7ed253b0 100644
--- a/flash_attn/impls/sage_attention.html
+++ b/flash_attn/impls/sage_attention.html
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ code
▼ output
- ▶ uv-logs
+ ▶ uv-logs
|
-Cell: benchmark | 4.22s
+Cell: benchmark | 4.53s
|
▶ run
Copy
Raw
@@ -3920,23 +3920,28 @@ Cell: benchmark | 4.22s
Running attention benchmark on cuda with 6 workloads.
impl wl p50(ms) ok
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
- Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+ Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
- Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+ Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
- Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+ Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
- Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+ Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
- Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+ Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
- Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+ Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
-
-Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
-Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 13.92it/s]
-Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 19.13it/s]
+
+
+
+Installed 15 packages in 14ms
+
+
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
+Fetching 11 files: 18%|█▊ | 2/11 [00:00<00:00, 15.79it/s]
+Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 13.55it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 18.83it/s]
Artifacts:
attention.jsonl
diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html
index 2dadff0b53907b1426c870df5e01dac812507a43..6363e024de1afb10cb31713f99cf844d998ebe90 100644
--- a/flash_attn/impls/xformers.html
+++ b/flash_attn/impls/xformers.html
@@ -3923,21 +3923,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 9.93% 451.937us 49.71% 2.262ms 2.262ms 0.000us 0.00% 3.695ms 3.695ms 1
- xformers_flash3::flash_fwd 4.26% 193.656us 38.96% 1.773ms 590.904us 0.000us 0.00% 3.695ms 1.232ms 3
- flash_attn_3::fwd 1.62% 73.841us 34.71% 1.579ms 526.352us 2.795ms 100.00% 3.695ms 1.232ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.797ms 100.05% 2.797ms 2.797ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.00% 2.795ms 931.773us 3
- Activity Buffer Request 31.17% 1.418ms 31.17% 1.418ms 1.418ms 899.421us 32.18% 899.421us 899.421us 1
- aten::empty 0.76% 34.741us 0.76% 34.741us 5.790us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.30% 13.732us 0.30% 13.732us 4.577us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.85% 38.662us 0.85% 38.662us 12.887us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.35% 15.860us 0.82% 37.181us 6.197us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.47% 21.321us 0.47% 21.321us 3.553us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 50.29% 2.288ms 50.29% 2.288ms 2.288ms 0.000us 0.00% 0.000us 0.000us 1
+ xformers_meff 10.85% 481.112us 51.55% 2.285ms 2.285ms 0.000us 0.00% 3.582ms 3.582ms 1
+ xformers_flash3::flash_fwd 4.56% 202.185us 39.85% 1.766ms 588.715us 0.000us 0.00% 3.582ms 1.194ms 3
+ flash_attn_3::fwd 1.68% 74.662us 35.29% 1.564ms 521.320us 2.681ms 100.00% 3.582ms 1.194ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.682ms 100.06% 2.682ms 2.682ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.681ms 100.00% 2.681ms 893.515us 3
+ Activity Buffer Request 31.74% 1.407ms 31.74% 1.407ms 1.407ms 901.761us 33.64% 901.761us 901.761us 1
+ aten::empty 0.77% 33.920us 0.77% 33.920us 5.653us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.23% 10.152us 0.23% 10.152us 3.384us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.87% 38.521us 0.87% 38.521us 12.840us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.29% 13.028us 0.85% 37.710us 6.285us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.56% 24.682us 0.56% 24.682us 4.114us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 48.45% 2.147ms 48.45% 2.147ms 2.147ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.550ms
-Self CUDA time total: 2.795ms
+Self CPU time total: 4.432ms
+Self CUDA time total: 2.681ms
@@ -3947,21 +3947,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 6.95% 312.321us 44.96% 2.021ms 2.021ms 0.000us 0.00% 3.832ms 3.832ms 1
- xformers_flash3::flash_fwd 3.14% 141.315us 37.51% 1.686ms 561.970us 0.000us 0.00% 3.832ms 1.277ms 3
- flash_attn_3::fwd 1.18% 53.030us 34.37% 1.545ms 514.865us 2.890ms 100.00% 3.832ms 1.277ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.05% 2.892ms 2.892ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.00% 2.890ms 963.329us 3
- Activity Buffer Request 31.64% 1.422ms 31.64% 1.422ms 1.422ms 942.465us 32.61% 942.465us 942.465us 1
- aten::empty 0.68% 30.660us 0.68% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 5.592us 0.12% 5.592us 1.864us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.74% 33.432us 0.74% 33.432us 11.144us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.20% 8.951us 0.50% 22.691us 3.782us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.31% 13.740us 0.31% 13.740us 2.290us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 55.04% 2.474ms 55.04% 2.474ms 2.474ms 0.000us 0.00% 0.000us 0.000us 1
+ xformers_meff 7.16% 317.438us 45.96% 2.036ms 2.036ms 0.000us 0.00% 3.779ms 3.779ms 1
+ xformers_flash3::flash_fwd 3.35% 148.243us 38.25% 1.695ms 564.991us 0.000us 0.00% 3.779ms 1.260ms 3
+ flash_attn_3::fwd 1.25% 55.403us 34.91% 1.547ms 515.576us 2.825ms 100.00% 3.779ms 1.260ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.827ms 100.05% 2.827ms 2.827ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.825ms 100.00% 2.825ms 941.739us 3
+ Activity Buffer Request 32.14% 1.424ms 32.14% 1.424ms 1.424ms 954.080us 33.77% 954.080us 954.080us 1
+ aten::empty 0.63% 27.720us 0.63% 27.720us 4.620us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.400us 0.12% 5.400us 1.800us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.77% 34.161us 0.77% 34.161us 11.387us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.21% 9.370us 0.54% 23.750us 3.958us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.32% 14.380us 0.32% 14.380us 2.397us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 54.04% 2.395ms 54.04% 2.395ms 2.395ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.495ms
-Self CUDA time total: 2.890ms
+Self CPU time total: 4.431ms
+Self CUDA time total: 2.825ms
@@ -3971,21 +3971,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 6.65% 298.008us 44.73% 2.006ms 2.006ms 0.000us 0.00% 3.867ms 3.867ms 1
- xformers_flash3::flash_fwd 3.15% 141.235us 37.58% 1.685ms 561.690us 0.000us 0.00% 3.867ms 1.289ms 3
- flash_attn_3::fwd 1.18% 53.120us 34.43% 1.544ms 514.611us 2.888ms 100.00% 3.867ms 1.289ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.06% 2.890ms 2.890ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.888ms 100.00% 2.888ms 962.683us 3
- Activity Buffer Request 31.72% 1.422ms 31.72% 1.422ms 1.422ms 978.939us 33.90% 978.939us 978.939us 1
- aten::empty 0.67% 30.192us 0.67% 30.192us 5.032us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 5.491us 0.12% 5.491us 1.830us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.73% 32.901us 0.73% 32.901us 10.967us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.20% 8.773us 0.50% 22.603us 3.767us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.31% 13.830us 0.31% 13.830us 2.305us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 55.27% 2.478ms 55.27% 2.478ms 2.478ms 0.000us 0.00% 0.000us 0.000us 1
+ xformers_meff 6.87% 310.027us 44.72% 2.018ms 2.018ms 0.000us 0.00% 3.923ms 3.923ms 1
+ xformers_flash3::flash_fwd 3.22% 145.444us 37.33% 1.684ms 561.324us 0.000us 0.00% 3.923ms 1.308ms 3
+ flash_attn_3::fwd 1.15% 52.002us 34.10% 1.539ms 512.843us 2.919ms 100.00% 3.923ms 1.308ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.921ms 100.06% 2.921ms 2.921ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.919ms 100.00% 2.919ms 973.037us 3
+ Activity Buffer Request 31.44% 1.418ms 31.44% 1.418ms 1.418ms 1.004ms 34.40% 1.004ms 1.004ms 1
+ aten::empty 0.63% 28.392us 0.63% 28.392us 4.732us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.520us 0.12% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.76% 34.420us 0.76% 34.420us 11.473us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.21% 9.519us 0.52% 23.650us 3.942us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.31% 14.131us 0.31% 14.131us 2.355us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 55.28% 2.494ms 55.28% 2.494ms 2.494ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.484ms
-Self CUDA time total: 2.888ms
+Self CPU time total: 4.511ms
+Self CUDA time total: 2.919ms
@@ -3995,21 +3995,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 6.31% 299.042us 46.56% 2.205ms 2.205ms 0.000us 0.00% 3.936ms 3.936ms 1
- xformers_flash3::flash_fwd 2.97% 140.784us 39.75% 1.883ms 627.609us 0.000us 0.00% 3.936ms 1.312ms 3
- flash_attn_3::fwd 1.10% 52.191us 36.78% 1.742ms 580.681us 2.941ms 100.00% 3.936ms 1.312ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.943ms 100.05% 2.943ms 2.943ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.941ms 100.00% 2.941ms 980.445us 3
- Activity Buffer Request 30.11% 1.426ms 30.11% 1.426ms 1.426ms 994.973us 33.83% 994.973us 994.973us 1
- aten::empty 0.64% 30.333us 0.64% 30.333us 5.055us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.11% 5.440us 0.11% 5.440us 1.813us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 4.81% 227.898us 4.81% 227.898us 75.966us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.19% 8.769us 0.49% 23.220us 3.870us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.31% 14.451us 0.31% 14.451us 2.409us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 53.44% 2.531ms 53.44% 2.531ms 2.531ms 0.000us 0.00% 0.000us 0.000us 1
+ xformers_meff 6.73% 317.798us 47.46% 2.241ms 2.241ms 0.000us 0.00% 3.892ms 3.892ms 1
+ xformers_flash3::flash_fwd 3.10% 146.544us 40.23% 1.900ms 633.169us 0.000us 0.00% 3.892ms 1.297ms 3
+ flash_attn_3::fwd 1.15% 54.462us 37.13% 1.753ms 584.321us 2.910ms 100.00% 3.892ms 1.297ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.911ms 100.05% 2.911ms 2.911ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.910ms 100.00% 2.910ms 969.848us 3
+ Activity Buffer Request 30.01% 1.417ms 30.01% 1.417ms 1.417ms 982.915us 33.78% 982.915us 982.915us 1
+ aten::empty 0.62% 29.170us 0.62% 29.170us 4.862us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 5.370us 0.11% 5.370us 1.790us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 5.23% 247.156us 5.23% 247.156us 82.385us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.20% 9.560us 0.50% 23.460us 3.910us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.29% 13.900us 0.29% 13.900us 2.317us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 52.54% 2.481ms 52.54% 2.481ms 2.481ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.736ms
-Self CUDA time total: 2.941ms
+Self CPU time total: 4.721ms
+Self CUDA time total: 2.910ms
@@ -4019,21 +4019,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 5.82% 299.962us 41.73% 2.152ms 2.152ms 0.000us 0.00% 4.566ms 4.566ms 1
- xformers_flash3::flash_fwd 2.76% 142.114us 35.47% 1.829ms 609.751us 0.000us 0.00% 4.566ms 1.522ms 3
- flash_attn_3::fwd 1.04% 53.631us 32.71% 1.687ms 562.380us 3.419ms 100.00% 4.566ms 1.522ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.420ms 100.05% 3.420ms 3.420ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.419ms 100.00% 3.419ms 1.140ms 3
- Activity Buffer Request 27.56% 1.422ms 27.56% 1.422ms 1.422ms 1.148ms 33.58% 1.148ms 1.148ms 1
- aten::empty 0.60% 31.172us 0.60% 31.172us 5.195us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.11% 5.431us 0.11% 5.431us 1.810us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.40% 175.366us 3.40% 175.366us 58.455us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.17% 8.849us 0.45% 23.030us 3.838us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.27% 14.181us 0.27% 14.181us 2.363us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 58.27% 3.005ms 58.27% 3.005ms 3.005ms 0.000us 0.00% 0.000us 0.000us 1
+ xformers_meff 5.86% 306.369us 41.94% 2.193ms 2.193ms 0.000us 0.00% 4.614ms 4.614ms 1
+ xformers_flash3::flash_fwd 2.85% 149.202us 35.63% 1.863ms 620.885us 0.000us 0.00% 4.614ms 1.538ms 3
+ flash_attn_3::fwd 1.03% 53.951us 32.77% 1.713ms 571.151us 3.461ms 100.00% 4.614ms 1.538ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.462ms 100.04% 3.462ms 3.462ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.461ms 100.00% 3.461ms 1.154ms 3
+ Activity Buffer Request 27.28% 1.426ms 27.28% 1.426ms 1.426ms 1.153ms 33.31% 1.153ms 1.153ms 1
+ aten::empty 0.55% 28.813us 0.55% 28.813us 4.802us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 5.560us 0.11% 5.560us 1.853us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.80% 198.684us 3.80% 198.684us 66.228us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.18% 9.430us 0.46% 23.930us 3.988us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.28% 14.500us 0.28% 14.500us 2.417us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 58.06% 3.036ms 58.06% 3.036ms 3.036ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.157ms
-Self CUDA time total: 3.419ms
+Self CPU time total: 5.228ms
+Self CUDA time total: 3.461ms
@@ -4043,37 +4043,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 5.76% 295.800us 41.67% 2.139ms 2.139ms 0.000us 0.00% 4.557ms 4.557ms 1
- xformers_flash3::flash_fwd 2.75% 141.044us 35.47% 1.821ms 606.924us 0.000us 0.00% 4.557ms 1.519ms 3
- flash_attn_3::fwd 1.04% 53.523us 32.72% 1.680ms 559.910us 3.405ms 100.00% 4.557ms 1.519ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.407ms 100.05% 3.407ms 3.407ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.405ms 100.00% 3.405ms 1.135ms 3
- Activity Buffer Request 27.67% 1.420ms 27.67% 1.420ms 1.420ms 1.152ms 33.82% 1.152ms 1.152ms 1
- aten::empty 0.60% 30.610us 0.60% 30.610us 5.102us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 6.310us 0.12% 6.310us 2.103us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.29% 168.946us 3.29% 168.946us 56.315us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.17% 8.721us 0.44% 22.392us 3.732us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.27% 13.671us 0.27% 13.671us 2.279us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 58.33% 2.994ms 58.33% 2.994ms 2.994ms 0.000us 0.00% 0.000us 0.000us 1
+ xformers_meff 5.96% 310.158us 41.66% 2.167ms 2.167ms 0.000us 0.00% 4.643ms 4.643ms 1
+ xformers_flash3::flash_fwd 2.83% 146.954us 35.22% 1.832ms 610.728us 0.000us 0.00% 4.643ms 1.548ms 3
+ flash_attn_3::fwd 1.00% 51.911us 32.40% 1.685ms 561.744us 3.464ms 100.00% 4.643ms 1.548ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.465ms 100.04% 3.465ms 3.465ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.464ms 100.00% 3.464ms 1.155ms 3
+ Activity Buffer Request 27.49% 1.430ms 27.49% 1.430ms 1.430ms 1.179ms 34.05% 1.179ms 1.179ms 1
+ aten::empty 0.54% 28.311us 0.54% 28.311us 4.719us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 5.750us 0.11% 5.750us 1.917us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.25% 169.084us 3.25% 169.084us 56.361us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.17% 8.670us 0.48% 24.720us 4.120us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.31% 16.050us 0.31% 16.050us 2.675us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 58.34% 3.035ms 58.34% 3.035ms 3.035ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.133ms
-Self CUDA time total: 3.405ms
+Self CPU time total: 5.202ms
+Self CUDA time total: 3.464ms
impl wl p50(ms) ok
-xformers_meff cuda_attn_L128_bfloat16 0.98 True
+xformers_meff cuda_attn_L128_bfloat16 1.00 True
xformers_meff cuda_attn_L256_bfloat16 1.03 True
xformers_meff cuda_attn_L320_bfloat16 1.08 True
-xformers_meff cuda_attn_L384_bfloat16 1.10 True
-xformers_meff cuda_attn_L448_bfloat16 1.23 True
-xformers_meff cuda_attn_L512_bfloat16 1.22 True
+xformers_meff cuda_attn_L384_bfloat16 1.09 True
+xformers_meff cuda_attn_L448_bfloat16 1.25 True
+xformers_meff cuda_attn_L512_bfloat16 1.24 True
Downloading xformers (111.8MiB)
Downloading xformers
-Installed 1 package in 14ms
+Installed 1 package in 13ms
diff --git a/flash_attn/results/artifacts/combine/latency.svg b/flash_attn/results/artifacts/combine/latency.svg
index 689e04d1be57a1e800f341bc84fe4bfaf1387666..19f0903d77a8fb32c0a3ed03553c82706371801e 100644
--- a/flash_attn/results/artifacts/combine/latency.svg
+++ b/flash_attn/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:168c229932ad06a68508a4a77b66485ff9bcf48ed736a5ffdd003f5cb9e8e639
-size 24777
+oid sha256:0a7d7b3dc8fc6b60a4b9f8bfcf3e229706548b71a8174822b89cc9a2746d3bbd
+size 24787
diff --git a/flash_attn/results/combined_results.html b/flash_attn/results/combined_results.html
index 7a5f09ca394f53e1d971ad7b608a69d09750ab95..3a2204532e0ec8ef3588194f5c38935fb60f8208 100644
--- a/flash_attn/results/combined_results.html
+++ b/flash_attn/results/combined_results.html
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
- 2025-10-28T14:09:17.505622
+ 2025-10-29T14:28:03.109695
image/svg+xml
@@ -3982,96 +3982,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
+
-
+
- 1.0
+ 1.0
-
+
-
+
- 1.2
+ 1.2
-
+
-
+
- 1.4
+ 1.4
-
+
-
+
- 1.6
+ 1.6
-
+
-
+
- 1.8
+ 1.8
-
+
-
+
- 2.0
+ 2.0
-
+
-
+
- 2.2
+ 2.2
@@ -4079,73 +4079,73 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
+
-
-
-
-
-
-
+
+
+
+
+
+
-
+
-
-
-
-
-
+
+
+
+
+
-
+
-
-
-
-
-
-
+
+
+
+
+
+
-
+
-
-
-
-
-
-
+
+
+
+
+
+
-
+
-
-
-
-
-
+
+
+
+
+
@@ -4337,48 +4337,48 @@ Summary: 6 found, 0 skipped, 0 missing
COMBINED BENCHMARK SUMMARY
impl wl p50(ms) ok
-hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
+hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.97 True
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
-hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True
-hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
-hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True
-hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
-hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
-hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True
-hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
-hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
+hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.09 True
+hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True
+hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
+hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.94 True
+hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
+hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.04 True
+hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.05 True
+hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
- Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+ Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
- Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+ Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
- Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+ Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
- Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+ Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
- Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+ Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
- Error: module 'sage_attention_fd11035eb4318b27' has no attribute 'fwd'
+ Error: module 'sage_attention_5c963cbdaf16559b' has no attribute 'fwd'
torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
-torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
-torch_flash_ma cuda_attn_L320_bfloat16 1.28 True
-torch_flash_ma cuda_attn_L384_bfloat16 1.31 True
+torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
+torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
+torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
-torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
-torch_mem_eff cuda_attn_L256_bfloat16 1.97 True
-torch_mem_eff cuda_attn_L320_bfloat16 2.04 True
-torch_mem_eff cuda_attn_L384_bfloat16 2.06 True
-torch_mem_eff cuda_attn_L448_bfloat16 2.03 True
+torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
+torch_mem_eff cuda_attn_L256_bfloat16 1.89 True
+torch_mem_eff cuda_attn_L320_bfloat16 2.00 True
+torch_mem_eff cuda_attn_L384_bfloat16 1.97 True
+torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
-xformers_meff cuda_attn_L128_bfloat16 0.98 True
+xformers_meff cuda_attn_L128_bfloat16 1.00 True
xformers_meff cuda_attn_L256_bfloat16 1.03 True
xformers_meff cuda_attn_L320_bfloat16 1.08 True
-xformers_meff cuda_attn_L384_bfloat16 1.10 True
-xformers_meff cuda_attn_L448_bfloat16 1.23 True
-xformers_meff cuda_attn_L512_bfloat16 1.22 True
+xformers_meff cuda_attn_L384_bfloat16 1.09 True
+xformers_meff cuda_attn_L448_bfloat16 1.25 True
+xformers_meff cuda_attn_L512_bfloat16 1.24 True
GENERATING COMBINED VISUALIZATION
@@ -4402,7 +4402,7 @@ Implementations included:
-Installed 37 packages in 187ms
+Installed 37 packages in 208ms
@@ -4415,7 +4415,7 @@ Installed 37 packages in 187ms
- 2025-10-28T14:09:17.505622
+ 2025-10-29T14:28:03.109695
image/svg+xml
@@ -4525,96 +4525,96 @@ Installed 37 packages in 187ms
-
+
-
+
- 1.0
+ 1.0
-
+
-
+
- 1.2
+ 1.2
-
+
-
+
- 1.4
+ 1.4
-
+
-
+
- 1.6
+ 1.6
-
+
-
+
- 1.8
+ 1.8
-
+
-
+
- 2.0
+ 2.0
-
+
-
+
- 2.2
+ 2.2
@@ -4622,73 +4622,73 @@ Installed 37 packages in 187ms
-
+
-
-
-
-
-
-
+
+
+
+
+
+
-
+
-
-
-
-
-
+
+
+
+
+
-
+
-
-
-
-
-
-
+
+
+
+
+
+
-
+
-
-
-
-
-
-
+
+
+
+
+
+
-
+
-
-
-
-
-
+
+
+
+
+
diff --git a/index.html b/index.html
index 33ea1b019a71f451c81dbc10c5e67f8c6ca9b465..1061b4b3222caa3480fdd412bcf6f18bb97b54f9 100644
--- a/index.html
+++ b/index.html
@@ -1,89 +1,4029 @@
-
+
-
-
- Index of /
-
+
+
+ index
+
+
+
+
+
- Index of /
-
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
All Benchmarks Aggregated Report
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Implementation
+Description
+
+
+
+
+HF Kernels SwiGLU
+HuggingFace kernels SwiGLU implementation
+
+
+PyTorch SwiGLU
+PyTorch native SwiGLU implementation
+
+
+
+
+
+
+
+
+
+
+
+
+Implementation
+Description
+
+
+
+
+HF Kernels ReLU
+HuggingFace kernels ReLU implementation
+
+
+PyTorch ReLU
+PyTorch native ReLU implementation
+
+
+
+
+
\ No newline at end of file
diff --git a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
index fcd809d60a69166f4be7343612f4f810d256a506..611975ecd9585a8b6f1198e5f9cf417087baa85d 100644
--- a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
+++ b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
@@ -1,4 +1,4 @@
-{"ts": "2025-10-28T14:08:59Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8268990000033227, "p50": 0.8360890000176369, "p90": 0.8378790000733716, "mean": 0.8358750000070359, "iqr": 0.002010000116570154, "raw_times": [0.8426389999840467, 0.8268990000033227, 0.8378790000733716, 0.8360890000176369, 0.8358689999568014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8452999999235544, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
-{"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6477070000746608, "p50": 1.6516379999984565, "p90": 1.6565669999408783, "mean": 1.6533151999965412, "iqr": 0.006360999918797461, "raw_times": [1.6565669999408783, 1.6516379999984565, 1.6477070000746608, 1.6604579999466296, 1.6502060000220808], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6544470000781075, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
-{"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6441269999631913, "p50": 1.6532669999378413, "p90": 1.6534970000066096, "mean": 1.6500411999913922, "iqr": 0.009149999982582813, "raw_times": [1.6441269999631913, 1.6534970000066096, 1.6532669999378413, 1.6443470000240268, 1.654968000025292], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6665570000213847, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
-{"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.251962999911484, "p50": 3.270412999995642, "p90": 3.2735430000911947, "mean": 3.2660931999998866, "iqr": 0.01632000009976764, "raw_times": [3.2735430000911947, 3.251962999911484, 3.257222999991427, 3.277324000009685, 3.270412999995642], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2640430000583365, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8322699999894212, "p50": 0.8364899999833142, "p90": 0.8382409999967422, "mean": 0.8359703999872181, "iqr": 0.0036810000096920703, "raw_times": [0.8322699999894212, 0.8382909999795629, 0.8345599999870501, 0.8382409999967422, 0.8364899999833142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8381600000006983, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6435499999829517, "p50": 1.6499199999771008, "p90": 1.6516499999852385, "mean": 1.650240399987979, "iqr": 0.0024989999474200886, "raw_times": [1.6516499999852385, 1.6435499999829517, 1.6499199999771008, 1.6491510000378184, 1.656930999956785], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.644769999984419, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6425610000396773, "p50": 1.6517310000381258, "p90": 1.654420999955164, "mean": 1.6505027999983213, "iqr": 0.006990999963818467, "raw_times": [1.6474299999913455, 1.6517310000381258, 1.654420999955164, 1.6563709999672938, 1.6425610000396773], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6471609999939574, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-29T14:26:39Z", "run": "a26c482e532b445fa4192ef14ec4850f", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.237169999977141, "p50": 3.2576509999557857, "p90": 3.264301000001524, "mean": 3.257706599993071, "iqr": 0.008230999981151399, "raw_times": [3.2576509999557857, 3.264301000001524, 3.2733410000105323, 3.2560700000203724, 3.237169999977141], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2725309999932506, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
diff --git a/layer_norm/impls/hf_kernels_layer_norm.html b/layer_norm/impls/hf_kernels_layer_norm.html
index cfec5b11856445875be968b8022bf0064c0ca56f..9e9cf8da940eb80e201b94351f6e97b42048c103 100644
--- a/layer_norm/impls/hf_kernels_layer_norm.html
+++ b/layer_norm/impls/hf_kernels_layer_norm.html
@@ -3862,8 +3862,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
on_github: huggingface/kernels-uvnotes
-
HF Kernels LayerNorm Implementation
+
HF Kernels LayerNorm Implementation
Based on kernels-community layer-norm kernel.
LayerNorm Benchmark (HF Kernels)
@@ -3873,10 +3872,11 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ output
▶ uv-logs
|
-Cell: benchmark | 7.03s
+Cell: benchmark | 6.34s
|
▶ run
Copy
Raw
+
GitHub
@@ -3943,19 +3943,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_layer_norm 4.56% 180.575us 46.01% 1.822ms 1.822ms 0.000us 0.00% 3.098ms 3.098ms 1
- _layer_norm_f8ec252::dropout_add_ln_fwd 1.70% 67.272us 40.91% 1.619ms 539.829us 2.362ms 100.00% 3.098ms 1.033ms 3
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.363ms 100.06% 2.363ms 2.363ms 1
-void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.00% 2.362ms 787.305us 3
- Activity Buffer Request 36.75% 1.455ms 36.75% 1.455ms 1.455ms 736.127us 31.17% 736.127us 736.127us 1
- aten::view 0.54% 21.512us 0.54% 21.512us 3.585us 0.000us 0.00% 0.000us 0.000us 6
- aten::empty 1.17% 46.231us 1.17% 46.231us 5.137us 0.000us 0.00% 0.000us 0.000us 9
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.23% 9.070us 0.23% 9.070us 3.023us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 1.06% 41.913us 1.06% 41.913us 13.971us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 53.99% 2.137ms 53.99% 2.137ms 2.137ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_layer_norm 5.26% 209.855us 46.73% 1.864ms 1.864ms 0.000us 0.00% 3.097ms 3.097ms 1
+ _layer_norm_f8ec252::dropout_add_ln_fwd 1.78% 70.832us 40.86% 1.630ms 543.337us 2.360ms 100.00% 3.097ms 1.032ms 3
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.06% 2.362ms 2.362ms 1
+void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.360ms 100.00% 2.360ms 786.699us 3
+ Activity Buffer Request 36.61% 1.461ms 36.61% 1.461ms 1.461ms 736.736us 31.22% 736.736us 736.736us 1
+ aten::view 0.61% 24.271us 0.61% 24.271us 4.045us 0.000us 0.00% 0.000us 0.000us 6
+ aten::empty 1.19% 47.642us 1.19% 47.642us 5.294us 0.000us 0.00% 0.000us 0.000us 9
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.27% 10.789us 0.27% 10.789us 3.596us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.01% 40.102us 1.01% 40.102us 13.367us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 53.27% 2.125ms 53.27% 2.125ms 2.125ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 3.959ms
-Self CUDA time total: 2.362ms
+Self CPU time total: 3.989ms
+Self CUDA time total: 2.360ms
@@ -3965,19 +3965,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_layer_norm 2.19% 144.024us 30.18% 1.989ms 1.989ms 0.000us 0.00% 6.322ms 6.322ms 1
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.69% 45.641us 27.80% 1.832ms 610.764us 4.774ms 100.00% 6.322ms 2.107ms 3
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.776ms 100.03% 4.776ms 4.776ms 1
-void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.774ms 100.00% 4.774ms 1.591ms 3
- Activity Buffer Request 26.09% 1.720ms 26.09% 1.720ms 1.720ms 1.548ms 32.42% 1.548ms 1.548ms 1
- aten::view 0.20% 12.871us 0.20% 12.871us 2.145us 0.000us 0.00% 0.000us 0.000us 6
- aten::empty 0.50% 32.981us 0.50% 32.981us 3.665us 0.000us 0.00% 0.000us 0.000us 9
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.881us 0.07% 4.881us 1.627us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.44% 29.151us 0.44% 29.151us 9.717us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 69.82% 4.602ms 69.82% 4.602ms 4.602ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_layer_norm 2.24% 143.733us 27.27% 1.751ms 1.751ms 0.000us 0.00% 6.440ms 6.440ms 1
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.75% 48.181us 24.84% 1.595ms 531.669us 4.846ms 100.00% 6.440ms 2.147ms 3
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.848ms 100.03% 4.848ms 4.848ms 1
+void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.846ms 100.00% 4.846ms 1.615ms 3
+ Activity Buffer Request 23.08% 1.482ms 23.08% 1.482ms 1.482ms 1.594ms 32.88% 1.594ms 1.594ms 1
+ aten::view 0.20% 12.572us 0.20% 12.572us 2.095us 0.000us 0.00% 0.000us 0.000us 6
+ aten::empty 0.46% 29.840us 0.46% 29.840us 3.316us 0.000us 0.00% 0.000us 0.000us 9
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.420us 0.08% 5.420us 1.807us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.46% 29.490us 0.46% 29.490us 9.830us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 72.73% 4.670ms 72.73% 4.670ms 4.670ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 6.591ms
-Self CUDA time total: 4.774ms
+Self CPU time total: 6.421ms
+Self CUDA time total: 4.846ms
@@ -3987,19 +3987,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_layer_norm 1.89% 121.823us 28.69% 1.852ms 1.852ms 0.000us 0.00% 6.323ms 6.323ms 1
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.69% 44.435us 26.61% 1.718ms 572.663us 4.766ms 100.00% 6.323ms 2.108ms 3
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.767ms 100.03% 4.767ms 4.767ms 1
-void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.766ms 100.00% 4.766ms 1.589ms 3
- Activity Buffer Request 24.91% 1.608ms 24.91% 1.608ms 1.608ms 1.557ms 32.67% 1.557ms 1.557ms 1
- aten::view 0.19% 12.441us 0.19% 12.441us 2.074us 0.000us 0.00% 0.000us 0.000us 6
- aten::empty 0.50% 32.030us 0.50% 32.030us 3.559us 0.000us 0.00% 0.000us 0.000us 9
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.850us 0.08% 4.850us 1.617us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.44% 28.190us 0.44% 28.190us 9.397us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 71.31% 4.604ms 71.31% 4.604ms 4.604ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_layer_norm 1.96% 126.465us 27.43% 1.766ms 1.766ms 0.000us 0.00% 6.435ms 6.435ms 1
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.73% 46.779us 25.26% 1.627ms 542.360us 4.838ms 100.00% 6.435ms 2.145ms 3
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.839ms 100.03% 4.839ms 4.839ms 1
+void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.838ms 100.00% 4.838ms 1.613ms 3
+ Activity Buffer Request 23.54% 1.516ms 23.54% 1.516ms 1.516ms 1.597ms 33.01% 1.597ms 1.597ms 1
+ aten::view 0.20% 12.929us 0.20% 12.929us 2.155us 0.000us 0.00% 0.000us 0.000us 6
+ aten::empty 0.46% 29.911us 0.46% 29.911us 3.323us 0.000us 0.00% 0.000us 0.000us 9
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.300us 0.08% 5.300us 1.767us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.45% 29.003us 0.45% 29.003us 9.668us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 72.57% 4.674ms 72.57% 4.674ms 4.674ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 6.457ms
-Self CUDA time total: 4.766ms
+Self CPU time total: 6.440ms
+Self CUDA time total: 4.838ms
@@ -4009,37 +4009,38 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_layer_norm 1.32% 150.697us 17.31% 1.975ms 1.975ms 0.000us 0.00% 12.822ms 12.822ms 1
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.42% 47.993us 15.87% 1.810ms 603.497us 9.629ms 100.00% 12.822ms 4.274ms 3
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.631ms 100.01% 9.631ms 9.631ms 1
-void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.629ms 100.00% 9.629ms 3.210ms 3
- Activity Buffer Request 12.56% 1.433ms 12.56% 1.433ms 1.433ms 3.193ms 33.16% 3.193ms 3.193ms 1
- aten::view 0.12% 13.330us 0.12% 13.330us 2.222us 0.000us 0.00% 0.000us 0.000us 6
- aten::empty 0.28% 32.431us 0.28% 32.431us 3.603us 0.000us 0.00% 0.000us 0.000us 9
-cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.260us 0.05% 5.260us 1.753us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 2.56% 291.579us 2.56% 291.579us 97.193us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 82.69% 9.436ms 82.69% 9.436ms 9.436ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_layer_norm 1.17% 134.085us 17.09% 1.957ms 1.957ms 0.000us 0.00% 12.886ms 12.886ms 1
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.41% 46.869us 15.80% 1.809ms 603.015us 9.665ms 100.00% 12.886ms 4.295ms 3
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.667ms 100.01% 9.667ms 9.667ms 1
+void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.665ms 100.00% 9.665ms 3.222ms 3
+ Activity Buffer Request 12.76% 1.462ms 12.76% 1.462ms 1.462ms 3.220ms 33.32% 3.220ms 3.220ms 1
+ aten::view 0.12% 13.968us 0.12% 13.968us 2.328us 0.000us 0.00% 0.000us 0.000us 6
+ aten::empty 0.26% 30.043us 0.26% 30.043us 3.338us 0.000us 0.00% 0.000us 0.000us 9
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.590us 0.05% 5.590us 1.863us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 2.31% 264.797us 2.31% 264.797us 88.266us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 82.91% 9.495ms 82.91% 9.495ms 9.495ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 11.410ms
-Self CUDA time total: 9.629ms
+Self CPU time total: 11.452ms
+Self CUDA time total: 9.665ms
impl wl p50(ms) ok
hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
-hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True
+hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
+Downloading hf-xet (3.2MiB)
+ Downloading hf-xet
Installed 15 packages in 13ms
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
-Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 8.47it/s]
-Fetching 4 files: 50%|█████ | 2/4 [00:02<00:02, 1.44s/it]
-Fetching 4 files: 100%|██████████| 4/4 [00:02<00:00, 1.61it/s]
+Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.22it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.44it/s]
Artifacts:
layer_norm.jsonl
diff --git a/layer_norm/impls/torch_layer_norm.html b/layer_norm/impls/torch_layer_norm.html
index 72ce43dc70edcb0cbcced09b58a31530fadba3d8..f5dd45a5ed15040ec9f80c48eca459fb67a1bc56 100644
--- a/layer_norm/impls/torch_layer_norm.html
+++ b/layer_norm/impls/torch_layer_norm.html
@@ -3862,8 +3862,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
on_github: huggingface/kernels-uvnotes
-
Torch LayerNorm Implementation
+
Torch LayerNorm Implementation
GPU Info
@@ -3887,7 +3887,7 @@ Cell: nv | 0.22s
-
Tue Oct 28 14:08:35 2025
+Wed Oct 29 14:26:26 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.22s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A 31C P0 141W / 350W | 0MiB / 46068MiB | 21% Default |
+| N/A 30C P0 108W / 350W | 0MiB / 46068MiB | 100% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
@@ -3920,10 +3920,11 @@ Cell: nv | 0.22s
▼ output
▶ uv-logs
|
-Cell: benchmark | 7.39s
+Cell: benchmark | 7.36s
| ▶ run
Copy
Raw
+GitHub
@@ -3967,19 +3968,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_layer_norm 3.94% 153.126us 46.06% 1.791ms 1.791ms 0.000us 0.00% 3.027ms 3.027ms 1
- aten::layer_norm 0.44% 17.151us 42.12% 1.638ms 545.972us 0.000us 0.00% 3.027ms 1.009ms 3
- aten::native_layer_norm 1.99% 77.265us 41.68% 1.621ms 540.255us 2.317ms 100.00% 3.027ms 1.009ms 3
+ torch_layer_norm 3.90% 151.572us 46.01% 1.786ms 1.786ms 0.000us 0.00% 3.026ms 3.026ms 1
+ aten::layer_norm 0.43% 16.762us 42.11% 1.635ms 544.851us 0.000us 0.00% 3.026ms 1.009ms 3
+ aten::native_layer_norm 2.06% 80.009us 41.67% 1.618ms 539.263us 2.316ms 100.00% 3.026ms 1.009ms 3
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.318ms 100.06% 2.318ms 2.318ms 1
-void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.317ms 100.00% 2.317ms 772.230us 3
- Activity Buffer Request 37.14% 1.444ms 37.14% 1.444ms 1.444ms 709.980us 30.65% 709.980us 709.980us 1
- aten::empty 1.21% 46.960us 1.21% 46.960us 5.218us 0.000us 0.00% 0.000us 0.000us 9
- cudaLaunchKernel 1.16% 45.271us 1.16% 45.271us 15.090us 0.000us 0.00% 0.000us 0.000us 3
- aten::view 0.18% 7.130us 0.18% 7.130us 1.188us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 53.94% 2.098ms 53.94% 2.098ms 2.098ms 0.000us 0.00% 0.000us 0.000us 1
+void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.316ms 100.00% 2.316ms 772.127us 3
+ Activity Buffer Request 37.08% 1.440ms 37.08% 1.440ms 1.440ms 709.855us 30.65% 709.855us 709.855us 1
+ aten::empty 1.19% 46.261us 1.19% 46.261us 5.140us 0.000us 0.00% 0.000us 0.000us 9
+ cudaLaunchKernel 1.16% 45.163us 1.16% 45.163us 15.054us 0.000us 0.00% 0.000us 0.000us 3
+ aten::view 0.17% 6.761us 0.17% 6.761us 1.127us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 53.99% 2.096ms 53.99% 2.096ms 2.096ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 3.889ms
-Self CUDA time total: 2.317ms
+Self CPU time total: 3.882ms
+Self CUDA time total: 2.316ms
@@ -3989,19 +3990,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_layer_norm 1.11% 71.092us 25.40% 1.622ms 1.622ms 0.000us 0.00% 6.494ms 6.494ms 1
- aten::layer_norm 0.16% 10.119us 24.29% 1.551ms 517.038us 0.000us 0.00% 6.494ms 2.165ms 3
- aten::native_layer_norm 0.82% 52.103us 24.13% 1.541ms 513.665us 4.898ms 100.00% 6.494ms 2.165ms 3
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.899ms 100.03% 4.899ms 4.899ms 1
-void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.898ms 100.00% 4.898ms 1.633ms 3
- Activity Buffer Request 22.36% 1.428ms 22.36% 1.428ms 1.428ms 1.596ms 32.59% 1.596ms 1.596ms 1
- aten::empty 0.49% 31.052us 0.49% 31.052us 3.450us 0.000us 0.00% 0.000us 0.000us 9
- cudaLaunchKernel 0.41% 26.160us 0.41% 26.160us 8.720us 0.000us 0.00% 0.000us 0.000us 3
- aten::view 0.06% 3.830us 0.06% 3.830us 0.638us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 74.60% 4.764ms 74.60% 4.764ms 4.764ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_layer_norm 1.19% 75.581us 25.55% 1.628ms 1.628ms 0.000us 0.00% 6.473ms 6.473ms 1
+ aten::layer_norm 0.14% 9.142us 24.37% 1.553ms 517.550us 0.000us 0.00% 6.473ms 2.158ms 3
+ aten::native_layer_norm 0.81% 51.921us 24.22% 1.544ms 514.502us 4.881ms 100.00% 6.473ms 2.158ms 3
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.882ms 100.03% 4.882ms 4.882ms 1
+void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.881ms 100.00% 4.881ms 1.627ms 3
+ Activity Buffer Request 22.46% 1.431ms 22.46% 1.431ms 1.431ms 1.592ms 32.61% 1.592ms 1.592ms 1
+ aten::empty 0.44% 27.841us 0.44% 27.841us 3.093us 0.000us 0.00% 0.000us 0.000us 9
+ cudaLaunchKernel 0.45% 28.910us 0.45% 28.910us 9.637us 0.000us 0.00% 0.000us 0.000us 3
+ aten::view 0.06% 3.829us 0.06% 3.829us 0.638us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 74.45% 4.743ms 74.45% 4.743ms 4.743ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 6.386ms
-Self CUDA time total: 4.898ms
+Self CPU time total: 6.372ms
+Self CUDA time total: 4.881ms
@@ -4011,19 +4012,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_layer_norm 1.17% 72.893us 26.00% 1.616ms 1.616ms 0.000us 0.00% 6.248ms 6.248ms 1
- aten::layer_norm 0.15% 9.290us 24.82% 1.543ms 514.468us 0.000us 0.00% 6.248ms 2.083ms 3
- aten::native_layer_norm 0.84% 52.403us 24.67% 1.534ms 511.371us 4.735ms 100.00% 6.248ms 2.083ms 3
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.736ms 100.03% 4.736ms 4.736ms 1
-void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.735ms 100.00% 4.735ms 1.578ms 3
- Activity Buffer Request 22.86% 1.421ms 22.86% 1.421ms 1.421ms 1.513ms 31.96% 1.513ms 1.513ms 1
- aten::empty 0.47% 29.320us 0.47% 29.320us 3.258us 0.000us 0.00% 0.000us 0.000us 9
- cudaLaunchKernel 0.43% 26.781us 0.43% 26.781us 8.927us 0.000us 0.00% 0.000us 0.000us 3
- aten::view 0.07% 4.140us 0.07% 4.140us 0.690us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 74.00% 4.601ms 74.00% 4.601ms 4.601ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_layer_norm 1.15% 71.882us 26.71% 1.668ms 1.668ms 0.000us 0.00% 6.222ms 6.222ms 1
+ aten::layer_norm 0.15% 9.629us 25.56% 1.596ms 532.153us 0.000us 0.00% 6.222ms 2.074ms 3
+ aten::native_layer_norm 0.90% 56.373us 25.41% 1.587ms 528.943us 4.717ms 100.00% 6.222ms 2.074ms 3
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.718ms 100.03% 4.718ms 4.718ms 1
+void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.717ms 100.00% 4.717ms 1.572ms 3
+ Activity Buffer Request 23.44% 1.464ms 23.44% 1.464ms 1.464ms 1.506ms 31.93% 1.506ms 1.506ms 1
+ aten::empty 0.46% 28.850us 0.46% 28.850us 3.206us 0.000us 0.00% 0.000us 0.000us 9
+ cudaLaunchKernel 0.52% 32.781us 0.52% 32.781us 10.927us 0.000us 0.00% 0.000us 0.000us 3
+ aten::view 0.07% 4.590us 0.07% 4.590us 0.765us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 73.29% 4.577ms 73.29% 4.577ms 4.577ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 6.218ms
-Self CUDA time total: 4.735ms
+Self CPU time total: 6.246ms
+Self CUDA time total: 4.717ms
@@ -4033,19 +4034,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_layer_norm 0.66% 74.633us 14.54% 1.650ms 1.650ms 0.000us 0.00% 13.090ms 13.090ms 1
- aten::layer_norm 0.09% 9.800us 13.88% 1.575ms 525.028us 0.000us 0.00% 13.090ms 4.363ms 3
- aten::native_layer_norm 0.45% 51.390us 13.79% 1.565ms 521.762us 9.838ms 100.00% 13.090ms 4.363ms 3
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.839ms 100.01% 9.839ms 9.839ms 1
-void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.838ms 100.00% 9.838ms 3.279ms 3
- Activity Buffer Request 11.36% 1.289ms 11.36% 1.289ms 1.289ms 3.253ms 33.06% 3.253ms 3.253ms 1
- aten::empty 0.28% 31.381us 0.28% 31.381us 3.487us 0.000us 0.00% 0.000us 0.000us 9
- cudaLaunchKernel 1.67% 189.088us 1.67% 189.088us 63.029us 0.000us 0.00% 0.000us 0.000us 3
- aten::view 0.04% 4.121us 0.04% 4.121us 0.687us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 85.46% 9.697ms 85.46% 9.697ms 9.697ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_layer_norm 0.67% 74.340us 13.35% 1.490ms 1.490ms 0.000us 0.00% 13.028ms 13.028ms 1
+ aten::layer_norm 0.09% 9.510us 12.69% 1.416ms 471.835us 0.000us 0.00% 13.028ms 4.343ms 3
+ aten::native_layer_norm 0.47% 52.269us 12.60% 1.406ms 468.665us 9.808ms 100.00% 13.028ms 4.343ms 3
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.809ms 100.02% 9.809ms 9.809ms 1
+void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.808ms 100.00% 9.808ms 3.269ms 3
+ Activity Buffer Request 9.72% 1.085ms 9.72% 1.085ms 1.085ms 3.220ms 32.83% 3.220ms 3.220ms 1
+ aten::empty 0.26% 29.181us 0.26% 29.181us 3.242us 0.000us 0.00% 0.000us 0.000us 9
+ cudaLaunchKernel 2.11% 235.817us 2.11% 235.817us 78.606us 0.000us 0.00% 0.000us 0.000us 3
+ aten::view 0.04% 4.022us 0.04% 4.022us 0.670us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 86.65% 9.669ms 86.65% 9.669ms 9.669ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 11.347ms
-Self CUDA time total: 9.838ms
+Self CPU time total: 11.159ms
+Self CUDA time total: 9.808ms
impl wl p50(ms) ok
@@ -4057,7 +4058,7 @@ torch_layer_norm LN_B16_S4096_D8192 3.33 True
-Installed 37 packages in 221ms
+Installed 37 packages in 222ms
diff --git a/layer_norm/results/artifacts/combine/latency.svg b/layer_norm/results/artifacts/combine/latency.svg
index 51fba97fb0809dfd942d52b9b34e8a096d515676..c17ece602ed5ebc325bf99b71237b08ca31fbe89 100644
--- a/layer_norm/results/artifacts/combine/latency.svg
+++ b/layer_norm/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:1e41c135df9f0b506fa1ac950b90bd609d850f01d79b3171b3678c24fdab066a
-size 14645
+oid sha256:8fd53794c4617f7e947676c655de6f739b720b8f16a59432369c127bfc08190a
+size 14644
diff --git a/layer_norm/results/combined_results.html b/layer_norm/results/combined_results.html
index 616fba09e8126d17fe18ed8e4396c65eb84adaef..5a42e66a6787e88853b7090c03ba6d4a8cd04457 100644
--- a/layer_norm/results/combined_results.html
+++ b/layer_norm/results/combined_results.html
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
- 2025-10-28T14:09:21.825978
+ 2025-10-29T14:27:45.722521
image/svg+xml
@@ -3956,70 +3956,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
+
-
+
- 1.0
+ 1.0
-
+
-
+
- 1.5
+ 1.5
-
+
-
+
- 2.0
+ 2.0
-
+
-
+
- 2.5
+ 2.5
-
+
-
+
- 3.0
+ 3.0
@@ -4027,27 +4027,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
+
-
-
+
+
-
+
-
-
-
-
+
+
+
+
@@ -4105,7 +4105,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ output
▶ uv-logs
|
-Cell: combine | 4.25s
+Cell: combine | 4.21s
| ▶ run
Copy
Raw
@@ -4195,7 +4195,7 @@ impl wl p50(ms) ok
hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
-hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True
+hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
torch_layer_norm LN_B16_S2048_D4096 0.82 True
torch_layer_norm LN_B16_S2048_D8192 1.68 True
torch_layer_norm LN_B16_S4096_D4096 1.61 True
@@ -4219,7 +4219,7 @@ Implementations included:
-Installed 37 packages in 219ms
+Installed 37 packages in 210ms
@@ -4232,7 +4232,7 @@ Installed 37 packages in 219ms
- 2025-10-28T14:09:21.825978
+ 2025-10-29T14:27:45.722521
image/svg+xml
@@ -4316,70 +4316,70 @@ Installed 37 packages in 219ms
-
+
-
+
- 1.0
+ 1.0
-
+
-
+
- 1.5
+ 1.5
-
+
-
+
- 2.0
+ 2.0
-
+
-
+
- 2.5
+ 2.5
-
+
-
+
- 3.0
+ 3.0
@@ -4387,27 +4387,27 @@ Installed 37 packages in 219ms
-
+
-
-
+
+
-
+
-
-
-
-
+
+
+
+
diff --git a/rotary/impls/artifacts/benchmark/rotary.jsonl b/rotary/impls/artifacts/benchmark/rotary.jsonl
index 18ed4f37499b08e63b86a43f9ee0bdc193375b0d..e407db0807eb78b1db05edcb765f594b555812aa 100644
--- a/rotary/impls/artifacts/benchmark/rotary.jsonl
+++ b/rotary/impls/artifacts/benchmark/rotary.jsonl
@@ -1,24 +1,24 @@
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1724160000549091, "p50": 0.17308600001797458, "p90": 0.1756759999125279, "mean": 0.1760500000045795, "iqr": 0.0032199998258874984, "raw_times": [0.17245600008664042, 0.1756759999125279, 0.1724160000549091, 0.17308600001797458, 0.18661599995084543], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.17975699995531613, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22642799990535423, "p50": 0.2294280000114668, "p90": 0.23093799995876907, "mean": 0.23135619996992318, "iqr": 0.0026599999500831473, "raw_times": [0.23093799995876907, 0.22642799990535423, 0.22827800000868592, 0.2417089999653399, 0.2294280000114668], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23494799995660287, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21688800006813835, "p50": 0.21992799997860857, "p90": 0.2219079999576934, "mean": 0.22172000001319248, "iqr": 0.004439999884198187, "raw_times": [0.2174680000734952, 0.2219079999576934, 0.21688800006813835, 0.23240799998802686, 0.21992799997860857], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.225418000013633, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21487700007583044, "p50": 0.21964699999443837, "p90": 0.22132800006602338, "mean": 0.21978760003094067, "iqr": 0.005100000066704524, "raw_times": [0.21487700007583044, 0.21622799999931885, 0.21964699999443837, 0.22132800006602338, 0.2268580000190923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24882799993974913, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21332699998311, "p50": 0.21615699995436444, "p90": 0.21744800005762954, "mean": 0.21590960000139603, "iqr": 0.0025000000505315256, "raw_times": [0.21332699998311, 0.21744800005762954, 0.21494800000709802, 0.21766800000477815, 0.21615699995436444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22001800005000405, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21038799991401902, "p50": 0.21561700009442575, "p90": 0.21720800009461527, "mean": 0.22098599999935686, "iqr": 0.004100000182916119, "raw_times": [0.21038799991401902, 0.21720800009461527, 0.21561700009442575, 0.24860899998202513, 0.21310799991169915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2178580000418151, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21404700009952649, "p50": 0.21557699994900759, "p90": 0.2158679999411106, "mean": 0.2152116000161186, "iqr": 0.0011999999287581886, "raw_times": [0.2158679999411106, 0.21589800007859594, 0.21404700009952649, 0.21466800001235242, 0.21557699994900759], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21567799990407366, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21701799994389148, "p50": 0.21822700000484474, "p90": 0.22002800005793688, "mean": 0.2237478000097326, "iqr": 0.002031000008173578, "raw_times": [0.22002800005793688, 0.2179970000497633, 0.2454689999922266, 0.21822700000484474, 0.21701799994389148], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22291799996310147, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21712800003115262, "p50": 0.21885700004986575, "p90": 0.2196080000658185, "mean": 0.22401780001928273, "iqr": 0.001630000042496249, "raw_times": [0.21797800002332224, 0.2196080000658185, 0.24651799992625456, 0.21885700004986575, 0.21712800003115262], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2206780000051367, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21462800009430794, "p50": 0.21782799990432977, "p90": 0.21795700001803198, "mean": 0.21911359999648994, "iqr": 0.0030300000162242213, "raw_times": [0.21462800009430794, 0.23022799996397225, 0.21782799990432977, 0.21492700000180776, 0.21795700001803198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2186980000260519, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21579799999926763, "p50": 0.21701699995446688, "p90": 0.22130799993647088, "mean": 0.2237457999626713, "iqr": 0.004450000005817856, "raw_times": [0.21701699995446688, 0.22130799993647088, 0.21579799999926763, 0.24774799999249808, 0.21685799993065302], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22235700009787251, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22434800007431477, "p50": 0.2248280000003433, "p90": 0.22490799995011912, "mean": 0.22479799997654482, "iqr": 0.00031000001854408765, "raw_times": [0.2248280000003433, 0.22490799995011912, 0.22459799993157503, 0.22434800007431477, 0.22530799992637185], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23522799995134847, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21574699997017888, "p50": 0.21802799994929956, "p90": 0.21904799996264046, "mean": 0.22033179998288688, "iqr": 0.0018999999156221747, "raw_times": [0.21714800004701829, 0.21802799994929956, 0.2316879999852972, 0.21904799996264046, 0.21574699997017888], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22168800001054478, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21753800001533818, "p50": 0.21888800006308884, "p90": 0.22129700005280029, "mean": 0.22190180004599824, "iqr": 0.003358999947522534, "raw_times": [0.21753800001533818, 0.23384799999348616, 0.21793800010527775, 0.21888800006308884, 0.22129700005280029], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22266799999215436, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2166670000178783, "p50": 0.21850699999959033, "p90": 0.21964699999443837, "mean": 0.21864339998955984, "iqr": 0.001419000000169035, "raw_times": [0.21850699999959033, 0.2166670000178783, 0.22016799994162284, 0.21822799999426934, 0.21964699999443837], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23552799996195972, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21624800001518452, "p50": 0.21773700007088337, "p90": 0.21802799994929956, "mean": 0.21774760000425886, "iqr": 0.0013409999155555852, "raw_times": [0.21668700003374397, 0.21773700007088337, 0.22003799995218287, 0.21624800001518452, 0.21802799994929956], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2226780000000872, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21748699998624943, "p50": 0.22014700005001941, "p90": 0.22206799997093185, "mean": 0.22232159999475698, "iqr": 0.0019999999949504854, "raw_times": [0.22014700005001941, 0.23183799999060284, 0.22206799997093185, 0.21748699998624943, 0.22006799997598137], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22040800001832395, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21601800006010308, "p50": 0.21957800004202, "p90": 0.22023799999715266, "mean": 0.2213318000030995, "iqr": 0.0024510000002919696, "raw_times": [0.23303799991936103, 0.21601800006010308, 0.2177869999968607, 0.21957800004202, 0.22023799999715266], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.220787999978711, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21692799998618284, "p50": 0.22003699996275827, "p90": 0.2230679999684071, "mean": 0.222287800011145, "iqr": 0.0031599998919773498, "raw_times": [0.21692799998618284, 0.21990800007642974, 0.2314980000619471, 0.2230679999684071, 0.22003699996275827], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22102700006598752, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160679999860804, "p50": 0.21972700005790102, "p90": 0.22029800004474964, "mean": 0.21970960001453932, "iqr": 0.0024610000082248007, "raw_times": [0.2160679999860804, 0.2246179999474407, 0.22029800004474964, 0.21972700005790102, 0.21783700003652484], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22191799996562622, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2172279999967941, "p50": 0.21847799996521644, "p90": 0.22105800007921061, "mean": 0.22193580000475777, "iqr": 0.0035110000453641987, "raw_times": [0.21847799996521644, 0.22105800007921061, 0.23536799994872126, 0.21754700003384642, 0.2172279999967941], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22206799997093185, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21436800000174117, "p50": 0.21785799992812827, "p90": 0.2195579999124675, "mean": 0.2202379999744153, "iqr": 0.0030299999025373836, "raw_times": [0.21436800000174117, 0.21785799992812827, 0.2195579999124675, 0.2165280000099301, 0.2328780000198094], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25353900002755836, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22968799999034673, "p50": 0.23015800002212927, "p90": 0.23064800006977748, "mean": 0.23369620002995362, "iqr": 0.0006600000688194996, "raw_times": [0.23015800002212927, 0.24799900006655662, 0.22968799999034673, 0.22998800000095798, 0.23064800006977748], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23042800000894204, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
-{"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6347319999804313, "p50": 0.6375930000785957, "p90": 0.639283000055002, "mean": 0.6376124000325945, "iqr": 0.003270999968663091, "raw_times": [0.6375930000785957, 0.636012000086339, 0.6404419999626043, 0.6347319999804313, 0.639283000055002], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.639422999938688, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17006399997399058, "p50": 0.17533400000502297, "p90": 0.1853339999797754, "mean": 0.1802961999942454, "iqr": 0.014799999974002276, "raw_times": [0.17533400000502297, 0.20021500000666492, 0.17053400000577312, 0.1853339999797754, 0.17006399997399058], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18331500001522727, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2246159999685915, "p50": 0.2266160000203854, "p90": 0.22888500001272405, "mean": 0.22735560000910482, "iqr": 0.002880000010918593, "raw_times": [0.22600500000180546, 0.22888500001272405, 0.2246159999685915, 0.2266160000203854, 0.2306560000420177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2471160000254713, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21813499995460006, "p50": 0.22189599997091136, "p90": 0.2272149999953399, "mean": 0.22315939997952228, "iqr": 0.007960000004914036, "raw_times": [0.2272149999953399, 0.22189599997091136, 0.21925499999042586, 0.21813499995460006, 0.2292959999863342], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2391049999914685, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21478600001501036, "p50": 0.21544499998071842, "p90": 0.2178249999928994, "mean": 0.2161891999890031, "iqr": 0.0027799999884337012, "raw_times": [0.21544499998071842, 0.2178249999928994, 0.21478600001501036, 0.21784499995192164, 0.2150450000044657], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22814599998355334, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:29Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160950000416051, "p50": 0.22390499998437008, "p90": 0.22473600000694205, "mean": 0.22559540000202105, "iqr": 0.00507100003233063, "raw_times": [0.22390499998437008, 0.24357600000257662, 0.22473600000694205, 0.2160950000416051, 0.21966499997461142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.229085999990275, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21366499998975996, "p50": 0.21597500000325454, "p90": 0.21670500001391702, "mean": 0.2158129999884295, "iqr": 0.0008600000569458643, "raw_times": [0.21366499998975996, 0.2168749999782449, 0.21597500000325454, 0.21670500001391702, 0.21584499995697115], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21572499997546402, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2142449999951168, "p50": 0.21574499999132968, "p90": 0.2169850000086626, "mean": 0.21585539999477987, "iqr": 0.0022990000161371427, "raw_times": [0.2142449999951168, 0.2176159999862648, 0.21468599999252547, 0.2169850000086626, 0.21574499999132968], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2192349999745602, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21675499999673775, "p50": 0.21711599998752718, "p90": 0.21833499999956985, "mean": 0.2174776000060774, "iqr": 0.0015789999565640755, "raw_times": [0.21675499999673775, 0.21711599998752718, 0.21833499999956985, 0.21675600004300577, 0.2184260000035465], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22064600000248902, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2153649999740992, "p50": 0.21702599997297511, "p90": 0.21829499996783852, "mean": 0.21729759998834197, "iqr": 0.0014989999499448459, "raw_times": [0.2153649999740992, 0.21679600001789368, 0.21900600000890336, 0.21702599997297511, 0.21829499996783852], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22154499998805477, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2143060000321384, "p50": 0.2161449999675824, "p90": 0.21640500000330576, "mean": 0.21578740000904872, "iqr": 0.0008589999538344273, "raw_times": [0.21653499999274572, 0.21640500000330576, 0.2143060000321384, 0.2161449999675824, 0.21554600004947133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23455599995259035, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21488499999122723, "p50": 0.21633500000461936, "p90": 0.21918499999173946, "mean": 0.21730919999072285, "iqr": 0.004300000000512227, "raw_times": [0.21488499999122723, 0.21918499999173946, 0.21488499999122723, 0.22125599997480094, 0.21633500000461936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2185359999771208, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2230359999657594, "p50": 0.22526600002947816, "p90": 0.22695600000588456, "mean": 0.22723160000168718, "iqr": 0.0026509999884183344, "raw_times": [0.22526600002947816, 0.2230359999657594, 0.23659499998984757, 0.22430500001746623, 0.22695600000588456], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22456599998577076, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21608499997682884, "p50": 0.2175149999743553, "p90": 0.22948600002337116, "mean": 0.2247094000040306, "iqr": 0.012610999988282856, "raw_times": [0.21608499997682884, 0.2168750000350883, 0.22948600002337116, 0.24358600001050945, 0.2175149999743553], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21851499997183055, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2162149999662688, "p50": 0.21694499997693129, "p90": 0.2171250000060354, "mean": 0.21706100000074002, "iqr": 0.0003099999617006688, "raw_times": [0.2162149999662688, 0.21694499997693129, 0.2171250000060354, 0.21820500001012988, 0.21681500004433474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21809500003655558, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2136749999976928, "p50": 0.21658500003240988, "p90": 0.21662599999672238, "mean": 0.21621120000645533, "iqr": 0.00066100000140068, "raw_times": [0.2136749999976928, 0.2159649999953217, 0.21658500003240988, 0.21662599999672238, 0.21820500001012988], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2181750000431748, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21695499998486412, "p50": 0.21774499998628016, "p90": 0.2285450000272249, "mean": 0.22256720000086716, "iqr": 0.010920000022451859, "raw_times": [0.21774499998628016, 0.21762500000477303, 0.2319660000011936, 0.21695499998486412, 0.2285450000272249], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22269599998026024, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21440599999777987, "p50": 0.21785499995985447, "p90": 0.2335159999802272, "mean": 0.2228595999895333, "iqr": 0.01891099998374557, "raw_times": [0.21440599999777987, 0.23391600001332336, 0.21785499995985447, 0.2335159999802272, 0.21460499999648164], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21932499998911226, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2133250000042608, "p50": 0.21422499997925115, "p90": 0.21653499999274572, "mean": 0.21708740000576654, "iqr": 0.0029589999712698045, "raw_times": [0.21357600002147592, 0.2133250000042608, 0.21653499999274572, 0.21422499997925115, 0.2277760000310991], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22739600001386862, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21597500000325454, "p50": 0.2176859999849512, "p90": 0.21771499996248167, "mean": 0.21758339998996234, "iqr": 0.0013999999737279722, "raw_times": [0.2176859999849512, 0.21771499996248167, 0.22022600001037063, 0.2163149999887537, 0.21597500000325454], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21613599994907418, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21577600000455277, "p50": 0.2173749999769825, "p90": 0.21900600000890336, "mean": 0.21836960000882755, "iqr": 0.0018509999790694565, "raw_times": [0.2171550000298339, 0.21577600000455277, 0.2225360000238652, 0.2173749999769825, 0.21900600000890336], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22321599999486352, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21538499998996485, "p50": 0.21647599999141676, "p90": 0.21717500004569956, "mean": 0.2167214000110107, "iqr": 0.001030000021273736, "raw_times": [0.21717500004569956, 0.2184260000035465, 0.21538499998996485, 0.21614500002442583, 0.21647599999141676], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21872600001415776, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21501500003751062, "p50": 0.2168760000245129, "p90": 0.2187050000088675, "mean": 0.21949320001795058, "iqr": 0.0030500000320898835, "raw_times": [0.21565499997677762, 0.2187050000088675, 0.21501500003751062, 0.23121500004208428, 0.2168760000245129], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22076499999457155, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22937599999295344, "p50": 0.23008499999832566, "p90": 0.23144499999716572, "mean": 0.23359140000138723, "iqr": 0.0020100000028833165, "raw_times": [0.23144499999716572, 0.23008499999832566, 0.2294349999942824, 0.22937599999295344, 0.24761600002420892], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23195599999326078, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T14:27:30Z", "run": "fb4d417a09cd449ead797cfa379bb3d8", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.637245999996594, "p50": 0.6388759999822469, "p90": 0.6389449999915087, "mean": 0.6396317999929124, "iqr": 0.0012190000120426703, "raw_times": [0.6388759999822469, 0.6453660000147465, 0.6389449999915087, 0.637245999996594, 0.637725999979466], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6364359999793123, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
diff --git a/rotary/impls/hf_kernels_rotary.html b/rotary/impls/hf_kernels_rotary.html
index 2749f9f6b5f352621fbf7d1a4c5db169ca775615..0608b9088d0d84399b39661fd8d9fc01a39dbda5 100644
--- a/rotary/impls/hf_kernels_rotary.html
+++ b/rotary/impls/hf_kernels_rotary.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ output
▶ uv-logs
|
-Cell: nv | 0.23s
+Cell: nv | 0.20s
| ▶ run
Copy
Raw
@@ -3887,7 +3887,7 @@ Cell: nv | 0.23s
-
Tue Oct 28 14:08:24 2025
+Wed Oct 29 14:26:51 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.23s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A 29C P0 90W / 350W | 0MiB / 46068MiB | 24% Default |
+| N/A 32C P0 76W / 350W | 0MiB / 46068MiB | 11% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
@@ -3920,7 +3920,7 @@ Cell: nv | 0.23s
▼ output
▶ uv-logs
|
-Cell: benchmark | 8.05s
+Cell: benchmark | 7.90s
| ▶ run
Copy
Raw
@@ -3989,23 +3989,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 452.802us 1907.02% 452.802us 452.802us 1
- hf_kernels_rotary 12.50% 264.332us 99.65% 2.107ms 2.107ms 0.000us 0.00% 24.960us 24.960us 1
- _rotary_dba7d1e::apply_rotary 2.70% 57.162us 4.91% 103.733us 17.289us 16.928us 71.29% 16.928us 2.821us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 71.29% 16.928us 2.821us 6
- aten::clone 2.21% 46.761us 79.27% 1.676ms 279.401us 0.000us 0.00% 8.032us 1.339us 6
- aten::copy_ 2.31% 48.833us 74.02% 1.565ms 260.899us 6.816us 28.71% 8.032us 1.339us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 28.71% 6.816us 1.136us 6
- Activity Buffer Request 68.03% 1.439ms 68.03% 1.439ms 1.439ms 1.216us 5.12% 1.216us 1.216us 1
- aten::empty_strided 3.04% 64.252us 3.04% 64.252us 10.709us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 3.68% 77.892us 3.68% 77.892us 12.982us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 2.33% 49.309us 2.97% 62.771us 5.231us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.64% 13.462us 0.64% 13.462us 1.122us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 2.20% 46.571us 2.20% 46.571us 7.762us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.35% 7.480us 0.35% 7.480us 7.480us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 433.056us 1833.74% 433.056us 433.056us 1
+ hf_kernels_rotary 12.39% 257.808us 99.67% 2.073ms 2.073ms 0.000us 0.00% 24.832us 24.832us 1
+ _rotary_dba7d1e::apply_rotary 2.75% 57.199us 5.11% 106.332us 17.722us 16.960us 71.82% 16.960us 2.827us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 71.82% 16.960us 2.827us 6
+ aten::clone 2.11% 43.871us 79.26% 1.649ms 274.763us 0.000us 0.00% 7.872us 1.312us 6
+ aten::copy_ 2.19% 45.572us 74.13% 1.542ms 256.978us 6.656us 28.18% 7.872us 1.312us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 28.18% 6.656us 1.109us 6
+ Activity Buffer Request 68.36% 1.422ms 68.36% 1.422ms 1.422ms 1.216us 5.15% 1.216us 1.216us 1
+ aten::empty_strided 3.02% 62.841us 3.02% 62.841us 10.473us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 3.58% 74.452us 3.58% 74.452us 12.409us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.28% 47.469us 2.90% 60.410us 5.034us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.62% 12.941us 0.62% 12.941us 1.078us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.36% 49.133us 2.36% 49.133us 8.189us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.33% 6.850us 0.33% 6.850us 6.850us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.115ms
-Self CUDA time total: 23.744us
+Self CPU time total: 2.080ms
+Self CUDA time total: 23.616us
@@ -4015,23 +4015,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 357.532us 1513.94% 357.532us 357.532us 1
- hf_kernels_rotary 9.61% 183.785us 99.72% 1.907ms 1.907ms 0.000us 0.00% 24.736us 24.736us 1
- _rotary_dba7d1e::apply_rotary 2.38% 45.511us 4.57% 87.364us 14.561us 16.832us 71.27% 16.832us 2.805us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.832us 71.27% 16.832us 2.805us 6
- aten::clone 1.27% 24.322us 83.40% 1.595ms 265.794us 0.000us 0.00% 7.904us 1.317us 6
- aten::copy_ 1.98% 37.831us 80.39% 1.537ms 256.202us 6.784us 28.73% 7.904us 1.317us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 28.73% 6.784us 1.131us 6
- Activity Buffer Request 75.51% 1.444ms 75.51% 1.444ms 1.444ms 1.120us 4.74% 1.120us 1.120us 1
- aten::empty_strided 1.74% 33.230us 1.74% 33.230us 5.538us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 2.90% 55.533us 2.90% 55.533us 9.256us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 1.68% 32.211us 2.13% 40.791us 3.399us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.45% 8.580us 0.45% 8.580us 0.715us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 2.19% 41.853us 2.19% 41.853us 6.976us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.28% 5.420us 0.28% 5.420us 5.420us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 368.319us 1559.68% 368.319us 368.319us 1
+ hf_kernels_rotary 8.92% 167.782us 99.73% 1.876ms 1.876ms 0.000us 0.00% 24.767us 24.767us 1
+ _rotary_dba7d1e::apply_rotary 2.34% 44.032us 4.50% 84.553us 14.092us 16.832us 71.28% 16.832us 2.805us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.832us 71.28% 16.832us 2.805us 6
+ aten::clone 1.16% 21.840us 83.94% 1.579ms 263.113us 0.000us 0.00% 7.935us 1.322us 6
+ aten::copy_ 2.86% 53.852us 81.07% 1.525ms 254.111us 6.783us 28.72% 7.935us 1.322us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 28.72% 6.783us 1.130us 6
+ Activity Buffer Request 75.10% 1.412ms 75.10% 1.412ms 1.412ms 1.152us 4.88% 1.152us 1.152us 1
+ aten::empty_strided 1.71% 32.171us 1.71% 32.171us 5.362us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 3.11% 58.461us 3.11% 58.461us 9.744us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.82% 34.274us 2.37% 44.512us 3.709us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.54% 10.238us 0.54% 10.238us 0.853us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.15% 40.521us 2.15% 40.521us 6.753us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.27% 5.140us 0.27% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.912ms
-Self CUDA time total: 23.616us
+Self CPU time total: 1.881ms
+Self CUDA time total: 23.615us
@@ -4041,23 +4041,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 340.570us 1359.24% 340.570us 340.570us 1
- hf_kernels_rotary 8.83% 169.069us 99.74% 1.910ms 1.910ms 0.000us 0.00% 26.368us 26.368us 1
- _rotary_dba7d1e::apply_rotary 2.33% 44.610us 4.50% 86.120us 14.353us 17.248us 68.84% 17.248us 2.875us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.248us 68.84% 17.248us 2.875us 6
- aten::clone 1.25% 23.991us 84.27% 1.614ms 269.024us 0.000us 0.00% 9.120us 1.520us 6
- aten::copy_ 1.92% 36.791us 81.38% 1.559ms 259.779us 7.808us 31.16% 9.120us 1.520us 6
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 346.939us 1384.60% 346.939us 346.939us 1
+ hf_kernels_rotary 8.57% 160.653us 99.71% 1.870ms 1.870ms 0.000us 0.00% 26.369us 26.369us 1
+ _rotary_dba7d1e::apply_rotary 2.32% 43.421us 4.67% 87.601us 14.600us 17.249us 68.84% 17.249us 2.875us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.249us 68.84% 17.249us 2.875us 6
+ aten::clone 1.23% 23.032us 84.13% 1.577ms 262.912us 0.000us 0.00% 9.120us 1.520us 6
+ aten::copy_ 1.94% 36.311us 81.17% 1.522ms 253.669us 7.808us 31.16% 9.120us 1.520us 6
Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 31.16% 7.808us 1.301us 6
- Activity Buffer Request 76.60% 1.467ms 76.60% 1.467ms 1.467ms 1.312us 5.24% 1.312us 1.312us 1
- aten::empty_strided 1.64% 31.482us 1.64% 31.482us 5.247us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 2.85% 54.600us 2.85% 54.600us 9.100us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 1.69% 32.440us 2.15% 41.092us 3.424us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.45% 8.652us 0.45% 8.652us 0.721us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 2.17% 41.510us 2.17% 41.510us 6.918us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.26% 4.990us 0.26% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1
+ Activity Buffer Request 76.42% 1.433ms 76.42% 1.433ms 1.433ms 1.312us 5.24% 1.312us 1.312us 1
+ aten::empty_strided 1.73% 32.420us 1.73% 32.420us 5.403us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 2.81% 52.730us 2.81% 52.730us 8.788us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.83% 34.233us 2.34% 43.964us 3.664us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.52% 9.731us 0.52% 9.731us 0.811us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.36% 44.180us 2.36% 44.180us 7.363us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.29% 5.410us 0.29% 5.410us 5.410us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.915ms
-Self CUDA time total: 25.056us
+Self CPU time total: 1.875ms
+Self CUDA time total: 25.057us
@@ -4067,23 +4067,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 346.075us 1340.08% 346.075us 346.075us 1
- hf_kernels_rotary 7.97% 168.270us 99.76% 2.107ms 2.107ms 0.000us 0.00% 27.137us 27.137us 1
- _rotary_dba7d1e::apply_rotary 2.16% 45.651us 4.14% 87.411us 14.569us 18.049us 69.89% 18.049us 3.008us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.049us 69.89% 18.049us 3.008us 6
- aten::clone 1.15% 24.271us 85.69% 1.810ms 301.630us 0.000us 0.00% 9.088us 1.515us 6
- aten::copy_ 1.78% 37.581us 83.02% 1.753ms 292.225us 7.776us 30.11% 9.088us 1.515us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 30.11% 7.776us 1.296us 6
- Activity Buffer Request 68.60% 1.449ms 68.60% 1.449ms 1.449ms 1.312us 5.08% 1.312us 1.312us 1
- aten::empty_strided 1.52% 32.162us 1.52% 32.162us 5.360us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 12.64% 267.018us 12.64% 267.018us 44.503us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 1.55% 32.701us 1.96% 41.360us 3.447us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.41% 8.659us 0.41% 8.659us 0.722us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 1.98% 41.760us 1.98% 41.760us 6.960us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.24% 5.141us 0.24% 5.141us 5.141us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.904us 1355.61% 347.904us 347.904us 1
+ hf_kernels_rotary 7.92% 162.592us 99.76% 2.047ms 2.047ms 0.000us 0.00% 27.009us 27.009us 1
+ _rotary_dba7d1e::apply_rotary 2.09% 42.932us 4.15% 85.134us 14.189us 17.951us 69.95% 17.951us 2.992us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.951us 69.95% 17.951us 2.992us 6
+ aten::clone 1.22% 25.009us 85.61% 1.757ms 292.750us 0.000us 0.00% 9.058us 1.510us 6
+ aten::copy_ 1.81% 37.091us 82.80% 1.699ms 283.112us 7.713us 30.05% 9.058us 1.510us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.713us 30.05% 7.713us 1.285us 6
+ Activity Buffer Request 69.84% 1.433ms 69.84% 1.433ms 1.433ms 1.345us 5.24% 1.345us 1.345us 1
+ aten::empty_strided 1.60% 32.820us 1.60% 32.820us 5.470us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 11.14% 228.627us 11.14% 228.627us 38.104us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.59% 32.701us 2.07% 42.551us 3.546us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.48% 9.850us 0.48% 9.850us 0.821us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.06% 42.202us 2.06% 42.202us 7.034us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.24% 4.861us 0.24% 4.861us 4.861us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.112ms
-Self CUDA time total: 25.825us
+Self CPU time total: 2.052ms
+Self CUDA time total: 25.664us
@@ -4093,23 +4093,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 383.355us 1524.21% 383.355us 383.355us 1
- hf_kernels_rotary 8.48% 177.428us 99.77% 2.088ms 2.088ms 0.000us 0.00% 26.495us 26.495us 1
- _rotary_dba7d1e::apply_rotary 3.05% 63.861us 5.13% 107.442us 17.907us 17.215us 68.45% 17.215us 2.869us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.215us 68.45% 17.215us 2.869us 6
- aten::clone 1.13% 23.688us 84.02% 1.758ms 293.025us 0.000us 0.00% 9.280us 1.547us 6
- aten::copy_ 1.90% 39.711us 81.30% 1.701ms 283.530us 7.936us 31.55% 9.280us 1.547us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 31.55% 7.936us 1.323us 6
- Activity Buffer Request 67.53% 1.413ms 67.53% 1.413ms 1.413ms 1.344us 5.34% 1.344us 1.344us 1
- aten::empty_strided 1.59% 33.283us 1.59% 33.283us 5.547us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 11.87% 248.348us 11.87% 248.348us 41.391us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 1.70% 35.532us 2.14% 44.714us 3.726us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.44% 9.182us 0.44% 9.182us 0.765us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 2.08% 43.581us 2.08% 43.581us 7.264us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.23% 4.831us 0.23% 4.831us 4.831us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 356.192us 1425.17% 356.192us 356.192us 1
+ hf_kernels_rotary 9.03% 181.778us 99.74% 2.009ms 2.009ms 0.000us 0.00% 26.306us 26.306us 1
+ _rotary_dba7d1e::apply_rotary 2.18% 43.970us 4.25% 85.660us 14.277us 17.088us 68.37% 17.088us 2.848us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 68.37% 17.088us 2.848us 6
+ aten::clone 1.16% 23.451us 84.31% 1.698ms 283.035us 0.000us 0.00% 9.218us 1.536us 6
+ aten::copy_ 1.79% 36.151us 81.55% 1.643ms 273.753us 7.905us 31.63% 9.218us 1.536us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.905us 31.63% 7.905us 1.318us 6
+ Activity Buffer Request 70.14% 1.413ms 70.14% 1.413ms 1.413ms 1.313us 5.25% 1.313us 1.313us 1
+ aten::empty_strided 1.60% 32.242us 1.60% 32.242us 5.374us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.61% 193.593us 9.61% 193.593us 32.266us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.67% 33.621us 2.15% 43.371us 3.614us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.48% 9.750us 0.48% 9.750us 0.812us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.07% 41.690us 2.07% 41.690us 6.948us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.26% 5.140us 0.26% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.093ms
-Self CUDA time total: 25.151us
+Self CPU time total: 2.014ms
+Self CUDA time total: 24.993us
@@ -4119,23 +4119,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 348.288us 1348.70% 348.288us 348.288us 1
- hf_kernels_rotary 8.04% 167.026us 99.77% 2.072ms 2.072ms 0.000us 0.00% 27.136us 27.136us 1
- _rotary_dba7d1e::apply_rotary 2.17% 45.031us 4.15% 86.212us 14.369us 18.016us 69.76% 18.016us 3.003us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.016us 69.76% 18.016us 3.003us 6
- aten::clone 1.23% 25.613us 85.56% 1.777ms 296.124us 0.000us 0.00% 9.120us 1.520us 6
- aten::copy_ 1.80% 37.380us 82.71% 1.718ms 286.270us 7.808us 30.24% 9.120us 1.520us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 30.24% 7.808us 1.301us 6
- Activity Buffer Request 69.08% 1.434ms 69.08% 1.434ms 1.434ms 1.312us 5.08% 1.312us 1.312us 1
- aten::empty_strided 1.61% 33.511us 1.61% 33.511us 5.585us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 11.83% 245.758us 11.83% 245.758us 40.960us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 1.59% 33.022us 2.01% 41.843us 3.487us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.42% 8.821us 0.42% 8.821us 0.735us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 1.98% 41.181us 1.98% 41.181us 6.863us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.23% 4.770us 0.23% 4.770us 4.770us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 345.469us 1341.21% 345.469us 345.469us 1
+ hf_kernels_rotary 8.14% 161.605us 99.74% 1.979ms 1.979ms 0.000us 0.00% 27.070us 27.070us 1
+ _rotary_dba7d1e::apply_rotary 2.10% 41.690us 4.19% 83.112us 13.852us 17.982us 69.81% 17.982us 2.997us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.982us 69.81% 17.982us 2.997us 6
+ aten::clone 1.15% 22.842us 85.12% 1.689ms 281.515us 0.000us 0.00% 9.088us 1.515us 6
+ aten::copy_ 1.84% 36.466us 82.36% 1.634ms 272.405us 7.776us 30.19% 9.088us 1.515us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 30.19% 7.776us 1.296us 6
+ Activity Buffer Request 71.40% 1.417ms 71.40% 1.417ms 1.417ms 1.312us 5.09% 1.312us 1.312us 1
+ aten::empty_strided 1.60% 31.821us 1.60% 31.821us 5.303us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.12% 181.057us 9.12% 181.057us 30.176us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.80% 35.740us 2.29% 45.520us 3.793us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.49% 9.780us 0.49% 9.780us 0.815us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.09% 41.422us 2.09% 41.422us 6.904us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.26% 5.151us 0.26% 5.151us 5.151us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.077ms
-Self CUDA time total: 25.824us
+Self CPU time total: 1.984ms
+Self CUDA time total: 25.758us
@@ -4145,23 +4145,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 342.589us 1056.85% 342.589us 342.589us 1
- hf_kernels_rotary 8.06% 166.005us 99.77% 2.055ms 2.055ms 0.000us 0.00% 34.208us 34.208us 1
- _rotary_dba7d1e::apply_rotary 2.10% 43.163us 4.03% 82.914us 13.819us 21.856us 67.42% 21.856us 3.643us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.856us 67.42% 21.856us 3.643us 6
- aten::clone 1.18% 24.311us 85.73% 1.766ms 294.310us 0.000us 0.00% 12.352us 2.059us 6
- aten::copy_ 1.85% 38.151us 82.92% 1.708ms 284.677us 10.560us 32.58% 12.352us 2.059us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.560us 32.58% 10.560us 1.760us 6
- Activity Buffer Request 69.37% 1.429ms 69.37% 1.429ms 1.429ms 1.792us 5.53% 1.792us 1.792us 1
- aten::empty_strided 1.63% 33.490us 1.63% 33.490us 5.582us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 11.70% 241.040us 11.70% 241.040us 40.173us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 1.54% 31.672us 1.96% 40.421us 3.368us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.42% 8.749us 0.42% 8.749us 0.729us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 1.93% 39.751us 1.93% 39.751us 6.625us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.23% 4.681us 0.23% 4.681us 4.681us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 370.847us 1148.52% 370.847us 370.847us 1
+ hf_kernels_rotary 8.48% 171.185us 99.77% 2.015ms 2.015ms 0.000us 0.00% 34.081us 34.081us 1
+ _rotary_dba7d1e::apply_rotary 2.32% 46.763us 4.49% 90.723us 15.120us 21.793us 67.49% 21.793us 3.632us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.793us 67.49% 21.793us 3.632us 6
+ aten::clone 1.25% 25.309us 84.59% 1.708ms 284.718us 0.000us 0.00% 12.288us 2.048us 6
+ aten::copy_ 1.96% 39.631us 81.62% 1.648ms 274.723us 10.496us 32.51% 12.288us 2.048us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.496us 32.51% 10.496us 1.749us 6
+ Activity Buffer Request 70.18% 1.417ms 70.18% 1.417ms 1.417ms 1.792us 5.55% 1.792us 1.792us 1
+ aten::empty_strided 1.72% 34.661us 1.72% 34.661us 5.777us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.48% 191.424us 9.48% 191.424us 31.904us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.73% 34.932us 2.22% 44.771us 3.731us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.49% 9.839us 0.49% 9.839us 0.820us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.18% 43.960us 2.18% 43.960us 7.327us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.23% 4.601us 0.23% 4.601us 4.601us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.060ms
-Self CUDA time total: 32.416us
+Self CPU time total: 2.020ms
+Self CUDA time total: 32.289us
@@ -4171,23 +4171,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.021us 674.53% 349.021us 349.021us 1
- hf_kernels_rotary 8.13% 167.188us 99.77% 2.053ms 2.053ms 0.000us 0.00% 54.656us 54.656us 1
- _rotary_dba7d1e::apply_rotary 2.05% 42.101us 4.09% 84.171us 14.029us 34.590us 66.85% 34.590us 5.765us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.590us 66.85% 34.590us 5.765us 6
- aten::clone 1.20% 24.743us 85.45% 1.758ms 292.975us 0.000us 0.00% 20.066us 3.344us 6
- aten::copy_ 1.77% 36.360us 82.61% 1.700ms 283.256us 17.153us 33.15% 20.066us 3.344us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.153us 33.15% 17.153us 2.859us 6
- Activity Buffer Request 69.27% 1.425ms 69.27% 1.425ms 1.425ms 2.913us 5.63% 2.913us 2.913us 1
- aten::empty_strided 1.63% 33.571us 1.63% 33.571us 5.595us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 11.58% 238.157us 11.58% 238.157us 39.693us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 1.68% 34.499us 2.11% 43.362us 3.614us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.43% 8.863us 0.43% 8.863us 0.739us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 2.04% 42.070us 2.04% 42.070us 7.012us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.23% 4.701us 0.23% 4.701us 4.701us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 345.533us 668.21% 345.533us 345.533us 1
+ hf_kernels_rotary 8.13% 161.677us 99.76% 1.983ms 1.983ms 0.000us 0.00% 54.558us 54.558us 1
+ _rotary_dba7d1e::apply_rotary 2.15% 42.810us 4.29% 85.240us 14.207us 34.782us 67.26% 34.782us 5.797us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.782us 67.26% 34.782us 5.797us 6
+ aten::clone 1.16% 23.089us 85.02% 1.690ms 281.665us 0.000us 0.00% 19.776us 3.296us 6
+ aten::copy_ 1.78% 35.482us 82.32% 1.636ms 272.722us 16.928us 32.74% 19.776us 3.296us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 32.74% 16.928us 2.821us 6
+ Activity Buffer Request 71.53% 1.422ms 71.53% 1.422ms 1.422ms 2.848us 5.51% 2.848us 2.848us 1
+ aten::empty_strided 1.54% 30.571us 1.54% 30.571us 5.095us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.00% 178.904us 9.00% 178.904us 29.817us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.84% 36.581us 2.32% 46.051us 3.838us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.48% 9.470us 0.48% 9.470us 0.789us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.13% 42.430us 2.13% 42.430us 7.072us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.24% 4.870us 0.24% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.057ms
-Self CUDA time total: 51.743us
+Self CPU time total: 1.988ms
+Self CUDA time total: 51.710us
@@ -4197,23 +4197,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 342.845us 1058.69% 342.845us 342.845us 1
- hf_kernels_rotary 7.95% 162.638us 99.78% 2.041ms 2.041ms 0.000us 0.00% 34.176us 34.176us 1
- _rotary_dba7d1e::apply_rotary 2.08% 42.501us 4.07% 83.221us 13.870us 21.760us 67.19% 21.760us 3.627us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.760us 67.19% 21.760us 3.627us 6
- aten::clone 1.16% 23.762us 85.72% 1.754ms 292.258us 0.000us 0.00% 12.416us 2.069us 6
- aten::copy_ 1.82% 37.190us 83.02% 1.698ms 283.036us 10.624us 32.81% 12.416us 2.069us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.624us 32.81% 10.624us 1.771us 6
- Activity Buffer Request 69.60% 1.424ms 69.60% 1.424ms 1.424ms 1.792us 5.53% 1.792us 1.792us 1
- aten::empty_strided 1.54% 31.570us 1.54% 31.570us 5.262us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 11.60% 237.247us 11.60% 237.247us 39.541us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 1.62% 33.195us 2.03% 41.584us 3.465us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.41% 8.389us 0.41% 8.389us 0.699us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 1.99% 40.720us 1.99% 40.720us 6.787us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.22% 4.600us 0.22% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 338.136us 1047.28% 338.136us 338.136us 1
+ hf_kernels_rotary 19.11% 157.801us 99.43% 820.869us 820.869us 0.000us 0.00% 34.078us 34.078us 1
+ _rotary_dba7d1e::apply_rotary 5.12% 42.269us 10.18% 84.080us 14.013us 21.792us 67.49% 21.792us 3.632us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.792us 67.49% 21.792us 3.632us 6
+ aten::clone 2.56% 21.133us 65.13% 537.684us 89.614us 0.000us 0.00% 12.286us 2.048us 6
+ aten::copy_ 4.56% 37.650us 58.77% 485.172us 80.862us 10.495us 32.51% 12.286us 2.048us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.495us 32.51% 10.495us 1.749us 6
+ Activity Buffer Request 32.51% 268.347us 32.51% 268.347us 268.347us 1.791us 5.55% 1.791us 1.791us 1
+ aten::empty_strided 3.80% 31.379us 3.80% 31.379us 5.230us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 21.70% 179.175us 21.70% 179.175us 29.862us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 3.93% 32.405us 5.00% 41.304us 3.442us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.08% 8.899us 1.08% 8.899us 0.742us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.06% 41.811us 5.06% 41.811us 6.969us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.57% 4.680us 0.57% 4.680us 4.680us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.046ms
-Self CUDA time total: 32.384us
+Self CPU time total: 825.549us
+Self CUDA time total: 32.287us
@@ -4223,23 +4223,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 345.276us 667.68% 345.276us 345.276us 1
- hf_kernels_rotary 17.87% 159.778us 99.47% 889.262us 889.262us 0.000us 0.00% 54.593us 54.593us 1
- _rotary_dba7d1e::apply_rotary 4.83% 43.201us 9.55% 85.402us 14.234us 34.656us 67.02% 34.656us 5.776us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.656us 67.02% 34.656us 5.776us 6
- aten::clone 2.69% 24.052us 67.57% 604.071us 100.678us 0.000us 0.00% 19.937us 3.323us 6
- aten::copy_ 3.98% 35.591us 61.32% 548.169us 91.362us 17.057us 32.98% 19.937us 3.323us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.057us 32.98% 17.057us 2.843us 6
- Activity Buffer Request 31.28% 279.600us 31.28% 279.600us 279.600us 2.880us 5.57% 2.880us 2.880us 1
- aten::empty_strided 3.56% 31.850us 3.56% 31.850us 5.308us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 26.06% 232.978us 26.06% 232.978us 38.830us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.51% 31.369us 4.48% 40.011us 3.334us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.97% 8.642us 0.97% 8.642us 0.720us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 4.72% 42.201us 4.72% 42.201us 7.034us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.53% 4.740us 0.53% 4.740us 4.740us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.832us 672.66% 347.832us 347.832us 1
+ hf_kernels_rotary 18.98% 156.996us 99.42% 822.501us 822.501us 0.000us 0.00% 54.558us 54.558us 1
+ _rotary_dba7d1e::apply_rotary 5.15% 42.621us 10.22% 84.512us 14.085us 34.783us 67.27% 34.783us 5.797us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.783us 67.27% 34.783us 5.797us 6
+ aten::clone 2.65% 21.930us 64.92% 537.102us 89.517us 0.000us 0.00% 19.775us 3.296us 6
+ aten::copy_ 4.53% 37.450us 58.33% 482.542us 80.424us 16.927us 32.73% 19.775us 3.296us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.927us 32.73% 16.927us 2.821us 6
+ Activity Buffer Request 32.06% 265.247us 32.06% 265.247us 265.247us 2.848us 5.51% 2.848us 2.848us 1
+ aten::empty_strided 3.94% 32.630us 3.94% 32.630us 5.438us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 21.74% 179.845us 21.74% 179.845us 29.974us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.14% 34.239us 5.31% 43.891us 3.658us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.17% 9.652us 1.17% 9.652us 0.804us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.06% 41.891us 5.06% 41.891us 6.982us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.58% 4.770us 0.58% 4.770us 4.770us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 894.002us
-Self CUDA time total: 51.713us
+Self CPU time total: 827.271us
+Self CUDA time total: 51.710us
@@ -4249,23 +4249,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 372.345us 343.04% 372.345us 372.345us 1
- hf_kernels_rotary 19.45% 178.278us 99.48% 911.643us 911.643us 0.000us 0.00% 126.592us 126.592us 1
- aten::clone 2.39% 21.900us 65.33% 598.671us 99.778us 0.000us 0.00% 69.792us 11.632us 6
- aten::copy_ 4.20% 38.503us 59.48% 545.071us 90.845us 51.744us 47.67% 69.792us 11.632us 6
- _rotary_dba7d1e::apply_rotary 5.03% 46.070us 9.81% 89.853us 14.975us 56.800us 52.33% 56.800us 9.467us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 56.800us 52.33% 56.800us 9.467us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.744us 47.67% 51.744us 8.624us 6
- Activity Buffer Request 29.76% 272.689us 29.76% 272.689us 272.689us 18.048us 16.63% 18.048us 18.048us 1
- aten::empty_strided 3.46% 31.700us 3.46% 31.700us 5.283us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 25.52% 233.879us 25.52% 233.879us 38.980us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.90% 35.730us 4.89% 44.841us 3.737us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.99% 9.111us 0.99% 9.111us 0.759us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 4.78% 43.783us 4.78% 43.783us 7.297us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.52% 4.730us 0.52% 4.730us 4.730us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 352.413us 323.34% 352.413us 352.413us 1
+ hf_kernels_rotary 18.38% 152.793us 99.44% 826.801us 826.801us 0.000us 0.00% 127.423us 127.423us 1
+ aten::clone 2.64% 21.959us 64.91% 539.754us 89.959us 0.000us 0.00% 69.984us 11.664us 6
+ aten::copy_ 4.48% 37.251us 58.50% 486.434us 81.072us 51.552us 47.30% 69.984us 11.664us 6
+ _rotary_dba7d1e::apply_rotary 5.35% 44.522us 10.55% 87.704us 14.617us 57.439us 52.70% 57.439us 9.573us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 57.439us 52.70% 57.439us 9.573us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.552us 47.30% 51.552us 8.592us 6
+ Activity Buffer Request 32.52% 270.437us 32.52% 270.437us 270.437us 18.432us 16.91% 18.432us 18.432us 1
+ aten::empty_strided 3.77% 31.361us 3.77% 31.361us 5.227us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 21.50% 178.746us 21.50% 178.746us 29.791us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.45% 36.960us 5.60% 46.550us 3.879us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.15% 9.590us 1.15% 9.590us 0.799us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.19% 43.182us 5.19% 43.182us 7.197us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.56% 4.690us 0.56% 4.690us 4.690us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 916.373us
-Self CUDA time total: 108.544us
+Self CPU time total: 831.491us
+Self CUDA time total: 108.991us
@@ -4275,23 +4275,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 373.881us 208.27% 373.881us 373.881us 1
- hf_kernels_rotary 17.56% 156.837us 99.52% 888.752us 888.752us 0.000us 0.00% 203.231us 203.231us 1
- aten::clone 2.51% 22.450us 65.45% 584.500us 97.417us 0.000us 0.00% 102.431us 17.072us 6
- aten::copy_ 4.24% 37.839us 59.27% 529.299us 88.217us 78.719us 43.85% 102.431us 17.072us 6
- _rotary_dba7d1e::apply_rotary 4.89% 43.682us 11.68% 104.316us 17.386us 100.800us 56.15% 100.800us 16.800us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 100.800us 56.15% 100.800us 16.800us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.719us 43.85% 78.719us 13.120us 6
- Activity Buffer Request 29.56% 264.020us 29.56% 264.020us 264.020us 23.712us 13.21% 23.712us 23.712us 1
- aten::empty_strided 3.67% 32.751us 3.67% 32.751us 5.458us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 25.47% 227.440us 25.47% 227.440us 37.907us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.79% 33.838us 4.83% 43.099us 3.592us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 1.04% 9.261us 1.04% 9.261us 0.772us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 6.79% 60.634us 6.79% 60.634us 10.106us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.48% 4.320us 0.48% 4.320us 4.320us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 354.429us 196.77% 354.429us 354.429us 1
+ hf_kernels_rotary 18.96% 156.272us 99.48% 819.980us 819.980us 0.000us 0.00% 203.900us 203.900us 1
+ aten::clone 2.73% 22.479us 64.84% 534.473us 89.079us 0.000us 0.00% 102.557us 17.093us 6
+ aten::copy_ 4.31% 35.551us 58.35% 480.933us 80.156us 78.782us 43.74% 102.557us 17.093us 6
+ _rotary_dba7d1e::apply_rotary 5.14% 42.393us 10.35% 85.274us 14.212us 101.343us 56.26% 101.343us 16.890us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 101.343us 56.26% 101.343us 16.890us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.782us 43.74% 78.782us 13.130us 6
+ Activity Buffer Request 32.52% 268.027us 32.52% 268.027us 268.027us 23.775us 13.20% 23.775us 23.775us 1
+ aten::empty_strided 3.77% 31.061us 3.77% 31.061us 5.177us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 21.52% 177.355us 21.52% 177.355us 29.559us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.12% 33.982us 5.33% 43.961us 3.663us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.21% 9.979us 1.21% 9.979us 0.832us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.20% 42.881us 5.20% 42.881us 7.147us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.52% 4.300us 0.52% 4.300us 4.300us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 893.072us
-Self CUDA time total: 179.519us
+Self CPU time total: 824.280us
+Self CUDA time total: 180.125us
@@ -4301,23 +4301,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 339.068us 1293.81% 339.068us 339.068us 1
- hf_kernels_rotary 18.21% 158.266us 99.46% 864.691us 864.691us 0.000us 0.00% 27.359us 27.359us 1
- _rotary_dba7d1e::apply_rotary 4.98% 43.284us 9.71% 84.425us 14.071us 19.391us 73.99% 19.391us 3.232us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.391us 73.99% 19.391us 3.232us 6
- aten::clone 2.67% 23.179us 66.79% 580.620us 96.770us 0.000us 0.00% 7.968us 1.328us 6
- aten::copy_ 4.38% 38.042us 60.58% 526.630us 87.772us 6.816us 26.01% 7.968us 1.328us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 26.01% 6.816us 1.136us 6
- Activity Buffer Request 29.98% 260.620us 29.98% 260.620us 260.620us 1.152us 4.40% 1.152us 1.152us 1
- aten::empty_strided 3.54% 30.811us 3.54% 30.811us 5.135us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 26.22% 227.968us 26.22% 227.968us 37.995us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.77% 32.731us 4.76% 41.380us 3.448us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.99% 8.649us 0.99% 8.649us 0.721us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 4.73% 41.141us 4.73% 41.141us 6.857us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.54% 4.651us 0.54% 4.651us 4.651us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 338.587us 1293.50% 338.587us 338.587us 1
+ hf_kernels_rotary 19.34% 157.366us 99.42% 808.960us 808.960us 0.000us 0.00% 27.296us 27.296us 1
+ _rotary_dba7d1e::apply_rotary 5.26% 42.761us 10.55% 85.842us 14.307us 19.392us 74.08% 19.392us 3.232us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.392us 74.08% 19.392us 3.232us 6
+ aten::clone 2.60% 21.121us 64.41% 524.052us 87.342us 0.000us 0.00% 7.904us 1.317us 6
+ aten::copy_ 4.60% 37.442us 58.06% 472.441us 78.740us 6.784us 25.92% 7.904us 1.317us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 25.92% 6.784us 1.131us 6
+ Activity Buffer Request 31.61% 257.196us 31.61% 257.196us 257.196us 1.120us 4.28% 1.120us 1.120us 1
+ aten::empty_strided 3.75% 30.490us 3.75% 30.490us 5.082us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 21.85% 177.803us 21.85% 177.803us 29.634us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 3.95% 32.140us 5.12% 41.700us 3.475us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.17% 9.560us 1.17% 9.560us 0.797us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.29% 43.081us 5.29% 43.081us 7.180us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.58% 4.711us 0.58% 4.711us 4.711us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 869.342us
-Self CUDA time total: 26.207us
+Self CPU time total: 813.671us
+Self CUDA time total: 26.176us
@@ -4327,23 +4327,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 345.689us 1259.02% 345.689us 345.689us 1
- hf_kernels_rotary 18.17% 159.455us 99.46% 872.870us 872.870us 0.000us 0.00% 28.769us 28.769us 1
- _rotary_dba7d1e::apply_rotary 4.92% 43.180us 9.80% 85.973us 14.329us 19.616us 71.44% 19.616us 3.269us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.616us 71.44% 19.616us 3.269us 6
- aten::clone 2.64% 23.140us 66.83% 586.460us 97.743us 0.000us 0.00% 9.153us 1.526us 6
- aten::copy_ 4.27% 37.430us 60.39% 529.960us 88.327us 7.841us 28.56% 9.153us 1.526us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.841us 28.56% 7.841us 1.307us 6
- Activity Buffer Request 29.89% 262.350us 29.89% 262.350us 262.350us 1.312us 4.78% 1.312us 1.312us 1
- aten::empty_strided 3.80% 33.360us 3.80% 33.360us 5.560us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 26.23% 230.180us 26.23% 230.180us 38.363us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.66% 32.161us 4.67% 40.982us 3.415us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 1.01% 8.821us 1.01% 8.821us 0.735us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 4.88% 42.793us 4.88% 42.793us 7.132us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.54% 4.730us 0.54% 4.730us 4.730us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.862us 1278.50% 349.862us 349.862us 1
+ hf_kernels_rotary 19.32% 156.134us 99.42% 803.460us 803.460us 0.000us 0.00% 28.709us 28.709us 1
+ _rotary_dba7d1e::apply_rotary 5.33% 43.099us 10.84% 87.643us 14.607us 19.428us 71.00% 19.428us 3.238us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.428us 71.00% 19.428us 3.238us 6
+ aten::clone 2.80% 22.600us 63.71% 514.893us 85.816us 0.000us 0.00% 9.281us 1.547us 6
+ aten::copy_ 4.89% 39.481us 56.99% 460.582us 76.764us 7.937us 29.00% 9.281us 1.547us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.937us 29.00% 7.937us 1.323us 6
+ Activity Buffer Request 27.85% 225.076us 27.85% 225.076us 225.076us 1.344us 4.91% 1.344us 1.344us 1
+ aten::empty_strided 3.92% 31.711us 3.92% 31.711us 5.285us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 24.26% 196.025us 24.26% 196.025us 32.671us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.38% 35.400us 5.54% 44.790us 3.732us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.16% 9.390us 1.16% 9.390us 0.782us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.51% 44.544us 5.51% 44.544us 7.424us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.58% 4.720us 0.58% 4.720us 4.720us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 877.600us
-Self CUDA time total: 27.457us
+Self CPU time total: 808.180us
+Self CUDA time total: 27.365us
@@ -4353,23 +4353,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 352.280us 1238.42% 352.280us 352.280us 1
- hf_kernels_rotary 18.63% 163.526us 99.48% 873.041us 873.041us 0.000us 0.00% 29.790us 29.790us 1
- _rotary_dba7d1e::apply_rotary 4.98% 43.742us 9.85% 86.414us 14.402us 20.606us 72.44% 20.606us 3.434us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.606us 72.44% 20.606us 3.434us 6
- aten::clone 2.59% 22.720us 66.23% 581.279us 96.880us 0.000us 0.00% 9.184us 1.531us 6
- aten::copy_ 4.14% 36.351us 59.98% 526.379us 87.730us 7.840us 27.56% 9.184us 1.531us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.840us 27.56% 7.840us 1.307us 6
- Activity Buffer Request 30.03% 263.549us 30.03% 263.549us 263.549us 1.344us 4.72% 1.344us 1.344us 1
- aten::empty_strided 3.67% 32.180us 3.67% 32.180us 5.363us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 25.81% 226.479us 25.81% 226.479us 37.747us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.76% 33.033us 4.77% 41.822us 3.485us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 1.00% 8.789us 1.00% 8.789us 0.732us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 4.86% 42.672us 4.86% 42.672us 7.112us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.52% 4.560us 0.52% 4.560us 4.560us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.981us 1235.85% 349.981us 349.981us 1
+ hf_kernels_rotary 8.03% 161.215us 99.76% 2.003ms 2.003ms 0.000us 0.00% 29.663us 29.663us 1
+ _rotary_dba7d1e::apply_rotary 2.11% 42.422us 4.23% 84.982us 14.164us 20.544us 72.54% 20.544us 3.424us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.544us 72.54% 20.544us 3.424us 6
+ aten::clone 1.12% 22.572us 85.29% 1.712ms 285.349us 0.000us 0.00% 9.119us 1.520us 6
+ aten::copy_ 1.91% 38.260us 82.54% 1.657ms 276.143us 7.775us 27.46% 9.119us 1.520us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 27.46% 7.775us 1.296us 6
+ Activity Buffer Request 71.67% 1.439ms 71.67% 1.439ms 1.439ms 1.344us 4.75% 1.344us 1.344us 1
+ aten::empty_strided 1.63% 32.660us 1.63% 32.660us 5.443us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.96% 179.936us 8.96% 179.936us 29.989us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.74% 34.910us 2.20% 44.250us 3.688us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.47% 9.340us 0.47% 9.340us 0.778us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.12% 42.560us 2.12% 42.560us 7.093us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.24% 4.741us 0.24% 4.741us 4.741us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 877.601us
-Self CUDA time total: 28.446us
+Self CPU time total: 2.007ms
+Self CUDA time total: 28.319us
@@ -4379,23 +4379,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 341.881us 953.86% 341.881us 341.881us 1
- hf_kernels_rotary 17.61% 155.956us 99.45% 880.921us 880.921us 0.000us 0.00% 37.634us 37.634us 1
- _rotary_dba7d1e::apply_rotary 4.86% 43.060us 9.73% 86.184us 14.364us 25.312us 70.62% 25.312us 4.219us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.312us 70.62% 25.312us 4.219us 6
- aten::clone 2.52% 22.319us 67.43% 597.290us 99.548us 0.000us 0.00% 12.322us 2.054us 6
- aten::copy_ 4.12% 36.502us 61.34% 543.331us 90.555us 10.530us 29.38% 12.322us 2.054us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.530us 29.38% 10.530us 1.755us 6
- Activity Buffer Request 31.67% 280.550us 31.67% 280.550us 280.550us 1.792us 5.00% 1.792us 1.792us 1
- aten::empty_strided 3.57% 31.640us 3.57% 31.640us 5.273us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 25.54% 226.279us 25.54% 226.279us 37.713us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.70% 32.812us 4.68% 41.491us 3.458us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.98% 8.679us 0.98% 8.679us 0.723us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 4.87% 43.124us 4.87% 43.124us 7.187us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.55% 4.910us 0.55% 4.910us 4.910us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 346.238us 971.27% 346.238us 346.238us 1
+ hf_kernels_rotary 8.04% 160.124us 99.76% 1.988ms 1.988ms 0.000us 0.00% 37.440us 37.440us 1
+ _rotary_dba7d1e::apply_rotary 2.20% 43.921us 4.24% 84.493us 14.082us 25.216us 70.74% 25.216us 4.203us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.216us 70.74% 25.216us 4.203us 6
+ aten::clone 1.14% 22.762us 85.30% 1.700ms 283.325us 0.000us 0.00% 12.224us 2.037us 6
+ aten::copy_ 1.84% 36.620us 82.53% 1.645ms 274.105us 10.432us 29.26% 12.224us 2.037us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 29.26% 10.432us 1.739us 6
+ Activity Buffer Request 71.70% 1.429ms 71.70% 1.429ms 1.429ms 1.792us 5.03% 1.792us 1.792us 1
+ aten::empty_strided 1.63% 32.561us 1.63% 32.561us 5.427us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.99% 179.114us 8.99% 179.114us 29.852us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.72% 34.250us 2.18% 43.390us 3.616us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.46% 9.140us 0.46% 9.140us 0.762us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.04% 40.572us 2.04% 40.572us 6.762us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.24% 4.860us 0.24% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 885.831us
-Self CUDA time total: 35.842us
+Self CPU time total: 1.993ms
+Self CUDA time total: 35.648us
@@ -4405,23 +4405,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 348.158us 1221.01% 348.158us 348.158us 1
- hf_kernels_rotary 7.73% 158.832us 99.76% 2.051ms 2.051ms 0.000us 0.00% 29.858us 29.858us 1
- _rotary_dba7d1e::apply_rotary 2.18% 44.723us 4.13% 84.825us 14.138us 20.674us 72.50% 20.674us 3.446us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.674us 72.50% 20.674us 3.446us 6
- aten::clone 1.24% 25.490us 85.81% 1.764ms 294.032us 0.000us 0.00% 9.184us 1.531us 6
- aten::copy_ 1.80% 37.082us 83.01% 1.707ms 284.462us 7.840us 27.50% 9.184us 1.531us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.840us 27.50% 7.840us 1.307us 6
- Activity Buffer Request 70.14% 1.442ms 70.14% 1.442ms 1.442ms 1.344us 4.71% 1.344us 1.344us 1
- aten::empty_strided 1.55% 31.931us 1.55% 31.931us 5.322us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 11.07% 227.598us 11.07% 227.598us 37.933us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 1.67% 34.312us 2.11% 43.312us 3.609us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.44% 9.000us 0.44% 9.000us 0.750us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 1.95% 40.102us 1.95% 40.102us 6.684us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.24% 4.880us 0.24% 4.880us 4.880us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.675us 1229.10% 347.675us 347.675us 1
+ hf_kernels_rotary 8.06% 160.274us 99.76% 1.984ms 1.984ms 0.000us 0.00% 29.631us 29.631us 1
+ _rotary_dba7d1e::apply_rotary 2.18% 43.331us 4.28% 85.164us 14.194us 20.511us 72.51% 20.511us 3.418us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.511us 72.51% 20.511us 3.418us 6
+ aten::clone 1.13% 22.531us 85.26% 1.696ms 282.610us 0.000us 0.00% 9.120us 1.520us 6
+ aten::copy_ 1.97% 39.252us 82.52% 1.641ms 273.528us 7.776us 27.49% 9.120us 1.520us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 27.49% 7.776us 1.296us 6
+ Activity Buffer Request 71.58% 1.424ms 71.58% 1.424ms 1.424ms 1.344us 4.75% 1.344us 1.344us 1
+ aten::empty_strided 1.61% 31.959us 1.61% 31.959us 5.326us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.97% 178.354us 8.97% 178.354us 29.726us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.68% 33.430us 2.16% 42.920us 3.577us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.48% 9.490us 0.48% 9.490us 0.791us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.10% 41.833us 2.10% 41.833us 6.972us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.24% 4.801us 0.24% 4.801us 4.801us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.056ms
-Self CUDA time total: 28.514us
+Self CPU time total: 1.989ms
+Self CUDA time total: 28.287us
@@ -4431,23 +4431,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 344.320us 959.86% 344.320us 344.320us 1
- hf_kernels_rotary 18.29% 156.315us 99.44% 849.960us 849.960us 0.000us 0.00% 37.664us 37.664us 1
- _rotary_dba7d1e::apply_rotary 5.15% 43.990us 10.72% 91.654us 15.276us 25.312us 70.56% 25.312us 4.219us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.312us 70.56% 25.312us 4.219us 6
- aten::clone 2.62% 22.368us 65.70% 561.560us 93.593us 0.000us 0.00% 12.352us 2.059us 6
- aten::copy_ 4.13% 35.283us 59.24% 506.308us 84.385us 10.560us 29.44% 12.352us 2.059us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.560us 29.44% 10.560us 1.760us 6
- Activity Buffer Request 29.39% 251.239us 29.39% 251.239us 251.239us 1.792us 5.00% 1.792us 1.792us 1
- aten::empty_strided 3.85% 32.884us 3.85% 32.884us 5.481us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 25.71% 219.786us 25.71% 219.786us 36.631us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.67% 31.402us 4.73% 40.431us 3.369us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 1.06% 9.029us 1.06% 9.029us 0.752us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 5.58% 47.664us 5.58% 47.664us 7.944us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.56% 4.781us 0.56% 4.781us 4.781us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 341.434us 959.52% 341.434us 341.434us 1
+ hf_kernels_rotary 20.68% 156.375us 99.37% 751.248us 751.248us 0.000us 0.00% 37.312us 37.312us 1
+ _rotary_dba7d1e::apply_rotary 5.66% 42.780us 11.14% 84.232us 14.039us 25.184us 70.77% 25.184us 4.197us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.184us 70.77% 25.184us 4.197us 6
+ aten::clone 3.01% 22.779us 61.92% 468.081us 78.014us 0.000us 0.00% 12.128us 2.021us 6
+ aten::copy_ 4.78% 36.161us 54.65% 413.150us 68.858us 10.400us 29.23% 12.128us 2.021us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 29.23% 10.400us 1.733us 6
+ Activity Buffer Request 26.22% 198.225us 26.22% 198.225us 198.225us 1.728us 4.86% 1.728us 1.728us 1
+ aten::empty_strided 4.25% 32.152us 4.25% 32.152us 5.359us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 23.65% 178.764us 23.65% 178.764us 29.794us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.40% 33.290us 5.63% 42.560us 3.547us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.23% 9.270us 1.23% 9.270us 0.773us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.48% 41.452us 5.48% 41.452us 6.909us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.63% 4.741us 0.63% 4.741us 4.741us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 854.741us
-Self CUDA time total: 35.872us
+Self CPU time total: 755.989us
+Self CUDA time total: 35.584us
@@ -4457,23 +4457,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 335.158us 593.10% 335.158us 335.158us 1
- hf_kernels_rotary 18.22% 154.324us 99.44% 842.379us 842.379us 0.000us 0.00% 59.390us 59.390us 1
- _rotary_dba7d1e::apply_rotary 4.99% 42.273us 9.84% 83.374us 13.896us 39.454us 69.82% 39.454us 6.576us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.454us 69.82% 39.454us 6.576us 6
- aten::clone 2.56% 21.663us 66.58% 564.010us 94.002us 0.000us 0.00% 19.936us 3.323us 6
- aten::copy_ 4.16% 35.260us 60.33% 511.017us 85.169us 17.056us 30.18% 19.936us 3.323us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 30.18% 17.056us 2.843us 6
- Activity Buffer Request 30.26% 256.319us 30.26% 256.319us 256.319us 2.880us 5.10% 2.880us 2.880us 1
- aten::empty_strided 3.70% 31.330us 3.70% 31.330us 5.222us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 25.90% 219.438us 25.90% 219.438us 36.573us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.75% 31.762us 4.80% 40.671us 3.389us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 1.05% 8.909us 1.05% 8.909us 0.742us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 4.85% 41.101us 4.85% 41.101us 6.850us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.56% 4.710us 0.56% 4.710us 4.710us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.886us 617.06% 349.886us 349.886us 1
+ hf_kernels_rotary 15.93% 158.238us 99.46% 988.285us 988.285us 0.000us 0.00% 59.582us 59.582us 1
+ _rotary_dba7d1e::apply_rotary 4.43% 44.009us 8.77% 87.171us 14.528us 39.742us 70.09% 39.742us 6.624us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.742us 70.09% 39.742us 6.624us 6
+ aten::clone 2.20% 21.907us 70.33% 698.845us 116.474us 0.000us 0.00% 19.840us 3.307us 6
+ aten::copy_ 3.76% 37.392us 65.02% 646.067us 107.678us 16.960us 29.91% 19.840us 3.307us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 29.91% 16.960us 2.827us 6
+ Activity Buffer Request 43.30% 430.221us 43.30% 430.221us 430.221us 2.880us 5.08% 2.880us 2.880us 1
+ aten::empty_strided 3.11% 30.871us 3.11% 30.871us 5.145us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 17.96% 178.454us 17.96% 178.454us 29.742us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 3.43% 34.051us 4.43% 44.031us 3.669us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.00% 9.980us 1.00% 9.980us 0.832us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 4.34% 43.162us 4.34% 43.162us 7.194us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.54% 5.320us 0.54% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 847.089us
-Self CUDA time total: 56.510us
+Self CPU time total: 993.605us
+Self CUDA time total: 56.702us
@@ -4483,23 +4483,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 369.080us 312.82% 369.080us 369.080us 1
- hf_kernels_rotary 20.18% 177.506us 99.45% 874.621us 874.621us 0.000us 0.00% 134.912us 134.912us 1
- aten::clone 2.49% 21.878us 64.31% 565.600us 94.267us 0.000us 0.00% 69.696us 11.616us 6
- aten::copy_ 4.23% 37.163us 58.33% 512.969us 85.495us 52.768us 44.72% 69.696us 11.616us 6
- _rotary_dba7d1e::apply_rotary 5.24% 46.042us 10.09% 88.704us 14.784us 65.216us 55.28% 65.216us 10.869us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 65.216us 55.28% 65.216us 10.869us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.768us 44.72% 52.768us 8.795us 6
- Activity Buffer Request 28.97% 254.819us 28.97% 254.819us 254.819us 16.928us 14.35% 16.928us 16.928us 1
- aten::empty_strided 3.50% 30.753us 3.50% 30.753us 5.126us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 25.13% 220.987us 25.13% 220.987us 36.831us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.86% 33.990us 4.87% 42.811us 3.568us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 1.00% 8.821us 1.00% 8.821us 0.735us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 4.85% 42.662us 4.85% 42.662us 7.110us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.55% 4.870us 0.55% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 352.574us 297.38% 352.574us 352.574us 1
+ hf_kernels_rotary 18.56% 157.003us 99.43% 841.041us 841.041us 0.000us 0.00% 135.680us 135.680us 1
+ aten::clone 2.59% 21.881us 65.75% 556.174us 92.696us 0.000us 0.00% 69.984us 11.664us 6
+ aten::copy_ 4.37% 36.992us 59.34% 501.912us 83.652us 52.864us 44.59% 69.984us 11.664us 6
+ _rotary_dba7d1e::apply_rotary 5.11% 43.221us 10.14% 85.754us 14.292us 65.696us 55.41% 65.696us 10.949us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 65.696us 55.41% 65.696us 10.949us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.864us 44.59% 52.864us 8.811us 6
+ Activity Buffer Request 33.65% 284.597us 33.65% 284.597us 284.597us 17.120us 14.44% 17.120us 17.120us 1
+ aten::empty_strided 3.83% 32.381us 3.83% 32.381us 5.397us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 21.32% 180.323us 21.32% 180.323us 30.054us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 3.89% 32.880us 4.98% 42.110us 3.509us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.09% 9.230us 1.09% 9.230us 0.769us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.03% 42.533us 5.03% 42.533us 7.089us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.57% 4.810us 0.57% 4.810us 4.810us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 879.491us
-Self CUDA time total: 117.984us
+Self CPU time total: 845.851us
+Self CUDA time total: 118.560us
@@ -4509,23 +4509,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 360.471us 637.52% 360.471us 360.471us 1
- hf_kernels_rotary 18.70% 161.865us 99.47% 860.760us 860.760us 0.000us 0.00% 59.391us 59.391us 1
- _rotary_dba7d1e::apply_rotary 5.21% 45.111us 10.32% 89.333us 14.889us 39.487us 69.84% 39.487us 6.581us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.487us 69.84% 39.487us 6.581us 6
- aten::clone 2.76% 23.842us 65.28% 564.941us 94.157us 0.000us 0.00% 19.904us 3.317us 6
- aten::copy_ 4.31% 37.312us 58.89% 509.589us 84.931us 17.056us 30.16% 19.904us 3.317us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 30.16% 17.056us 2.843us 6
- Activity Buffer Request 29.00% 250.989us 29.00% 250.989us 250.989us 2.848us 5.04% 2.848us 2.848us 1
- aten::empty_strided 3.64% 31.510us 3.64% 31.510us 5.252us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 25.57% 221.288us 25.57% 221.288us 36.881us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 4.04% 34.983us 5.16% 44.621us 3.718us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 1.11% 9.638us 1.11% 9.638us 0.803us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 5.11% 44.222us 5.11% 44.222us 7.370us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.53% 4.600us 0.53% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 341.982us 603.45% 341.982us 341.982us 1
+ hf_kernels_rotary 18.98% 155.712us 99.43% 815.710us 815.710us 0.000us 0.00% 59.487us 59.487us 1
+ _rotary_dba7d1e::apply_rotary 5.25% 43.112us 10.37% 85.045us 14.174us 39.839us 70.30% 39.839us 6.640us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.839us 70.30% 39.839us 6.640us 6
+ aten::clone 2.51% 20.600us 64.82% 531.763us 88.627us 0.000us 0.00% 19.648us 3.275us 6
+ aten::copy_ 4.52% 37.100us 58.54% 480.262us 80.044us 16.832us 29.70% 19.648us 3.275us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.832us 29.70% 16.832us 2.805us 6
+ Activity Buffer Request 32.45% 266.237us 32.45% 266.237us 266.237us 2.816us 4.97% 2.816us 2.816us 1
+ aten::empty_strided 3.77% 30.901us 3.77% 30.901us 5.150us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 21.57% 176.925us 21.57% 176.925us 29.488us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.05% 33.240us 5.26% 43.190us 3.599us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.21% 9.950us 1.21% 9.950us 0.829us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.11% 41.933us 5.11% 41.933us 6.989us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.57% 4.700us 0.57% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 865.360us
-Self CUDA time total: 56.543us
+Self CPU time total: 820.410us
+Self CUDA time total: 56.671us
@@ -4535,23 +4535,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 348.053us 293.57% 348.053us 348.053us 1
- hf_kernels_rotary 18.59% 158.086us 99.46% 845.630us 845.630us 0.000us 0.00% 135.933us 135.933us 1
- aten::clone 2.59% 22.020us 65.95% 560.690us 93.448us 0.000us 0.00% 70.752us 11.792us 6
- aten::copy_ 4.43% 37.632us 59.68% 507.389us 84.565us 53.376us 45.02% 70.752us 11.792us 6
- _rotary_dba7d1e::apply_rotary 5.16% 43.870us 10.14% 86.234us 14.372us 65.181us 54.98% 65.181us 10.864us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 65.181us 54.98% 65.181us 10.864us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.376us 45.02% 53.376us 8.896us 6
- Activity Buffer Request 29.66% 252.179us 29.66% 252.179us 252.179us 17.376us 14.66% 17.376us 17.376us 1
- aten::empty_strided 3.68% 31.281us 3.68% 31.281us 5.213us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 25.59% 217.578us 25.59% 217.578us 36.263us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.78% 32.121us 4.78% 40.620us 3.385us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 1.00% 8.499us 1.00% 8.499us 0.708us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 4.98% 42.364us 4.98% 42.364us 7.061us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.54% 4.590us 0.54% 4.590us 4.590us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 388.726us 325.86% 388.726us 388.726us 1
+ hf_kernels_rotary 19.76% 169.936us 99.45% 855.401us 855.401us 0.000us 0.00% 136.923us 136.923us 1
+ aten::clone 2.64% 22.710us 63.15% 543.123us 90.521us 0.000us 0.00% 70.877us 11.813us 6
+ aten::copy_ 4.46% 38.370us 56.50% 485.931us 80.988us 53.246us 44.64% 70.877us 11.813us 6
+ _rotary_dba7d1e::apply_rotary 5.64% 48.490us 10.91% 93.801us 15.634us 66.046us 55.36% 66.046us 11.008us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 66.046us 55.36% 66.046us 11.008us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.246us 44.64% 53.246us 8.874us 6
+ Activity Buffer Request 30.83% 265.147us 30.83% 265.147us 265.147us 17.631us 14.78% 17.631us 17.631us 1
+ aten::empty_strided 4.01% 34.482us 4.01% 34.482us 5.747us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 21.21% 182.414us 21.21% 182.414us 30.402us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.39% 37.781us 5.64% 48.541us 4.045us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.25% 10.760us 1.25% 10.760us 0.897us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.27% 45.311us 5.27% 45.311us 7.552us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.55% 4.700us 0.55% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 850.220us
-Self CUDA time total: 118.557us
+Self CPU time total: 860.101us
+Self CUDA time total: 119.292us
@@ -4561,23 +4561,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 361.432us 183.93% 361.432us 361.432us 1
- hf_kernels_rotary 18.55% 158.934us 99.44% 851.909us 851.909us 0.000us 0.00% 220.221us 220.221us 1
- _rotary_dba7d1e::apply_rotary 5.09% 43.629us 10.06% 86.174us 14.362us 115.517us 58.78% 115.517us 19.253us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 115.517us 58.78% 115.517us 19.253us 6
- aten::clone 2.64% 22.651us 66.00% 565.440us 94.240us 0.000us 0.00% 104.704us 17.451us 6
- aten::copy_ 4.43% 37.970us 59.78% 512.129us 85.355us 80.992us 41.22% 104.704us 17.451us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 80.992us 41.22% 80.992us 13.499us 6
- Activity Buffer Request 29.36% 251.489us 29.36% 251.489us 251.489us 23.712us 12.07% 23.712us 23.712us 1
- aten::empty_strided 3.58% 30.660us 3.58% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 25.99% 222.670us 25.99% 222.670us 37.112us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.80% 32.582us 4.83% 41.361us 3.447us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 1.02% 8.779us 1.02% 8.779us 0.732us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 4.97% 42.545us 4.97% 42.545us 7.091us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.56% 4.770us 0.56% 4.770us 4.770us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 357.115us 181.96% 357.115us 357.115us 1
+ hf_kernels_rotary 18.86% 155.885us 99.43% 821.750us 821.750us 0.000us 0.00% 219.904us 219.904us 1
+ _rotary_dba7d1e::apply_rotary 5.36% 44.321us 10.59% 87.561us 14.594us 115.808us 59.01% 115.808us 19.301us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 115.808us 59.01% 115.808us 19.301us 6
+ aten::clone 2.51% 20.740us 64.81% 535.643us 89.274us 0.000us 0.00% 104.096us 17.349us 6
+ aten::copy_ 4.34% 35.891us 58.73% 485.402us 80.900us 80.448us 40.99% 104.096us 17.349us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 80.448us 40.99% 80.448us 13.408us 6
+ Activity Buffer Request 32.66% 269.957us 32.66% 269.957us 269.957us 23.648us 12.05% 23.648us 23.648us 1
+ aten::empty_strided 3.57% 29.501us 3.57% 29.501us 4.917us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 21.72% 179.554us 21.72% 179.554us 29.926us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 3.97% 32.801us 5.16% 42.661us 3.555us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.19% 9.860us 1.19% 9.860us 0.822us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.23% 43.240us 5.23% 43.240us 7.207us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.57% 4.750us 0.57% 4.750us 4.750us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 856.679us
-Self CUDA time total: 196.509us
+Self CPU time total: 826.500us
+Self CUDA time total: 196.256us
@@ -4587,29 +4587,29 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_rotary 12.27% 154.345us 67.03% 843.460us 843.460us 0.000us 0.00% 849.461us 849.461us 1
- hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 791.349us 101.00% 791.349us 791.349us 1
- aten::clone 1.79% 22.531us 44.41% 558.811us 93.135us 0.000us 0.00% 577.848us 96.308us 6
- aten::copy_ 2.94% 36.962us 40.15% 505.198us 84.200us 511.865us 65.33% 577.848us 96.308us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 511.865us 65.33% 511.865us 85.311us 6
- _rotary_dba7d1e::apply_rotary 3.50% 44.071us 7.04% 88.532us 14.755us 271.613us 34.67% 271.613us 45.269us 6
-void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 271.613us 34.67% 271.613us 45.269us 6
- Activity Buffer Request 20.09% 252.769us 20.09% 252.769us 252.769us 65.983us 8.42% 65.983us 65.983us 1
- aten::empty_strided 2.47% 31.082us 2.47% 31.082us 5.180us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 17.12% 215.467us 17.12% 215.467us 35.911us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 2.61% 32.851us 3.32% 41.772us 3.481us 0.000us 0.00% 0.000us 0.000us 12
- aten::as_strided 0.71% 8.921us 0.71% 8.921us 0.743us 0.000us 0.00% 0.000us 0.000us 12
- cudaLaunchKernel 3.53% 44.461us 3.53% 44.461us 7.410us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 32.97% 414.834us 32.97% 414.834us 414.834us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_rotary 13.04% 159.984us 66.42% 814.800us 814.800us 0.000us 0.00% 847.705us 847.705us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 789.466us 101.01% 789.466us 789.466us 1
+ aten::clone 1.84% 22.521us 42.98% 527.184us 87.864us 0.000us 0.00% 577.883us 96.314us 6
+ aten::copy_ 2.96% 36.311us 38.61% 473.681us 78.947us 511.772us 65.48% 577.883us 96.314us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 511.772us 65.48% 511.772us 85.295us 6
+ _rotary_dba7d1e::apply_rotary 3.59% 44.023us 6.92% 84.943us 14.157us 269.822us 34.52% 269.822us 44.970us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 269.822us 34.52% 269.822us 44.970us 6
+ Activity Buffer Request 21.07% 258.456us 21.07% 258.456us 258.456us 66.111us 8.46% 66.111us 66.111us 1
+ aten::empty_strided 2.53% 30.982us 2.53% 30.982us 5.164us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 14.58% 178.914us 14.58% 178.914us 29.819us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.74% 33.620us 3.48% 42.689us 3.557us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.74% 9.069us 0.74% 9.069us 0.756us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 3.34% 40.920us 3.34% 40.920us 6.820us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 33.58% 411.910us 33.58% 411.910us 411.910us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.258ms
-Self CUDA time total: 783.478us
+Self CPU time total: 1.227ms
+Self CUDA time total: 781.594us
impl wl p50(ms) ok
hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False
hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 False
-hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 False
+hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False
hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False
hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.09 False
hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 False
@@ -4635,13 +4635,12 @@ hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False
-Installed 52 packages in 230ms
+Installed 52 packages in 233ms
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]
-Fetching 5 files: 20%|██ | 1/5 [00:00<00:00, 7.39it/s]
-Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 16.59it/s]
-Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 15.43it/s]
+Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 22.14it/s]
+Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 22.12it/s]
Artifacts:
rotary.jsonl
diff --git a/rotary/impls/torch_rotary.html b/rotary/impls/torch_rotary.html
index f2e07316cf3df8891afa30950cda265901d2fcae..7606a093a65d04c40d580abf67d210368fd50dcd 100644
--- a/rotary/impls/torch_rotary.html
+++ b/rotary/impls/torch_rotary.html
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ output
▶ uv-logs
|
-Cell: nv | 0.23s
+Cell: nv | 0.20s
|
▶ run
Copy
Raw
@@ -3887,7 +3887,7 @@ Cell: nv | 0.23s
-
Tue Oct 28 14:08:24 2025
+Wed Oct 29 14:26:51 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.23s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A 29C P0 90W / 350W | 0MiB / 46068MiB | 24% Default |
+| N/A 32C P0 76W / 350W | 0MiB / 46068MiB | 11% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
@@ -3920,7 +3920,7 @@ Cell: nv | 0.23s
▼ output
▶ uv-logs
|
-Cell: benchmark | 3.87s
+Cell: benchmark | 3.84s
| ▶ run
Copy
Raw
@@ -3999,27 +3999,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.099ms 1229.41% 1.099ms 1.099ms 1
- torch_eager 14.68% 402.893us 99.74% 2.737ms 2.737ms 0.000us 0.00% 90.654us 90.654us 1
- aten::mul 6.18% 169.712us 10.63% 291.789us 12.158us 46.975us 52.54% 46.975us 1.957us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.975us 52.54% 46.975us 1.957us 24
- aten::copy_ 5.12% 140.498us 62.48% 1.714ms 95.244us 29.151us 32.61% 30.399us 1.689us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.400us 25.05% 22.400us 1.867us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.280us 14.85% 13.280us 1.107us 12
- aten::clone 1.37% 37.603us 60.57% 1.662ms 277.027us 0.000us 0.00% 7.999us 1.333us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.751us 7.55% 6.751us 1.125us 6
- aten::sub 1.57% 43.112us 2.52% 69.272us 11.545us 6.688us 7.48% 6.688us 1.115us 6
- aten::add 1.32% 36.261us 2.18% 59.731us 9.955us 6.592us 7.37% 6.592us 1.099us 6
- Activity Buffer Request 52.27% 1.434ms 52.27% 1.434ms 1.434ms 1.248us 1.40% 1.248us 1.248us 1
- aten::empty_strided 2.02% 55.541us 2.02% 55.541us 9.257us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 2.66% 72.862us 2.66% 72.862us 12.144us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.02% 82.803us 3.84% 105.504us 4.396us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.83% 22.701us 0.83% 22.701us 0.946us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 8.69% 238.340us 8.69% 238.340us 4.965us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.26% 7.250us 0.26% 7.250us 7.250us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.124ms 1261.56% 1.124ms 1.124ms 1
+ torch_eager 14.73% 412.767us 99.72% 2.794ms 2.794ms 0.000us 0.00% 90.337us 90.337us 1
+ aten::mul 6.25% 175.043us 11.07% 310.105us 12.921us 46.912us 52.64% 46.912us 1.955us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.912us 52.64% 46.912us 1.955us 24
+ aten::copy_ 4.12% 115.463us 61.76% 1.730ms 96.132us 28.993us 32.53% 30.210us 1.678us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.368us 25.10% 22.368us 1.864us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.215us 14.83% 13.215us 1.101us 12
+ aten::clone 1.31% 36.692us 59.66% 1.671ms 278.565us 0.000us 0.00% 7.842us 1.307us 6
+ aten::sub 1.68% 47.063us 2.72% 76.213us 12.702us 6.655us 7.47% 6.655us 1.109us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.625us 7.43% 6.625us 1.104us 6
+ aten::add 1.39% 39.044us 2.34% 65.583us 10.930us 6.560us 7.36% 6.560us 1.093us 6
+ Activity Buffer Request 52.45% 1.470ms 52.45% 1.470ms 1.470ms 1.217us 1.37% 1.217us 1.217us 1
+ aten::empty_strided 1.99% 55.621us 1.99% 55.621us 9.270us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 2.66% 74.431us 2.66% 74.431us 12.405us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.98% 83.492us 3.80% 106.494us 4.437us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.82% 23.002us 0.82% 23.002us 0.958us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 9.34% 261.675us 9.34% 261.675us 5.452us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.28% 7.890us 0.28% 7.890us 7.890us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.744ms
-Self CUDA time total: 89.406us
+Self CPU time total: 2.802ms
+Self CUDA time total: 89.120us
@@ -4029,27 +4029,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.001ms 1104.88% 1.001ms 1.001ms 1
- torch_eager 13.31% 340.683us 99.79% 2.555ms 2.555ms 0.000us 0.00% 91.680us 91.680us 1
- aten::mul 6.04% 154.674us 10.48% 268.377us 11.182us 47.810us 52.79% 47.810us 1.992us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.810us 52.79% 47.810us 1.992us 24
- aten::copy_ 4.35% 111.424us 65.16% 1.668ms 92.682us 29.407us 32.47% 30.527us 1.696us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.559us 24.91% 22.559us 1.880us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.343us 14.73% 13.343us 1.112us 12
- aten::clone 1.08% 27.742us 62.03% 1.588ms 264.676us 0.000us 0.00% 7.968us 1.328us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.848us 7.56% 6.848us 1.141us 6
- aten::sub 1.52% 38.791us 2.50% 64.042us 10.674us 6.720us 7.42% 6.720us 1.120us 6
- aten::add 1.27% 32.413us 2.18% 55.903us 9.317us 6.623us 7.31% 6.623us 1.104us 6
- Activity Buffer Request 56.03% 1.434ms 56.03% 1.434ms 1.434ms 1.120us 1.24% 1.120us 1.120us 1
- aten::empty_strided 1.42% 36.451us 1.42% 36.451us 6.075us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 2.10% 53.872us 2.10% 53.872us 8.979us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 2.86% 73.182us 3.65% 93.342us 3.889us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.79% 20.160us 0.79% 20.160us 0.840us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 9.02% 231.028us 9.02% 231.028us 4.813us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.21% 5.420us 0.21% 5.420us 5.420us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 968.092us 1071.28% 968.092us 968.092us 1
+ torch_eager 12.50% 317.076us 99.79% 2.532ms 2.532ms 0.000us 0.00% 91.488us 91.488us 1
+ aten::mul 6.07% 153.959us 10.35% 262.528us 10.939us 47.648us 52.73% 47.648us 1.985us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.648us 52.73% 47.648us 1.985us 24
+ aten::copy_ 4.16% 105.603us 65.14% 1.653ms 91.828us 29.344us 32.47% 30.464us 1.692us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.592us 25.00% 22.592us 1.883us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.376us 14.80% 13.376us 1.115us 12
+ aten::clone 1.12% 28.391us 62.74% 1.592ms 265.351us 0.000us 0.00% 7.872us 1.312us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 7.47% 6.752us 1.125us 6
+ aten::sub 1.55% 39.261us 2.49% 63.132us 10.522us 6.688us 7.40% 6.688us 1.115us 6
+ aten::add 1.47% 37.180us 2.35% 59.741us 9.957us 6.688us 7.40% 6.688us 1.115us 6
+ Activity Buffer Request 56.17% 1.425ms 56.17% 1.425ms 1.425ms 1.120us 1.24% 1.120us 1.120us 1
+ aten::empty_strided 2.04% 51.662us 2.04% 51.662us 8.610us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 2.12% 53.792us 2.12% 53.792us 8.965us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 3.04% 77.153us 3.82% 96.932us 4.039us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.78% 19.779us 0.78% 19.779us 0.824us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.79% 223.101us 8.79% 223.101us 4.648us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.21% 5.210us 0.21% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.560ms
-Self CUDA time total: 90.560us
+Self CPU time total: 2.538ms
+Self CUDA time total: 90.368us
@@ -4059,27 +4059,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 944.341us 1003.42% 944.341us 944.341us 1
- torch_eager 12.66% 316.554us 99.80% 2.495ms 2.495ms 0.000us 0.00% 95.424us 95.424us 1
- aten::mul 6.01% 150.161us 10.40% 259.987us 10.833us 48.863us 51.92% 48.863us 2.036us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.863us 51.92% 48.863us 2.036us 24
- aten::copy_ 4.06% 101.511us 66.21% 1.655ms 91.941us 30.785us 32.71% 32.097us 1.783us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.009us 24.45% 23.009us 1.917us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.464us 15.37% 14.464us 1.205us 12
- aten::clone 1.08% 26.971us 63.11% 1.577ms 262.904us 0.000us 0.00% 9.088us 1.515us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 8.26% 7.776us 1.296us 6
- aten::add 1.43% 35.631us 2.33% 58.151us 9.692us 7.233us 7.69% 7.233us 1.205us 6
- aten::sub 1.42% 35.432us 2.34% 58.413us 9.736us 7.231us 7.68% 7.231us 1.205us 6
- Activity Buffer Request 57.41% 1.435ms 57.41% 1.435ms 1.435ms 1.312us 1.39% 1.312us 1.312us 1
- aten::empty_strided 1.23% 30.860us 1.23% 30.860us 5.143us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 2.03% 50.692us 2.03% 50.692us 8.449us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 2.76% 69.107us 3.55% 88.725us 3.697us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.78% 19.618us 0.78% 19.618us 0.817us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 8.92% 222.961us 8.92% 222.961us 4.645us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.20% 5.071us 0.20% 5.071us 5.071us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.007ms 1071.77% 1.007ms 1.007ms 1
+ torch_eager 12.81% 333.813us 99.77% 2.600ms 2.600ms 0.000us 0.00% 95.234us 95.234us 1
+ aten::mul 6.17% 160.752us 10.75% 280.063us 11.669us 48.706us 51.86% 48.706us 2.029us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.706us 51.86% 48.706us 2.029us 24
+ aten::copy_ 4.30% 112.081us 64.85% 1.690ms 93.891us 30.753us 32.74% 32.065us 1.781us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.009us 24.50% 23.009us 1.917us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.463us 15.40% 14.463us 1.205us 12
+ aten::clone 1.08% 28.070us 62.18% 1.621ms 270.093us 0.000us 0.00% 9.056us 1.509us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 8.25% 7.744us 1.291us 6
+ aten::sub 1.50% 39.201us 2.50% 65.063us 10.844us 7.263us 7.73% 7.263us 1.211us 6
+ aten::add 1.40% 36.592us 2.30% 59.882us 9.980us 7.200us 7.67% 7.200us 1.200us 6
+ Activity Buffer Request 55.61% 1.449ms 55.61% 1.449ms 1.449ms 1.312us 1.40% 1.312us 1.312us 1
+ aten::empty_strided 1.87% 48.773us 1.87% 48.773us 8.129us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 2.21% 57.593us 2.21% 57.593us 9.599us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.85% 74.230us 3.62% 94.450us 3.935us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.78% 20.220us 0.78% 20.220us 0.842us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 9.19% 239.464us 9.19% 239.464us 4.989us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.23% 5.970us 0.23% 5.970us 5.970us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.500ms
-Self CUDA time total: 94.112us
+Self CPU time total: 2.606ms
+Self CUDA time total: 93.922us
@@ -4089,27 +4089,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 949.272us 934.99% 949.272us 949.272us 1
- torch_eager 11.74% 319.184us 99.83% 2.715ms 2.715ms 0.000us 0.00% 102.839us 102.839us 1
- aten::mul 5.42% 147.290us 9.69% 263.662us 10.986us 53.022us 52.22% 53.022us 2.209us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 53.022us 52.22% 53.022us 2.209us 24
- aten::copy_ 3.75% 101.924us 68.58% 1.865ms 103.635us 32.444us 31.96% 33.755us 1.875us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.637us 24.27% 24.637us 2.053us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.062us 15.82% 16.062us 1.339us 12
- aten::clone 1.13% 30.729us 66.03% 1.796ms 299.314us 0.000us 0.00% 9.118us 1.520us 6
- aten::add 1.18% 32.140us 2.02% 54.851us 9.142us 8.032us 7.91% 8.032us 1.339us 6
- aten::sub 1.29% 35.030us 2.16% 58.621us 9.770us 8.030us 7.91% 8.030us 1.338us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.807us 7.69% 7.807us 1.301us 6
- Activity Buffer Request 53.21% 1.447ms 53.21% 1.447ms 1.447ms 1.311us 1.29% 1.311us 1.311us 1
- aten::empty_strided 1.17% 31.801us 1.17% 31.801us 5.300us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 9.34% 254.009us 9.34% 254.009us 42.335us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 2.60% 70.842us 3.35% 90.984us 3.791us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.74% 20.142us 0.74% 20.142us 0.839us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 8.27% 224.985us 8.27% 224.985us 4.687us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.17% 4.671us 0.17% 4.671us 4.671us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 976.889us 967.02% 976.889us 976.889us 1
+ torch_eager 12.01% 329.416us 99.82% 2.739ms 2.739ms 0.000us 0.00% 102.333us 102.333us 1
+ aten::mul 5.67% 155.545us 9.73% 266.927us 11.122us 52.800us 52.27% 52.800us 2.200us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.800us 52.27% 52.800us 2.200us 24
+ aten::copy_ 3.82% 104.765us 68.18% 1.871ms 103.922us 32.349us 32.02% 33.661us 1.870us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.574us 24.33% 24.574us 2.048us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.872us 15.71% 15.872us 1.323us 12
+ aten::clone 1.07% 29.290us 65.23% 1.790ms 298.277us 0.000us 0.00% 9.087us 1.515us 6
+ aten::sub 1.39% 38.150us 2.28% 62.431us 10.405us 7.936us 7.86% 7.936us 1.323us 6
+ aten::add 1.24% 34.113us 2.07% 56.743us 9.457us 7.936us 7.86% 7.936us 1.323us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 7.70% 7.775us 1.296us 6
+ Activity Buffer Request 52.33% 1.436ms 52.33% 1.436ms 1.436ms 1.312us 1.30% 1.312us 1.312us 1
+ aten::empty_strided 1.16% 31.821us 1.16% 31.821us 5.304us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.42% 258.335us 9.42% 258.335us 43.056us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.63% 72.071us 3.33% 91.411us 3.809us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.70% 19.340us 0.70% 19.340us 0.806us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.39% 230.176us 8.39% 230.176us 4.795us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.18% 5.010us 0.18% 5.010us 5.010us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.720ms
-Self CUDA time total: 101.528us
+Self CPU time total: 2.744ms
+Self CUDA time total: 101.021us
@@ -4119,27 +4119,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 944.887us 1005.38% 944.887us 944.887us 1
- torch_eager 11.86% 320.838us 99.82% 2.700ms 2.700ms 0.000us 0.00% 95.295us 95.295us 1
- aten::mul 5.37% 145.335us 9.42% 254.837us 10.618us 49.024us 52.16% 49.024us 2.043us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.024us 52.16% 49.024us 2.043us 24
- aten::copy_ 3.87% 104.672us 68.80% 1.861ms 103.396us 30.783us 32.75% 32.095us 1.783us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 24.38% 22.912us 1.909us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.176us 15.08% 14.176us 1.181us 12
- aten::clone 1.07% 28.861us 66.14% 1.789ms 298.231us 0.000us 0.00% 9.183us 1.530us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.871us 8.37% 7.871us 1.312us 6
- aten::sub 1.26% 33.972us 2.12% 57.464us 9.577us 7.103us 7.56% 7.103us 1.184us 6
- aten::add 1.16% 31.253us 1.99% 53.964us 8.994us 7.073us 7.53% 7.073us 1.179us 6
- Activity Buffer Request 53.80% 1.456ms 53.80% 1.456ms 1.456ms 1.312us 1.40% 1.312us 1.312us 1
- aten::empty_strided 1.17% 31.633us 1.17% 31.633us 5.272us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 8.82% 238.648us 8.82% 238.648us 39.775us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 2.67% 72.119us 3.38% 91.532us 3.814us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.72% 19.413us 0.72% 19.413us 0.809us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 8.06% 217.970us 8.06% 217.970us 4.541us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.18% 4.990us 0.18% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 972.954us 1035.95% 972.954us 972.954us 1
+ torch_eager 11.82% 323.628us 99.83% 2.734ms 2.734ms 0.000us 0.00% 95.231us 95.231us 1
+ aten::mul 5.48% 150.092us 9.71% 265.906us 11.079us 48.958us 52.13% 48.958us 2.040us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.958us 52.13% 48.958us 2.040us 24
+ aten::copy_ 4.01% 109.805us 68.55% 1.878ms 104.307us 30.784us 32.78% 32.096us 1.783us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 24.40% 22.912us 1.909us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.177us 15.09% 14.177us 1.181us 12
+ aten::clone 0.98% 26.740us 65.50% 1.794ms 299.012us 0.000us 0.00% 9.184us 1.531us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 8.38% 7.872us 1.312us 6
+ aten::sub 1.35% 37.100us 2.22% 60.781us 10.130us 7.106us 7.57% 7.106us 1.184us 6
+ aten::add 1.26% 34.471us 2.07% 56.641us 9.440us 7.071us 7.53% 7.071us 1.178us 6
+ Activity Buffer Request 53.28% 1.459ms 53.28% 1.459ms 1.459ms 1.312us 1.40% 1.312us 1.312us 1
+ aten::empty_strided 1.12% 30.591us 1.12% 30.591us 5.098us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.84% 242.034us 8.84% 242.034us 40.339us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.64% 72.284us 3.37% 92.363us 3.848us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.73% 20.079us 0.73% 20.079us 0.837us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.33% 228.067us 8.33% 228.067us 4.751us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.17% 4.701us 0.17% 4.701us 4.701us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.705ms
-Self CUDA time total: 93.983us
+Self CPU time total: 2.739ms
+Self CUDA time total: 93.919us
@@ -4149,27 +4149,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 912.250us 902.45% 912.250us 912.250us 1
- torch_eager 10.84% 287.380us 99.80% 2.646ms 2.646ms 0.000us 0.00% 102.398us 102.398us 1
- aten::mul 5.43% 143.901us 9.61% 254.716us 10.613us 52.767us 52.20% 52.767us 2.199us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.767us 52.20% 52.767us 2.199us 24
- aten::copy_ 3.82% 101.373us 69.76% 1.849ms 102.733us 32.416us 32.07% 33.728us 1.874us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.608us 24.34% 24.608us 2.051us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.903us 15.73% 15.903us 1.325us 12
- aten::clone 0.89% 23.520us 66.94% 1.774ms 295.745us 0.000us 0.00% 9.120us 1.520us 6
- aten::add 1.25% 33.223us 2.12% 56.323us 9.387us 7.968us 7.88% 7.968us 1.328us 6
- aten::sub 1.34% 35.391us 2.21% 58.453us 9.742us 7.935us 7.85% 7.935us 1.322us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 7.72% 7.808us 1.301us 6
- Activity Buffer Request 54.59% 1.447ms 54.59% 1.447ms 1.447ms 1.312us 1.30% 1.312us 1.312us 1
- aten::empty_strided 1.14% 30.292us 1.14% 30.292us 5.049us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 9.04% 239.538us 9.04% 239.538us 39.923us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 2.52% 66.730us 3.23% 85.664us 3.569us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.71% 18.934us 0.71% 18.934us 0.789us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 8.23% 218.091us 8.23% 218.091us 4.544us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.20% 5.360us 0.20% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 940.506us 929.78% 940.506us 940.506us 1
+ torch_eager 10.47% 280.203us 99.80% 2.672ms 2.672ms 0.000us 0.00% 102.466us 102.466us 1
+ aten::mul 5.68% 151.942us 9.93% 265.874us 11.078us 52.767us 52.17% 52.767us 2.199us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.767us 52.17% 52.767us 2.199us 24
+ aten::copy_ 3.99% 106.699us 69.68% 1.866ms 103.641us 32.384us 32.01% 33.696us 1.872us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.672us 24.39% 24.672us 2.056us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.003us 15.82% 16.003us 1.334us 12
+ aten::clone 0.80% 21.540us 66.42% 1.778ms 296.379us 0.000us 0.00% 9.024us 1.504us 6
+ aten::sub 1.42% 38.052us 2.40% 64.133us 10.689us 8.002us 7.91% 8.002us 1.334us 6
+ aten::add 1.23% 32.860us 2.10% 56.182us 9.364us 8.001us 7.91% 8.001us 1.333us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 7.62% 7.712us 1.285us 6
+ Activity Buffer Request 54.45% 1.458ms 54.45% 1.458ms 1.458ms 1.312us 1.30% 1.312us 1.312us 1
+ aten::empty_strided 1.14% 30.450us 1.14% 30.450us 5.075us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.74% 234.006us 8.74% 234.006us 39.001us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.58% 69.109us 3.28% 87.850us 3.660us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.70% 18.741us 0.70% 18.741us 0.781us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.61% 230.527us 8.61% 230.527us 4.803us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.20% 5.400us 0.20% 5.400us 5.400us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.651ms
-Self CUDA time total: 101.086us
+Self CPU time total: 2.677ms
+Self CUDA time total: 101.154us
@@ -4179,27 +4179,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 920.762us 761.21% 920.762us 920.762us 1
- torch_eager 10.74% 283.666us 99.80% 2.636ms 2.636ms 0.000us 0.00% 122.785us 122.785us 1
- aten::mul 5.61% 148.102us 9.80% 258.888us 10.787us 62.177us 51.40% 62.177us 2.591us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.177us 51.40% 62.177us 2.591us 24
- aten::copy_ 4.01% 105.842us 69.73% 1.842ms 102.324us 39.520us 32.67% 41.344us 2.297us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.896us 23.89% 28.896us 2.408us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.264us 15.93% 19.264us 1.605us 12
- aten::clone 0.81% 21.319us 66.69% 1.761ms 293.582us 0.000us 0.00% 12.448us 2.075us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.624us 8.78% 10.624us 1.771us 6
- aten::add 1.23% 32.431us 2.08% 54.912us 9.152us 9.696us 8.02% 9.696us 1.616us 6
- aten::sub 1.34% 35.510us 2.24% 59.050us 9.842us 9.568us 7.91% 9.568us 1.595us 6
- Activity Buffer Request 54.62% 1.443ms 54.62% 1.443ms 1.443ms 1.824us 1.51% 1.824us 1.824us 1
- aten::empty_strided 1.13% 29.871us 1.13% 29.871us 4.979us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 8.76% 231.329us 8.76% 231.329us 38.555us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 2.53% 66.872us 3.28% 86.661us 3.611us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.75% 19.789us 0.75% 19.789us 0.825us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 8.28% 218.631us 8.28% 218.631us 4.555us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.20% 5.190us 0.20% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.015ms 844.44% 1.015ms 1.015ms 1
+ torch_eager 10.99% 299.529us 99.80% 2.720ms 2.720ms 0.000us 0.00% 122.045us 122.045us 1
+ aten::mul 5.97% 162.734us 10.28% 280.227us 11.676us 61.856us 51.45% 61.856us 2.577us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.856us 51.45% 61.856us 2.577us 24
+ aten::copy_ 4.97% 135.364us 68.63% 1.870ms 103.912us 39.199us 32.61% 41.023us 2.279us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.704us 23.88% 28.704us 2.392us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.166us 15.94% 19.166us 1.597us 12
+ aten::clone 0.84% 22.992us 64.39% 1.755ms 292.512us 0.000us 0.00% 12.319us 2.053us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.495us 8.73% 10.495us 1.749us 6
+ aten::add 1.19% 32.530us 2.08% 56.691us 9.448us 9.598us 7.98% 9.598us 1.600us 6
+ aten::sub 1.40% 38.111us 2.30% 62.811us 10.468us 9.568us 7.96% 9.568us 1.595us 6
+ Activity Buffer Request 52.53% 1.432ms 52.53% 1.432ms 1.432ms 1.824us 1.52% 1.824us 1.824us 1
+ aten::empty_strided 1.18% 32.290us 1.18% 32.290us 5.382us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.53% 232.585us 8.53% 232.585us 38.764us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.71% 73.938us 3.49% 95.000us 3.958us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.77% 21.062us 0.77% 21.062us 0.878us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.70% 237.086us 8.70% 237.086us 4.939us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.20% 5.570us 0.20% 5.570us 5.570us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.641ms
-Self CUDA time total: 120.961us
+Self CPU time total: 2.726ms
+Self CUDA time total: 120.221us
@@ -4209,27 +4209,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 939.640us 544.89% 939.640us 939.640us 1
- torch_eager 12.08% 323.576us 99.81% 2.674ms 2.674ms 0.000us 0.00% 175.325us 175.325us 1
- aten::mul 5.49% 147.107us 9.55% 255.901us 10.663us 89.504us 51.90% 89.504us 3.729us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.504us 51.90% 89.504us 3.729us 24
- aten::copy_ 3.83% 102.724us 68.48% 1.835ms 101.930us 57.918us 33.59% 60.798us 3.378us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.734us 23.62% 40.734us 3.395us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.023us 14.51% 25.023us 2.085us 12
- aten::clone 1.06% 28.292us 65.67% 1.760ms 293.252us 0.000us 0.00% 20.064us 3.344us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.184us 9.96% 17.184us 2.864us 6
- aten::add 1.22% 32.572us 2.05% 54.872us 9.145us 12.512us 7.26% 12.512us 2.085us 6
- aten::sub 1.28% 34.403us 2.15% 57.513us 9.586us 12.511us 7.26% 12.511us 2.085us 6
- Activity Buffer Request 53.69% 1.438ms 53.69% 1.438ms 1.438ms 2.880us 1.67% 2.880us 2.880us 1
- aten::empty_strided 1.12% 30.100us 1.12% 30.100us 5.017us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 8.57% 229.599us 8.57% 229.599us 38.267us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 2.59% 69.394us 3.32% 89.005us 3.709us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.73% 19.611us 0.73% 19.611us 0.817us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 8.14% 218.155us 8.14% 218.155us 4.545us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.19% 5.191us 0.19% 5.191us 5.191us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 951.101us 552.87% 951.101us 951.101us 1
+ torch_eager 11.67% 313.772us 99.81% 2.683ms 2.683ms 0.000us 0.00% 174.878us 174.878us 1
+ aten::mul 5.73% 154.081us 9.89% 265.836us 11.076us 89.599us 52.08% 89.599us 3.733us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.599us 52.08% 89.599us 3.733us 24
+ aten::copy_ 3.89% 104.453us 68.40% 1.838ms 102.128us 57.664us 33.52% 60.512us 3.362us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.832us 23.74% 40.832us 3.403us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.767us 14.40% 24.767us 2.064us 12
+ aten::clone 1.01% 27.120us 65.39% 1.758ms 292.937us 0.000us 0.00% 19.680us 3.280us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.832us 9.78% 16.832us 2.805us 6
+ aten::add 1.27% 34.231us 2.14% 57.531us 9.588us 12.416us 7.22% 12.416us 2.069us 6
+ aten::sub 1.34% 36.001us 2.22% 59.581us 9.930us 12.351us 7.18% 12.351us 2.059us 6
+ Activity Buffer Request 53.45% 1.437ms 53.45% 1.437ms 1.437ms 2.848us 1.66% 2.848us 2.848us 1
+ aten::empty_strided 1.13% 30.290us 1.13% 30.290us 5.048us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.55% 229.865us 8.55% 229.865us 38.311us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.63% 70.721us 3.36% 90.322us 3.763us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.73% 19.601us 0.73% 19.601us 0.817us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.41% 225.976us 8.41% 225.976us 4.708us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.19% 5.001us 0.19% 5.001us 5.001us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.679ms
-Self CUDA time total: 172.445us
+Self CPU time total: 2.688ms
+Self CUDA time total: 172.030us
@@ -4239,27 +4239,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 910.515us 751.54% 910.515us 910.515us 1
- torch_eager 19.90% 282.972us 99.65% 1.417ms 1.417ms 0.000us 0.00% 123.009us 123.009us 1
- aten::mul 10.25% 145.781us 17.92% 254.851us 10.619us 62.146us 51.30% 62.146us 2.589us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.146us 51.30% 62.146us 2.589us 24
- aten::copy_ 7.07% 100.509us 44.20% 628.439us 34.913us 39.743us 32.80% 41.599us 2.311us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 29.055us 23.98% 29.055us 2.421us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.264us 15.90% 19.264us 1.605us 12
- aten::clone 1.59% 22.604us 38.82% 551.881us 91.980us 0.000us 0.00% 12.544us 2.091us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.688us 8.82% 10.688us 1.781us 6
- aten::add 2.23% 31.661us 3.79% 53.922us 8.987us 9.633us 7.95% 9.633us 1.606us 6
- aten::sub 2.49% 35.352us 4.13% 58.732us 9.789us 9.631us 7.95% 9.631us 1.605us 6
- Activity Buffer Request 16.91% 240.489us 16.91% 240.489us 240.489us 1.856us 1.53% 1.856us 1.856us 1
- aten::empty_strided 2.06% 29.230us 2.06% 29.230us 4.872us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 15.93% 226.498us 15.93% 226.498us 37.750us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 4.75% 67.473us 6.05% 86.070us 3.586us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 1.31% 18.597us 1.31% 18.597us 0.775us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 15.17% 215.654us 15.17% 215.654us 4.493us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.35% 4.980us 0.35% 4.980us 4.980us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 927.996us 768.63% 927.996us 927.996us 1
+ torch_eager 20.13% 284.369us 99.65% 1.408ms 1.408ms 0.000us 0.00% 122.557us 122.557us 1
+ aten::mul 10.77% 152.163us 18.72% 264.405us 11.017us 62.048us 51.39% 62.048us 2.585us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.048us 51.39% 62.048us 2.585us 24
+ aten::copy_ 7.56% 106.823us 43.43% 613.475us 34.082us 39.390us 32.63% 41.213us 2.290us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.864us 23.91% 28.864us 2.405us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.296us 15.98% 19.296us 1.608us 12
+ aten::clone 1.39% 19.620us 37.04% 523.281us 87.213us 0.000us 0.00% 12.349us 2.058us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.526us 8.72% 10.526us 1.754us 6
+ aten::add 2.28% 32.232us 3.86% 54.523us 9.087us 9.696us 8.03% 9.696us 1.616us 6
+ aten::sub 2.48% 35.082us 4.10% 57.982us 9.664us 9.600us 7.95% 9.600us 1.600us 6
+ Activity Buffer Request 14.96% 211.375us 14.96% 211.375us 211.375us 1.823us 1.51% 1.823us 1.823us 1
+ aten::empty_strided 2.07% 29.290us 2.07% 29.290us 4.882us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 16.27% 229.815us 16.27% 229.815us 38.302us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.68% 66.168us 5.95% 84.051us 3.502us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.27% 17.883us 1.27% 17.883us 0.745us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.78% 222.895us 15.78% 222.895us 4.644us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.35% 4.970us 0.35% 4.970us 4.970us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.422ms
-Self CUDA time total: 121.153us
+Self CPU time total: 1.413ms
+Self CUDA time total: 120.734us
@@ -4269,27 +4269,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 918.443us 533.10% 918.443us 918.443us 1
- torch_eager 20.03% 279.953us 99.65% 1.393ms 1.393ms 0.000us 0.00% 175.133us 175.133us 1
- aten::mul 10.59% 147.997us 18.47% 258.229us 10.760us 89.472us 51.93% 89.472us 3.728us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.472us 51.93% 89.472us 3.728us 24
- aten::copy_ 7.43% 103.844us 43.15% 603.182us 33.510us 57.887us 33.60% 60.735us 3.374us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.831us 23.70% 40.831us 3.403us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.926us 14.47% 24.926us 2.077us 12
- aten::clone 1.45% 20.289us 37.34% 521.998us 87.000us 0.000us 0.00% 19.904us 3.317us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 9.90% 17.056us 2.843us 6
- aten::add 2.21% 30.953us 3.79% 53.002us 8.834us 12.480us 7.24% 12.480us 2.080us 6
- aten::sub 2.40% 33.491us 4.09% 57.142us 9.524us 12.446us 7.22% 12.446us 2.074us 6
- Activity Buffer Request 14.98% 209.468us 14.98% 209.468us 209.468us 2.848us 1.65% 2.848us 2.848us 1
- aten::empty_strided 2.03% 28.380us 2.03% 28.380us 4.730us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 16.36% 228.728us 16.36% 228.728us 38.121us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 5.25% 73.370us 6.64% 92.881us 3.870us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 1.40% 19.511us 1.40% 19.511us 0.813us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 15.53% 217.074us 15.53% 217.074us 4.522us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.35% 4.950us 0.35% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 941.367us 547.21% 941.367us 941.367us 1
+ torch_eager 19.36% 280.543us 99.66% 1.444ms 1.444ms 0.000us 0.00% 174.877us 174.877us 1
+ aten::mul 10.67% 154.592us 18.48% 267.677us 11.153us 89.535us 52.05% 89.535us 3.731us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.535us 52.05% 89.535us 3.731us 24
+ aten::copy_ 7.38% 106.934us 44.27% 641.329us 35.629us 57.694us 33.54% 60.542us 3.363us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.701us 23.66% 40.701us 3.392us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.800us 14.42% 24.800us 2.067us 12
+ aten::clone 1.44% 20.830us 37.97% 550.103us 91.684us 0.000us 0.00% 19.841us 3.307us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.993us 9.88% 16.993us 2.832us 6
+ aten::add 2.36% 34.121us 3.90% 56.522us 9.420us 12.448us 7.24% 12.448us 2.075us 6
+ aten::sub 2.56% 37.161us 4.27% 61.881us 10.313us 12.352us 7.18% 12.352us 2.059us 6
+ Activity Buffer Request 16.20% 234.686us 16.20% 234.686us 234.686us 2.848us 1.66% 2.848us 2.848us 1
+ aten::empty_strided 2.02% 29.270us 2.02% 29.270us 4.878us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.95% 231.027us 15.95% 231.027us 38.505us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.63% 67.091us 5.92% 85.764us 3.573us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.29% 18.673us 1.29% 18.673us 0.778us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.80% 228.888us 15.80% 228.888us 4.768us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.34% 4.980us 0.34% 4.980us 4.980us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.398ms
-Self CUDA time total: 172.285us
+Self CPU time total: 1.449ms
+Self CUDA time total: 172.029us
@@ -4299,27 +4299,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 945.822us 332.63% 945.822us 945.822us 1
- torch_eager 11.69% 314.391us 99.81% 2.685ms 2.685ms 0.000us 0.00% 302.941us 302.941us 1
- aten::mul 5.41% 145.454us 9.45% 254.127us 10.589us 133.310us 46.88% 133.310us 5.555us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.310us 46.88% 133.310us 5.555us 24
- aten::copy_ 4.13% 111.027us 68.93% 1.854ms 103.002us 109.662us 38.57% 128.254us 7.125us 18
- aten::clone 1.07% 28.661us 65.93% 1.773ms 295.570us 0.000us 0.00% 70.912us 11.819us 6
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.342us 20.17% 57.342us 4.779us 12
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.320us 18.40% 52.320us 8.720us 6
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.377us 14.55% 41.377us 3.448us 12
- aten::sub 1.27% 34.091us 2.15% 57.911us 9.652us 20.704us 7.28% 20.704us 3.451us 6
- aten::add 1.22% 32.950us 2.07% 55.610us 9.268us 20.673us 7.27% 20.673us 3.446us 6
- Activity Buffer Request 54.12% 1.456ms 54.12% 1.456ms 1.456ms 18.592us 6.54% 18.592us 18.592us 1
- aten::empty_strided 1.18% 31.741us 1.18% 31.741us 5.290us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 8.32% 223.797us 8.32% 223.797us 37.300us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 2.55% 68.485us 3.28% 88.267us 3.678us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.74% 19.782us 0.74% 19.782us 0.824us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 8.13% 218.664us 8.13% 218.664us 4.555us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.19% 5.100us 0.19% 5.100us 5.100us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 950.141us 334.64% 950.141us 950.141us 1
+ torch_eager 11.47% 310.562us 99.82% 2.702ms 2.702ms 0.000us 0.00% 302.012us 302.012us 1
+ aten::mul 5.57% 150.802us 9.64% 260.955us 10.873us 133.822us 47.13% 133.822us 5.576us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.822us 47.13% 133.822us 5.576us 24
+ aten::copy_ 3.88% 105.155us 69.00% 1.868ms 103.782us 109.151us 38.44% 127.231us 7.068us 18
+ aten::clone 0.99% 26.749us 66.03% 1.788ms 297.926us 0.000us 0.00% 69.886us 11.648us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.345us 20.20% 57.345us 4.779us 12
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.806us 18.25% 51.806us 8.634us 6
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.959us 14.43% 40.959us 3.413us 12
+ aten::sub 1.29% 34.831us 2.15% 58.172us 9.695us 20.607us 7.26% 20.607us 3.435us 6
+ aten::add 1.26% 34.242us 2.11% 57.104us 9.517us 20.352us 7.17% 20.352us 3.392us 6
+ Activity Buffer Request 54.34% 1.471ms 54.34% 1.471ms 1.471ms 18.080us 6.37% 18.080us 18.080us 1
+ aten::empty_strided 1.13% 30.492us 1.13% 30.492us 5.082us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.33% 225.535us 8.33% 225.535us 37.589us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.63% 71.143us 3.33% 90.164us 3.757us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.70% 19.021us 0.70% 19.021us 0.793us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.22% 222.598us 8.22% 222.598us 4.637us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.18% 4.920us 0.18% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.690ms
-Self CUDA time total: 284.349us
+Self CPU time total: 2.707ms
+Self CUDA time total: 283.932us
@@ -4329,27 +4329,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 938.033us 165.64% 938.033us 938.033us 1
- torch_eager 20.89% 291.484us 99.63% 1.390ms 1.390ms 0.000us 0.00% 590.004us 590.004us 1
- aten::copy_ 7.34% 102.395us 41.53% 579.320us 32.184us 273.370us 48.27% 297.081us 16.504us 18
- aten::mul 10.73% 149.623us 18.75% 261.638us 10.902us 225.916us 39.89% 225.916us 9.413us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 225.916us 39.89% 225.916us 9.413us 24
- aten::clone 1.46% 20.369us 35.71% 498.147us 83.025us 0.000us 0.00% 206.459us 34.410us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 182.748us 32.27% 182.748us 30.458us 6
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.622us 16.00% 90.622us 7.552us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 67.007us 11.83% 67.007us 5.584us 12
- aten::sub 2.52% 35.222us 4.78% 66.682us 11.114us 34.272us 6.05% 34.272us 5.712us 6
- aten::add 2.30% 32.121us 4.02% 56.063us 9.344us 32.735us 5.78% 32.735us 5.456us 6
- Activity Buffer Request 14.16% 197.506us 14.16% 197.506us 197.506us 23.711us 4.19% 23.711us 23.711us 1
- aten::empty_strided 2.10% 29.332us 2.10% 29.332us 4.889us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 15.61% 217.828us 15.61% 217.828us 36.305us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 4.72% 65.792us 6.10% 85.041us 3.543us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 1.38% 19.249us 1.38% 19.249us 0.802us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 16.42% 229.008us 16.42% 229.008us 4.771us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.37% 5.150us 0.37% 5.150us 5.150us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 966.098us 169.87% 966.098us 966.098us 1
+ torch_eager 20.40% 290.715us 99.64% 1.420ms 1.420ms 0.000us 0.00% 592.377us 592.377us 1
+ aten::copy_ 7.41% 105.615us 41.73% 594.574us 33.032us 275.293us 48.40% 298.941us 16.608us 18
+ aten::mul 10.90% 155.244us 18.92% 269.648us 11.235us 227.071us 39.93% 227.071us 9.461us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 227.071us 39.93% 227.071us 9.461us 24
+ aten::clone 1.44% 20.483us 35.30% 502.923us 83.821us 0.000us 0.00% 207.134us 34.522us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 183.486us 32.26% 183.486us 30.581us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.807us 16.14% 91.807us 7.651us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 66.365us 11.67% 66.365us 5.530us 12
+ aten::sub 2.66% 37.929us 4.43% 63.131us 10.522us 33.790us 5.94% 33.790us 5.632us 6
+ aten::add 2.47% 35.251us 4.15% 59.172us 9.862us 32.575us 5.73% 32.575us 5.429us 6
+ Activity Buffer Request 13.81% 196.814us 13.81% 196.814us 196.814us 23.648us 4.16% 23.648us 23.648us 1
+ aten::empty_strided 2.02% 28.790us 2.02% 28.790us 4.798us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.63% 222.685us 15.63% 222.685us 37.114us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 5.20% 74.092us 6.55% 93.282us 3.887us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.35% 19.190us 1.35% 19.190us 0.800us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 16.35% 232.987us 16.35% 232.987us 4.854us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.36% 5.080us 0.36% 5.080us 5.080us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.395ms
-Self CUDA time total: 566.293us
+Self CPU time total: 1.425ms
+Self CUDA time total: 568.729us
@@ -4359,27 +4359,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 912.211us 984.01% 912.211us 912.211us 1
- torch_eager 20.74% 286.708us 99.62% 1.377ms 1.377ms 0.000us 0.00% 93.855us 93.855us 1
- aten::mul 10.48% 144.890us 18.31% 253.080us 10.545us 49.856us 53.78% 49.856us 2.077us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.856us 53.78% 49.856us 2.077us 24
- aten::copy_ 7.33% 101.333us 42.51% 587.542us 32.641us 29.407us 31.72% 30.559us 1.698us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.623us 24.40% 22.623us 1.885us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.440us 14.50% 13.440us 1.120us 12
- aten::clone 1.54% 21.251us 36.76% 508.068us 84.678us 0.000us 0.00% 7.936us 1.323us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 7.32% 6.784us 1.131us 6
- aten::sub 2.53% 34.908us 4.26% 58.910us 9.818us 6.720us 7.25% 6.720us 1.120us 6
- aten::add 2.34% 32.341us 3.97% 54.832us 9.139us 6.720us 7.25% 6.720us 1.120us 6
- Activity Buffer Request 14.89% 205.787us 14.89% 205.787us 205.787us 1.152us 1.24% 1.152us 1.152us 1
- aten::empty_strided 2.09% 28.901us 2.09% 28.901us 4.817us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 15.89% 219.618us 15.89% 219.618us 36.603us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 4.84% 66.885us 6.21% 85.845us 3.577us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 1.37% 18.960us 1.37% 18.960us 0.790us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 15.59% 215.487us 15.59% 215.487us 4.489us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.38% 5.210us 0.38% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 975.032us 1053.20% 975.032us 975.032us 1
+ torch_eager 19.78% 289.798us 99.66% 1.460ms 1.460ms 0.000us 0.00% 93.698us 93.698us 1
+ aten::mul 11.08% 162.260us 19.21% 281.475us 11.728us 49.665us 53.65% 49.665us 2.069us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.665us 53.65% 49.665us 2.069us 24
+ aten::copy_ 7.16% 104.830us 42.02% 615.673us 34.204us 29.441us 31.80% 30.561us 1.698us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.657us 24.47% 22.657us 1.888us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.472us 14.55% 13.472us 1.123us 12
+ aten::clone 1.39% 20.311us 36.25% 531.032us 88.505us 0.000us 0.00% 7.904us 1.317us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 7.33% 6.784us 1.131us 6
+ aten::add 2.30% 33.730us 3.98% 58.302us 9.717us 6.752us 7.29% 6.752us 1.125us 6
+ aten::sub 2.57% 37.640us 4.45% 65.262us 10.877us 6.720us 7.26% 6.720us 1.120us 6
+ Activity Buffer Request 14.75% 216.135us 14.75% 216.135us 216.135us 1.120us 1.21% 1.120us 1.120us 1
+ aten::empty_strided 2.59% 37.931us 2.59% 37.931us 6.322us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.29% 223.986us 15.29% 223.986us 37.331us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.89% 71.623us 6.23% 91.274us 3.803us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.34% 19.651us 1.34% 19.651us 0.819us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 16.53% 242.131us 16.53% 242.131us 5.044us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.34% 5.040us 0.34% 5.040us 5.040us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.382ms
-Self CUDA time total: 92.703us
+Self CPU time total: 1.465ms
+Self CUDA time total: 92.578us
@@ -4389,27 +4389,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 938.901us 973.14% 938.901us 938.901us 1
- torch_eager 11.77% 313.313us 99.82% 2.656ms 2.656ms 0.000us 0.00% 97.825us 97.825us 1
- aten::mul 5.60% 148.957us 9.78% 260.340us 10.847us 51.266us 53.14% 51.266us 2.136us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.266us 53.14% 51.266us 2.136us 24
- aten::copy_ 3.87% 103.023us 68.29% 1.817ms 100.957us 30.976us 32.11% 32.319us 1.795us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.072us 23.91% 23.072us 1.923us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.240us 14.76% 14.240us 1.187us 12
- aten::clone 1.07% 28.429us 65.69% 1.748ms 291.327us 0.000us 0.00% 9.247us 1.541us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 8.19% 7.904us 1.317us 6
- aten::add 1.24% 33.110us 2.10% 56.011us 9.335us 7.137us 7.40% 7.137us 1.189us 6
- aten::sub 1.37% 36.490us 2.25% 59.790us 9.965us 7.103us 7.36% 7.103us 1.184us 6
- Activity Buffer Request 53.84% 1.433ms 53.84% 1.433ms 1.433ms 1.343us 1.39% 1.343us 1.343us 1
- aten::empty_strided 1.19% 31.751us 1.19% 31.751us 5.292us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 8.25% 219.470us 8.25% 219.470us 36.578us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 2.63% 69.934us 3.35% 89.134us 3.714us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.72% 19.200us 0.72% 19.200us 0.800us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 8.25% 219.576us 8.25% 219.576us 4.574us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.18% 4.910us 0.18% 4.910us 4.910us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 963.643us 1001.81% 963.643us 963.643us 1
+ torch_eager 11.60% 311.071us 99.82% 2.676ms 2.676ms 0.000us 0.00% 97.534us 97.534us 1
+ aten::mul 5.66% 151.593us 10.00% 268.127us 11.172us 51.103us 53.13% 51.103us 2.129us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.103us 53.13% 51.103us 2.129us 24
+ aten::copy_ 3.93% 105.441us 68.13% 1.826ms 101.459us 30.911us 32.14% 32.255us 1.792us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.007us 23.92% 23.007us 1.917us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.176us 14.74% 14.176us 1.181us 12
+ aten::clone 1.04% 27.830us 65.21% 1.748ms 291.325us 0.000us 0.00% 9.248us 1.541us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 8.22% 7.904us 1.317us 6
+ aten::sub 1.38% 37.040us 2.30% 61.581us 10.264us 7.103us 7.38% 7.103us 1.184us 6
+ aten::add 1.19% 32.000us 2.05% 54.860us 9.143us 7.073us 7.35% 7.073us 1.179us 6
+ Activity Buffer Request 53.57% 1.436ms 53.57% 1.436ms 1.436ms 1.344us 1.40% 1.344us 1.344us 1
+ aten::empty_strided 1.19% 31.921us 1.19% 31.921us 5.320us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.14% 218.236us 8.14% 218.236us 36.373us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.76% 74.059us 3.52% 94.290us 3.929us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.75% 20.231us 0.75% 20.231us 0.843us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.60% 230.408us 8.60% 230.408us 4.800us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.18% 4.700us 0.18% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.661ms
-Self CUDA time total: 96.482us
+Self CPU time total: 2.681ms
+Self CUDA time total: 96.190us
@@ -4419,27 +4419,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 932.446us 897.69% 932.446us 932.446us 1
- torch_eager 11.60% 307.685us 99.81% 2.647ms 2.647ms 0.000us 0.00% 105.184us 105.184us 1
- aten::mul 5.51% 146.123us 9.64% 255.679us 10.653us 55.362us 53.30% 55.362us 2.307us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.362us 53.30% 55.362us 2.307us 24
- aten::copy_ 3.78% 100.194us 68.64% 1.821ms 101.144us 32.478us 31.27% 33.790us 1.877us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.703us 23.78% 24.703us 2.059us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.032us 15.43% 16.032us 1.336us 12
- aten::clone 1.02% 27.179us 65.92% 1.748ms 291.378us 0.000us 0.00% 9.087us 1.515us 6
- aten::add 1.19% 31.489us 2.03% 53.840us 8.973us 8.064us 7.76% 8.064us 1.344us 6
- aten::sub 1.35% 35.692us 2.26% 59.843us 9.974us 7.968us 7.67% 7.968us 1.328us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 7.49% 7.775us 1.296us 6
- Activity Buffer Request 54.18% 1.437ms 54.18% 1.437ms 1.437ms 1.312us 1.26% 1.312us 1.312us 1
- aten::empty_strided 1.21% 32.003us 1.21% 32.003us 5.334us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 8.25% 218.717us 8.25% 218.717us 36.453us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 2.67% 70.760us 3.41% 90.371us 3.765us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.74% 19.611us 0.74% 19.611us 0.817us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 8.32% 220.800us 8.32% 220.800us 4.600us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.19% 5.070us 0.19% 5.070us 5.070us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 984.120us 950.08% 984.120us 984.120us 1
+ torch_eager 21.32% 307.609us 99.66% 1.438ms 1.438ms 0.000us 0.00% 104.863us 104.863us 1
+ aten::mul 11.11% 160.241us 19.03% 274.535us 11.439us 55.232us 53.32% 55.232us 2.301us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.232us 53.32% 55.232us 2.301us 24
+ aten::copy_ 7.56% 109.063us 40.34% 581.983us 32.332us 32.383us 31.26% 33.663us 1.870us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.639us 23.79% 24.639us 2.053us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.968us 15.42% 15.968us 1.331us 12
+ aten::clone 1.50% 21.672us 34.18% 493.044us 82.174us 0.000us 0.00% 9.024us 1.504us 6
+ aten::add 2.60% 37.520us 4.33% 62.511us 10.418us 8.031us 7.75% 8.031us 1.339us 6
+ aten::sub 2.72% 39.231us 4.56% 65.841us 10.973us 7.937us 7.66% 7.937us 1.323us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.48% 7.744us 1.291us 6
+ Activity Buffer Request 13.05% 188.244us 13.05% 188.244us 188.244us 1.280us 1.24% 1.280us 1.280us 1
+ aten::empty_strided 2.28% 32.882us 2.28% 32.882us 5.480us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 14.94% 215.555us 14.94% 215.555us 35.926us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.93% 71.162us 6.28% 90.612us 3.776us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.35% 19.450us 1.35% 19.450us 0.810us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 16.29% 235.016us 16.29% 235.016us 4.896us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.34% 4.880us 0.34% 4.880us 4.880us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.652ms
-Self CUDA time total: 103.872us
+Self CPU time total: 1.443ms
+Self CUDA time total: 103.583us
@@ -4449,27 +4449,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 914.130us 736.81% 914.130us 914.130us 1
- torch_eager 19.76% 284.015us 99.65% 1.432ms 1.432ms 0.000us 0.00% 125.858us 125.858us 1
- aten::mul 10.20% 146.586us 17.70% 254.419us 10.601us 65.313us 52.64% 65.313us 2.721us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.313us 52.64% 65.313us 2.721us 24
- aten::copy_ 7.71% 110.793us 44.82% 644.172us 35.787us 39.489us 31.83% 41.281us 2.293us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.961us 23.34% 28.961us 2.413us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.264us 15.53% 19.264us 1.605us 12
- aten::clone 1.45% 20.820us 39.14% 562.560us 93.760us 0.000us 0.00% 12.320us 2.053us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 8.49% 10.528us 1.755us 6
- aten::add 2.34% 33.572us 3.91% 56.142us 9.357us 9.664us 7.79% 9.664us 1.611us 6
- aten::sub 2.40% 34.530us 4.02% 57.751us 9.625us 9.600us 7.74% 9.600us 1.600us 6
- Activity Buffer Request 17.82% 256.078us 17.82% 256.078us 256.078us 1.792us 1.44% 1.792us 1.792us 1
- aten::empty_strided 2.04% 29.262us 2.04% 29.262us 4.877us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 14.99% 215.437us 14.99% 215.437us 35.906us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 4.63% 66.508us 5.96% 85.660us 3.569us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 1.33% 19.152us 1.33% 19.152us 0.798us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 14.99% 215.488us 14.99% 215.488us 4.489us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.35% 5.000us 0.35% 5.000us 5.000us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 935.122us 757.84% 935.122us 935.122us 1
+ torch_eager 19.99% 283.519us 99.60% 1.412ms 1.412ms 0.000us 0.00% 125.153us 125.153us 1
+ aten::mul 10.97% 155.634us 18.77% 266.135us 11.089us 65.024us 52.70% 65.024us 2.709us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.024us 52.70% 65.024us 2.709us 24
+ aten::copy_ 7.53% 106.809us 43.10% 611.203us 33.956us 39.201us 31.77% 40.961us 2.276us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.769us 23.31% 28.769us 2.397us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.168us 15.53% 19.168us 1.597us 12
+ aten::clone 1.50% 21.262us 37.00% 524.722us 87.454us 0.000us 0.00% 12.192us 2.032us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 8.45% 10.432us 1.739us 6
+ aten::add 2.41% 34.151us 3.94% 55.922us 9.320us 9.664us 7.83% 9.664us 1.611us 6
+ aten::sub 2.49% 35.371us 4.21% 59.711us 9.952us 9.504us 7.70% 9.504us 1.584us 6
+ Activity Buffer Request 14.55% 206.375us 14.55% 206.375us 206.375us 1.760us 1.43% 1.760us 1.760us 1
+ aten::empty_strided 2.12% 30.049us 2.12% 30.049us 5.008us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 16.20% 229.735us 16.20% 229.735us 38.289us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.63% 65.693us 5.97% 84.623us 3.526us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.33% 18.930us 1.33% 18.930us 0.789us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.86% 224.896us 15.86% 224.896us 4.685us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.40% 5.729us 0.40% 5.729us 5.729us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.437ms
-Self CUDA time total: 124.066us
+Self CPU time total: 1.418ms
+Self CUDA time total: 123.393us
@@ -4479,27 +4479,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 921.138us 886.26% 921.138us 921.138us 1
- torch_eager 20.59% 281.307us 99.64% 1.361ms 1.361ms 0.000us 0.00% 105.280us 105.280us 1
- aten::mul 10.84% 148.087us 18.91% 258.361us 10.765us 55.487us 53.39% 55.487us 2.312us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.487us 53.39% 55.487us 2.312us 24
- aten::copy_ 7.39% 100.946us 41.35% 564.842us 31.380us 32.481us 31.25% 33.825us 1.879us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.640us 23.71% 24.640us 2.053us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.968us 15.36% 15.968us 1.331us 12
- aten::clone 1.54% 21.041us 35.66% 487.118us 81.186us 0.000us 0.00% 9.185us 1.531us 6
- aten::sub 2.75% 37.531us 4.47% 61.012us 10.169us 8.031us 7.73% 8.031us 1.339us 6
- aten::add 2.35% 32.112us 3.97% 54.222us 9.037us 7.937us 7.64% 7.937us 1.323us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.841us 7.54% 7.841us 1.307us 6
- Activity Buffer Request 13.62% 186.046us 13.62% 186.046us 186.046us 1.344us 1.29% 1.344us 1.344us 1
- aten::empty_strided 2.20% 30.110us 2.20% 30.110us 5.018us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 15.76% 215.337us 15.76% 215.337us 35.890us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 5.18% 70.704us 6.60% 90.193us 3.758us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 1.43% 19.489us 1.43% 19.489us 0.812us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 15.99% 218.378us 15.99% 218.378us 4.550us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.36% 4.960us 0.36% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 964.101us 931.86% 964.101us 964.101us 1
+ torch_eager 11.58% 311.269us 99.80% 2.682ms 2.682ms 0.000us 0.00% 104.772us 104.772us 1
+ aten::mul 5.74% 154.165us 9.94% 267.067us 11.128us 55.236us 53.39% 55.236us 2.301us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.236us 53.39% 55.236us 2.301us 24
+ aten::copy_ 4.07% 109.351us 68.30% 1.836ms 101.989us 32.287us 31.21% 33.599us 1.867us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.511us 23.69% 24.511us 2.043us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.937us 15.40% 15.937us 1.328us 12
+ aten::clone 1.02% 27.532us 65.06% 1.749ms 291.482us 0.000us 0.00% 9.088us 1.515us 6
+ aten::add 1.31% 35.310us 2.20% 59.141us 9.857us 7.969us 7.70% 7.969us 1.328us 6
+ aten::sub 1.38% 37.131us 2.33% 62.602us 10.434us 7.968us 7.70% 7.968us 1.328us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 7.52% 7.776us 1.296us 6
+ Activity Buffer Request 53.54% 1.439ms 53.54% 1.439ms 1.439ms 1.312us 1.27% 1.312us 1.312us 1
+ aten::empty_strided 1.12% 30.190us 1.12% 30.190us 5.032us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.09% 217.335us 8.09% 217.335us 36.223us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.62% 70.291us 3.31% 88.901us 3.704us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.69% 18.610us 0.69% 18.610us 0.775us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.64% 232.137us 8.64% 232.137us 4.836us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.20% 5.481us 0.20% 5.481us 5.481us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.366ms
-Self CUDA time total: 103.936us
+Self CPU time total: 2.688ms
+Self CUDA time total: 103.460us
@@ -4509,27 +4509,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 943.466us 759.69% 943.466us 943.466us 1
- torch_eager 21.73% 302.071us 99.63% 1.385ms 1.385ms 0.000us 0.00% 125.950us 125.950us 1
- aten::mul 10.55% 146.657us 18.63% 259.039us 10.793us 65.378us 52.64% 65.378us 2.724us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.378us 52.64% 65.378us 2.724us 24
- aten::copy_ 7.63% 106.103us 41.12% 571.631us 31.757us 39.519us 31.82% 41.278us 2.293us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 29.024us 23.37% 29.024us 2.419us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.294us 15.54% 19.294us 1.608us 12
- aten::clone 1.52% 21.080us 35.11% 488.057us 81.343us 0.000us 0.00% 12.254us 2.042us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.495us 8.45% 10.495us 1.749us 6
- aten::sub 2.46% 34.153us 4.15% 57.634us 9.606us 9.727us 7.83% 9.727us 1.621us 6
- aten::add 2.41% 33.450us 4.05% 56.342us 9.390us 9.567us 7.70% 9.567us 1.595us 6
- Activity Buffer Request 13.70% 190.466us 13.70% 190.466us 190.466us 1.759us 1.42% 1.759us 1.759us 1
- aten::empty_strided 2.14% 29.791us 2.14% 29.791us 4.965us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 15.29% 212.610us 15.29% 212.610us 35.435us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 4.88% 67.802us 6.29% 87.511us 3.646us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 1.42% 19.709us 1.42% 19.709us 0.821us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 15.91% 221.207us 15.91% 221.207us 4.608us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.37% 5.080us 0.37% 5.080us 5.080us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 964.072us 780.68% 964.072us 964.072us 1
+ torch_eager 11.45% 316.268us 99.81% 2.758ms 2.758ms 0.000us 0.00% 125.283us 125.283us 1
+ aten::mul 5.46% 150.776us 9.46% 261.336us 10.889us 65.090us 52.71% 65.090us 2.712us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.090us 52.71% 65.090us 2.712us 24
+ aten::copy_ 3.85% 106.511us 68.83% 1.902ms 105.647us 39.266us 31.80% 41.058us 2.281us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.802us 23.32% 28.802us 2.400us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.135us 15.50% 19.135us 1.595us 12
+ aten::clone 1.09% 30.231us 66.11% 1.827ms 304.441us 0.000us 0.00% 12.256us 2.043us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.464us 8.47% 10.464us 1.744us 6
+ aten::add 1.22% 33.650us 2.08% 57.431us 9.572us 9.599us 7.77% 9.599us 1.600us 6
+ aten::sub 1.35% 37.292us 2.48% 68.652us 11.442us 9.536us 7.72% 9.536us 1.589us 6
+ Activity Buffer Request 54.53% 1.507ms 54.53% 1.507ms 1.507ms 1.792us 1.45% 1.792us 1.792us 1
+ aten::empty_strided 1.19% 32.821us 1.19% 32.821us 5.470us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.01% 221.424us 8.01% 221.424us 36.904us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.55% 70.592us 3.23% 89.363us 3.723us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.68% 18.771us 0.68% 18.771us 0.782us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.42% 232.664us 8.42% 232.664us 4.847us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.19% 5.190us 0.19% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.390ms
-Self CUDA time total: 124.191us
+Self CPU time total: 2.763ms
+Self CUDA time total: 123.491us
@@ -4539,27 +4539,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 909.497us 512.75% 909.497us 909.497us 1
- torch_eager 20.85% 278.298us 99.63% 1.330ms 1.330ms 0.000us 0.00% 180.288us 180.288us 1
- aten::mul 10.86% 144.977us 19.10% 254.920us 10.622us 94.591us 53.33% 94.591us 3.941us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.591us 53.33% 94.591us 3.941us 24
- aten::copy_ 7.76% 103.603us 40.90% 545.870us 30.326us 57.919us 32.65% 60.831us 3.380us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.767us 22.98% 40.767us 3.397us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.866us 14.02% 24.866us 2.072us 12
- aten::clone 1.59% 21.200us 34.96% 466.526us 77.754us 0.000us 0.00% 20.064us 3.344us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.152us 9.67% 17.152us 2.859us 6
- aten::sub 2.64% 35.242us 4.38% 58.452us 9.742us 12.450us 7.02% 12.450us 2.075us 6
- aten::add 2.38% 31.821us 4.13% 55.081us 9.180us 12.416us 7.00% 12.416us 2.069us 6
- Activity Buffer Request 12.93% 172.606us 12.93% 172.606us 172.606us 2.912us 1.64% 2.912us 2.912us 1
- aten::empty_strided 2.27% 30.341us 2.27% 30.341us 5.057us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 15.64% 208.798us 15.64% 208.798us 34.800us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 4.99% 66.616us 6.40% 85.475us 3.561us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 1.41% 18.859us 1.41% 18.859us 0.786us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 16.28% 217.276us 16.28% 217.276us 4.527us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.37% 5.001us 0.37% 5.001us 5.001us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 934.855us 527.63% 934.855us 934.855us 1
+ torch_eager 19.51% 283.728us 99.66% 1.450ms 1.450ms 0.000us 0.00% 180.061us 180.061us 1
+ aten::mul 10.43% 151.748us 18.10% 263.338us 10.972us 95.007us 53.62% 95.007us 3.959us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.007us 53.62% 95.007us 3.959us 24
+ aten::copy_ 7.11% 103.461us 44.35% 645.065us 35.837us 57.664us 32.55% 60.544us 3.364us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.608us 22.92% 40.608us 3.384us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.510us 13.83% 24.510us 2.042us 12
+ aten::clone 1.46% 21.280us 38.39% 558.424us 93.071us 0.000us 0.00% 19.936us 3.323us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 9.63% 17.056us 2.843us 6
+ aten::add 2.36% 34.271us 3.99% 58.001us 9.667us 12.287us 6.93% 12.287us 2.048us 6
+ aten::sub 2.55% 37.161us 4.24% 61.641us 10.274us 12.223us 6.90% 12.223us 2.037us 6
+ Activity Buffer Request 17.53% 255.006us 17.53% 255.006us 255.006us 2.880us 1.63% 2.880us 2.880us 1
+ aten::empty_strided 2.02% 29.311us 2.02% 29.311us 4.885us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.21% 221.267us 15.21% 221.267us 36.878us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.73% 68.750us 6.01% 87.372us 3.641us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.28% 18.622us 1.28% 18.622us 0.776us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.48% 225.131us 15.48% 225.131us 4.690us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.34% 4.880us 0.34% 4.880us 4.880us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.335ms
-Self CUDA time total: 177.376us
+Self CPU time total: 1.455ms
+Self CUDA time total: 177.181us
@@ -4569,27 +4569,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 908.914us 305.78% 908.914us 908.914us 1
- torch_eager 20.55% 283.527us 99.64% 1.375ms 1.375ms 0.000us 0.00% 314.296us 314.296us 1
- aten::mul 10.61% 146.340us 18.54% 255.803us 10.658us 145.086us 48.81% 145.086us 6.045us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.086us 48.81% 145.086us 6.045us 24
- aten::copy_ 7.34% 101.324us 42.67% 588.790us 32.711us 111.099us 37.38% 128.154us 7.120us 18
- aten::clone 1.50% 20.722us 37.09% 511.699us 85.283us 0.000us 0.00% 70.718us 11.786us 6
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.436us 19.32% 57.436us 4.786us 12
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.663us 18.05% 53.663us 8.944us 6
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.056us 13.81% 41.056us 3.421us 12
- aten::sub 2.49% 34.330us 4.16% 57.351us 9.558us 20.672us 6.95% 20.672us 3.445us 6
- aten::add 2.29% 31.611us 3.89% 53.723us 8.954us 20.384us 6.86% 20.384us 3.397us 6
- Activity Buffer Request 15.84% 218.487us 15.84% 218.487us 218.487us 17.055us 5.74% 17.055us 17.055us 1
- aten::empty_strided 2.18% 30.110us 2.18% 30.110us 5.018us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 15.10% 208.357us 15.10% 208.357us 34.726us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 4.74% 65.442us 6.15% 84.803us 3.533us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 1.40% 19.361us 1.40% 19.361us 0.807us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 15.60% 215.218us 15.60% 215.218us 4.484us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.36% 4.930us 0.36% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 936.902us 314.34% 936.902us 936.902us 1
+ torch_eager 19.95% 279.505us 99.63% 1.396ms 1.396ms 0.000us 0.00% 315.267us 315.267us 1
+ aten::mul 10.85% 152.079us 18.94% 265.395us 11.058us 146.176us 49.04% 146.176us 6.091us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 146.176us 49.04% 146.176us 6.091us 24
+ aten::copy_ 7.66% 107.385us 42.60% 596.937us 33.163us 110.978us 37.23% 128.194us 7.122us 18
+ aten::clone 1.45% 20.319us 36.31% 508.783us 84.797us 0.000us 0.00% 70.625us 11.771us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.569us 19.32% 57.569us 4.797us 12
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.409us 17.92% 53.409us 8.902us 6
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.897us 13.72% 40.897us 3.408us 12
+ aten::sub 2.61% 36.531us 4.38% 61.402us 10.234us 20.449us 6.86% 20.449us 3.408us 6
+ aten::add 2.39% 33.533us 3.98% 55.753us 9.292us 20.448us 6.86% 20.448us 3.408us 6
+ Activity Buffer Request 14.75% 206.705us 14.75% 206.705us 206.705us 17.216us 5.78% 17.216us 17.216us 1
+ aten::empty_strided 2.13% 29.842us 2.13% 29.842us 4.974us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.44% 216.385us 15.44% 216.385us 36.064us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.91% 68.874us 6.21% 87.042us 3.627us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.30% 18.168us 1.30% 18.168us 0.757us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 16.19% 226.869us 16.19% 226.869us 4.726us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.37% 5.161us 0.37% 5.161us 5.161us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.380ms
-Self CUDA time total: 297.241us
+Self CPU time total: 1.401ms
+Self CUDA time total: 298.051us
@@ -4599,27 +4599,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 939.162us 529.48% 939.162us 939.162us 1
- torch_eager 11.57% 307.472us 99.80% 2.653ms 2.653ms 0.000us 0.00% 180.256us 180.256us 1
- aten::mul 5.55% 147.649us 9.66% 256.649us 10.694us 94.851us 53.47% 94.851us 3.952us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.851us 53.47% 94.851us 3.952us 24
- aten::copy_ 3.85% 102.292us 68.52% 1.821ms 101.186us 57.759us 32.56% 60.639us 3.369us 18
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.671us 22.93% 40.671us 3.389us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.766us 13.96% 24.766us 2.064us 12
- aten::clone 1.06% 28.080us 65.81% 1.749ms 291.547us 0.000us 0.00% 19.968us 3.328us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.088us 9.63% 17.088us 2.848us 6
- aten::add 1.13% 30.133us 1.96% 52.053us 8.675us 12.384us 6.98% 12.384us 2.064us 6
- aten::sub 1.27% 33.752us 2.15% 57.162us 9.527us 12.382us 6.98% 12.382us 2.064us 6
- Activity Buffer Request 54.50% 1.449ms 54.50% 1.449ms 1.449ms 2.880us 1.62% 2.880us 2.880us 1
- aten::empty_strided 1.13% 30.142us 1.13% 30.142us 5.024us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 7.84% 208.428us 7.84% 208.428us 34.738us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 3.02% 80.309us 3.76% 99.911us 4.163us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.74% 19.602us 0.74% 19.602us 0.817us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 8.14% 216.293us 8.14% 216.293us 4.506us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.20% 5.200us 0.20% 5.200us 5.200us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 953.069us 538.57% 953.069us 953.069us 1
+ torch_eager 19.36% 280.983us 99.62% 1.446ms 1.446ms 0.000us 0.00% 179.812us 179.812us 1
+ aten::mul 10.74% 155.876us 18.65% 270.688us 11.279us 94.916us 53.64% 94.916us 3.955us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.916us 53.64% 94.916us 3.955us 24
+ aten::copy_ 7.70% 111.823us 43.62% 633.117us 35.173us 57.568us 32.53% 60.416us 3.356us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.544us 22.91% 40.544us 3.379us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.480us 13.83% 24.480us 2.040us 12
+ aten::clone 1.50% 21.731us 37.58% 545.384us 90.897us 0.000us 0.00% 19.872us 3.312us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 9.62% 17.024us 2.837us 6
+ aten::add 2.38% 34.509us 4.05% 58.781us 9.797us 12.256us 6.93% 12.256us 2.043us 6
+ aten::sub 2.51% 36.442us 4.13% 59.923us 9.987us 12.224us 6.91% 12.224us 2.037us 6
+ Activity Buffer Request 15.40% 223.485us 15.40% 223.485us 223.485us 2.848us 1.61% 2.848us 2.848us 1
+ aten::empty_strided 2.13% 30.930us 2.13% 30.930us 5.155us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.79% 229.197us 15.79% 229.197us 38.200us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.88% 70.882us 6.18% 89.652us 3.735us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.29% 18.770us 1.29% 18.770us 0.782us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.93% 231.177us 15.93% 231.177us 4.816us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.38% 5.510us 0.38% 5.510us 5.510us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.658ms
-Self CUDA time total: 177.376us
+Self CPU time total: 1.451ms
+Self CUDA time total: 176.964us
@@ -4629,27 +4629,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 942.515us 317.36% 942.515us 942.515us 1
- torch_eager 20.57% 285.923us 99.62% 1.385ms 1.385ms 0.000us 0.00% 314.717us 314.717us 1
- aten::mul 10.73% 149.116us 18.62% 258.870us 10.786us 145.439us 48.97% 145.439us 6.060us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.439us 48.97% 145.439us 6.060us 24
- aten::copy_ 7.46% 103.659us 42.33% 588.488us 32.694us 110.749us 37.29% 128.477us 7.138us 18
- aten::clone 1.56% 21.753us 36.61% 508.959us 84.826us 0.000us 0.00% 71.104us 11.851us 6
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.373us 19.32% 57.373us 4.781us 12
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.376us 17.97% 53.376us 8.896us 6
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.801us 13.74% 40.801us 3.400us 12
- aten::sub 2.38% 33.081us 4.03% 56.021us 9.337us 20.449us 6.89% 20.449us 3.408us 6
- aten::add 2.40% 33.331us 4.05% 56.271us 9.379us 20.352us 6.85% 20.352us 3.392us 6
- Activity Buffer Request 14.18% 197.118us 14.18% 197.118us 197.118us 17.728us 5.97% 17.728us 17.728us 1
- aten::empty_strided 2.21% 30.780us 2.21% 30.780us 5.130us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 16.19% 225.018us 16.19% 225.018us 37.503us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 4.87% 67.722us 6.24% 86.713us 3.613us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 1.37% 18.991us 1.37% 18.991us 0.791us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 15.71% 218.327us 15.71% 218.327us 4.548us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.38% 5.310us 0.38% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 992.756us 332.77% 992.756us 992.756us 1
+ torch_eager 20.12% 289.006us 99.66% 1.432ms 1.432ms 0.000us 0.00% 316.222us 316.222us 1
+ aten::mul 11.31% 162.528us 19.47% 279.759us 11.657us 146.880us 49.23% 146.880us 6.120us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 146.880us 49.23% 146.880us 6.120us 24
+ aten::copy_ 7.73% 111.012us 41.48% 595.895us 33.105us 110.942us 37.19% 128.830us 7.157us 18
+ aten::clone 1.55% 22.310us 35.21% 505.793us 84.299us 0.000us 0.00% 71.424us 11.904us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.406us 19.24% 57.406us 4.784us 12
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.536us 17.94% 53.536us 8.923us 6
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.512us 13.58% 40.512us 3.376us 12
+ aten::add 2.53% 36.289us 4.25% 61.011us 10.169us 20.352us 6.82% 20.352us 3.392us 6
+ aten::sub 2.59% 37.162us 4.41% 63.291us 10.549us 20.160us 6.76% 20.160us 3.360us 6
+ Activity Buffer Request 13.10% 188.164us 13.10% 188.164us 188.164us 17.888us 6.00% 17.888us 17.888us 1
+ aten::empty_strided 2.24% 32.121us 2.24% 32.121us 5.354us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.74% 226.067us 15.74% 226.067us 37.678us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.81% 69.111us 6.15% 88.363us 3.682us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.34% 19.252us 1.34% 19.252us 0.802us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 16.62% 238.734us 16.62% 238.734us 4.974us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.34% 4.940us 0.34% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.390ms
-Self CUDA time total: 296.989us
+Self CPU time total: 1.437ms
+Self CUDA time total: 298.334us
@@ -4659,27 +4659,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 928.214us 158.30% 928.214us 928.214us 1
- torch_eager 21.21% 285.194us 99.61% 1.340ms 1.340ms 0.000us 0.00% 610.012us 610.012us 1
- aten::copy_ 7.59% 102.047us 40.19% 540.521us 30.029us 268.445us 45.78% 292.093us 16.227us 18
- aten::mul 11.07% 148.860us 19.42% 261.184us 10.883us 251.679us 42.92% 251.679us 10.487us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 251.679us 42.92% 251.679us 10.487us 24
- aten::clone 1.57% 21.069us 34.26% 460.696us 76.783us 0.000us 0.00% 201.406us 33.568us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 177.758us 30.32% 177.758us 29.626us 6
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.687us 15.47% 90.687us 7.557us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 66.240us 11.30% 66.240us 5.520us 12
- aten::sub 2.72% 36.642us 4.50% 60.582us 10.097us 33.152us 5.65% 33.152us 5.525us 6
- aten::add 2.29% 30.800us 3.93% 52.901us 8.817us 33.088us 5.64% 33.088us 5.515us 6
- Activity Buffer Request 12.31% 165.596us 12.31% 165.596us 165.596us 23.648us 4.03% 23.648us 23.648us 1
- aten::empty_strided 2.19% 29.501us 2.19% 29.501us 4.917us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 15.63% 210.266us 15.63% 210.266us 35.044us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 5.16% 69.374us 6.60% 88.734us 3.697us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 1.44% 19.360us 1.44% 19.360us 0.807us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 16.43% 220.977us 16.43% 220.977us 4.604us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 0.39% 5.180us 0.39% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 957.657us 163.29% 957.657us 957.657us 1
+ torch_eager 20.09% 288.813us 99.63% 1.432ms 1.432ms 0.000us 0.00% 610.425us 610.425us 1
+ aten::copy_ 7.31% 105.011us 42.63% 612.724us 34.040us 268.572us 45.79% 292.508us 16.250us 18
+ aten::mul 10.71% 153.870us 18.84% 270.776us 11.282us 252.607us 43.07% 252.607us 10.525us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 252.607us 43.07% 252.607us 10.525us 24
+ aten::clone 1.42% 20.480us 36.58% 525.692us 87.615us 0.000us 0.00% 201.566us 33.594us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 177.630us 30.29% 177.630us 29.605us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.942us 15.51% 90.942us 7.578us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.310us 11.14% 65.310us 5.443us 12
+ aten::sub 2.69% 38.720us 4.45% 63.991us 10.665us 32.991us 5.63% 32.991us 5.499us 6
+ aten::add 2.37% 34.041us 3.93% 56.461us 9.410us 32.319us 5.51% 32.319us 5.387us 6
+ Activity Buffer Request 15.99% 229.866us 15.99% 229.866us 229.866us 23.936us 4.08% 23.936us 23.936us 1
+ aten::empty_strided 2.02% 29.010us 2.02% 29.010us 4.835us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 14.72% 211.585us 14.72% 211.585us 35.264us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.83% 69.478us 6.24% 89.671us 3.736us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.40% 20.193us 1.40% 20.193us 0.841us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 16.06% 230.859us 16.06% 230.859us 4.810us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.37% 5.320us 0.37% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.345ms
-Self CUDA time total: 586.364us
+Self CPU time total: 1.437ms
+Self CUDA time total: 586.489us
@@ -4689,35 +4689,35 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 9.32% 323.657us 76.63% 2.662ms 2.662ms 0.000us 0.00% 1.834ms 1.834ms 1
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.806ms 102.11% 1.806ms 1.806ms 1
- aten::copy_ 3.12% 108.276us 52.46% 1.822ms 101.225us 791.134us 44.74% 857.278us 47.627us 18
- aten::mul 4.16% 144.572us 7.37% 256.109us 10.671us 827.198us 46.78% 827.198us 34.467us 24
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 827.198us 46.78% 827.198us 34.467us 24
- aten::clone 0.81% 28.142us 50.15% 1.742ms 290.300us 0.000us 0.00% 624.095us 104.016us 6
- Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 557.951us 31.55% 557.951us 92.992us 6
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 233.183us 13.19% 233.183us 19.432us 12
-void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 149.919us 8.48% 149.919us 12.493us 12
- aten::sub 0.98% 34.102us 1.65% 57.362us 9.560us 90.368us 5.11% 90.368us 15.061us 6
- Activity Buffer Request 41.53% 1.443ms 41.53% 1.443ms 1.443ms 66.144us 3.74% 66.144us 66.144us 1
- aten::add 0.89% 30.740us 1.53% 53.293us 8.882us 59.551us 3.37% 59.551us 9.925us 6
- aten::empty_strided 0.86% 29.871us 0.86% 29.871us 4.979us 0.000us 0.00% 0.000us 0.000us 6
- cudaMemcpyAsync 5.94% 206.426us 5.94% 206.426us 34.404us 0.000us 0.00% 0.000us 0.000us 6
- aten::slice 2.06% 71.442us 2.62% 91.034us 3.793us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.56% 19.592us 0.56% 19.592us 0.816us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 6.40% 222.192us 6.40% 222.192us 4.629us 0.000us 0.00% 0.000us 0.000us 48
- cudaDeviceSynchronize 23.37% 811.698us 23.37% 811.698us 811.698us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 9.43% 329.378us 77.87% 2.720ms 2.720ms 0.000us 0.00% 1.842ms 1.842ms 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.815ms 102.19% 1.815ms 1.815ms 1
+ aten::copy_ 3.09% 107.951us 52.68% 1.840ms 102.235us 794.051us 44.71% 860.068us 47.782us 18
+ aten::mul 4.59% 160.365us 8.02% 279.997us 11.667us 834.368us 46.99% 834.368us 34.765us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 834.368us 46.99% 834.368us 34.765us 24
+ aten::clone 0.80% 28.034us 50.14% 1.751ms 291.882us 0.000us 0.00% 627.394us 104.566us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 561.377us 31.61% 561.377us 93.563us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 232.674us 13.10% 232.674us 19.389us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 147.392us 8.30% 147.392us 12.283us 12
+ aten::sub 1.14% 39.970us 1.89% 66.170us 11.028us 89.952us 5.07% 89.952us 14.992us 6
+ Activity Buffer Request 41.31% 1.443ms 41.31% 1.443ms 1.443ms 66.017us 3.72% 66.017us 66.017us 1
+ aten::add 0.95% 33.281us 1.61% 56.271us 9.379us 57.440us 3.23% 57.440us 9.573us 6
+ aten::empty_strided 0.85% 29.670us 0.85% 29.670us 4.945us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 6.22% 217.146us 6.22% 217.146us 36.191us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.01% 70.292us 2.58% 90.182us 3.758us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.57% 19.890us 0.57% 19.890us 0.829us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 6.90% 240.975us 6.90% 240.975us 5.020us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 22.13% 773.090us 22.13% 773.090us 773.090us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 3.473ms
-Self CUDA time total: 1.768ms
+Self CPU time total: 3.493ms
+Self CUDA time total: 1.776ms
impl wl p50(ms) ok
torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
-torch_eager cuda_B1_S128_H8_D64_R32 0.17 True
-torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True
+torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
+torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True
torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True
torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True
torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True
@@ -4735,7 +4735,7 @@ torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
torch_eager cuda_B2_S512_H32_D128_R64 0.22 True
torch_eager cuda_B2_S512_H32_D64_R32 0.22 True
-torch_eager cuda_B2_S512_H8_D128_R64 0.22 True
+torch_eager cuda_B2_S512_H8_D128_R64 0.21 True
torch_eager cuda_B2_S512_H8_D64_R32 0.22 True
diff --git a/rotary/index.html b/rotary/index.html
index cb1be8e4d680b5623caf2d05c1be684b075964b4..5ff503336b04c290f15ed24958b96a45568efad3 100644
--- a/rotary/index.html
+++ b/rotary/index.html
@@ -1,89 +1,3879 @@
-
+
-
-
-
Index of /rotary
-
+
+
+
index
+
+
+
+
+
-
-
Index of /rotary
-
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
Rotary Position Embeddings Benchmarks
+
This directory contains benchmarks for Rotary Position Embeddings (RoPE) implementations.
+
Implementations
+
+
Results
+
+
+
\ No newline at end of file
diff --git a/rotary/results/artifacts/combine/latency.svg b/rotary/results/artifacts/combine/latency.svg
index 793d43c4ad9f51efa85fd8e3504aaff6f6bbc3ad..3fdefb46544d73b9bc85fc2ae3e00add87b86535 100644
--- a/rotary/results/artifacts/combine/latency.svg
+++ b/rotary/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:0517a426384d0bc9df1932ace04595ea1867cb036e7fbeced61eb044cff2e335
+oid sha256:36e71e631ab1a00097df3bc72a4532b4b383ed31a1df2368bd041e765254a9c3
size 31018
diff --git a/rotary/results/combined_results.html b/rotary/results/combined_results.html
index a1cdc49d1fc3934c88244cb81845c6ffb97c9784..17475d0e65452d0f310ef38d60c5c80c88e6833b 100644
--- a/rotary/results/combined_results.html
+++ b/rotary/results/combined_results.html
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
- 2025-10-28T14:09:08.848427
+ 2025-10-29T14:27:54.393501
image/svg+xml
@@ -4216,70 +4216,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
+
-
+
- 0.2
+ 0.2
-
+
-
+
- 0.3
+ 0.3
-
+
-
+
- 0.4
+ 0.4
-
+
-
+
- 0.5
+ 0.5
-
+
-
+
- 0.6
+ 0.6
@@ -4287,34 +4287,34 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -4364,7 +4364,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
▼ output
▶ uv-logs
|
-Cell: combine | 4.36s
+Cell: combine | 4.35s
| ▶ run
Copy
Raw
@@ -4453,7 +4453,7 @@ COMBINED BENCHMARK SUMMARY
impl wl p50(ms) ok
hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False
hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.09 False
-hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.09 False
+hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False
hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False
hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.09 False
hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 False
@@ -4478,8 +4478,8 @@ hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False
torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
-torch_eager cuda_B1_S128_H8_D64_R32 0.17 True
-torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True
+torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
+torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True
torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True
torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True
torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True
@@ -4497,7 +4497,7 @@ torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
torch_eager cuda_B2_S512_H32_D128_R64 0.22 True
torch_eager cuda_B2_S512_H32_D64_R32 0.22 True
-torch_eager cuda_B2_S512_H8_D128_R64 0.22 True
+torch_eager cuda_B2_S512_H8_D128_R64 0.21 True
torch_eager cuda_B2_S512_H8_D64_R32 0.22 True
GENERATING COMBINED VISUALIZATION
@@ -4518,7 +4518,7 @@ Implementations included:
-Installed 37 packages in 219ms
+Installed 37 packages in 239ms
@@ -4531,7 +4531,7 @@ Installed 37 packages in 219ms
- 2025-10-28T14:09:08.848427
+ 2025-10-29T14:27:54.393501
image/svg+xml
@@ -4875,70 +4875,70 @@ Installed 37 packages in 219ms
-
+
-
+
- 0.2
+ 0.2
-
+
-
+
- 0.3
+ 0.3
-
+
-
+
- 0.4
+ 0.4
-
+
-
+
- 0.5
+ 0.5
-
+
-
+
- 0.6
+ 0.6
@@ -4946,34 +4946,34 @@ Installed 37 packages in 219ms
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+