Thu Oct 30 15:52:16 2025
+Fri Oct 31 20:00:17 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4132,7 @@ Cell: nv | 0.23s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A 29C P0 86W / 350W | 0MiB / 46068MiB | 0% Default |
+| N/A 33C P0 108W / 350W | 0MiB / 46068MiB | 88% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
@@ -4155,11 +4156,12 @@ Cell: nv | 0.23s
▼ output
▶ uv-logs
|
-Cell: benchmark | 4.17s
+Cell: benchmark | 4.19s
| ▶ run
Copy
Raw
-GitHub
+GitHub
+🤗 HF
@@ -4211,17 +4213,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 78.752us 1953.17% 78.752us 78.752us 1
- hf_kernels_swiglu 9.29% 160.875us 99.59% 1.725ms 1.725ms 0.000us 0.00% 5.440us 5.440us 1
- _activation_beeaae6::silu_and_mul 1.15% 19.839us 87.61% 1.518ms 505.995us 4.032us 100.00% 5.440us 1.813us 3
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.032us 100.00% 4.032us 1.344us 3
- Activity Buffer Request 83.97% 1.455ms 83.97% 1.455ms 1.455ms 1.408us 34.92% 1.408us 1.408us 1
- aten::empty 2.69% 46.600us 2.69% 46.600us 15.533us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 2.49% 43.201us 2.49% 43.201us 14.400us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.41% 7.161us 0.41% 7.161us 7.161us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 105.055us 2585.65% 105.055us 105.055us 1
+ hf_kernels_swiglu 11.41% 202.714us 99.64% 1.770ms 1.770ms 0.000us 0.00% 5.471us 5.471us 1
+ _activation_beeaae6::silu_and_mul 1.18% 21.050us 84.47% 1.501ms 500.190us 4.063us 100.00% 5.471us 1.824us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3
+ Activity Buffer Request 80.70% 1.434ms 80.70% 1.434ms 1.434ms 1.408us 34.65% 1.408us 1.408us 1
+ aten::empty 3.76% 66.772us 3.76% 66.772us 22.257us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 2.58% 45.872us 2.58% 45.872us 15.291us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.36% 6.420us 0.36% 6.420us 6.420us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.733ms
-Self CUDA time total: 4.032us
+Self CPU time total: 1.776ms
+Self CUDA time total: 4.063us
@@ -4231,17 +4233,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.528us 1575.81% 62.528us 62.528us 1
- hf_kernels_swiglu 6.86% 110.833us 99.69% 1.610ms 1.610ms 0.000us 0.00% 5.312us 5.312us 1
- _activation_beeaae6::silu_and_mul 1.31% 21.159us 91.69% 1.481ms 493.565us 3.968us 100.00% 5.312us 1.771us 3
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3
- Activity Buffer Request 88.77% 1.434ms 88.77% 1.434ms 1.434ms 1.344us 33.87% 1.344us 1.344us 1
- aten::empty 1.14% 18.330us 1.14% 18.330us 6.110us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 1.61% 26.001us 1.61% 26.001us 8.667us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.31% 5.030us 0.31% 5.030us 5.030us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 61.119us 1540.69% 61.119us 61.119us 1
+ hf_kernels_swiglu 6.50% 104.811us 99.67% 1.607ms 1.607ms 0.000us 0.00% 5.279us 5.279us 1
+ _activation_beeaae6::silu_and_mul 1.26% 20.331us 91.95% 1.482ms 494.073us 3.967us 100.00% 5.279us 1.760us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.967us 100.00% 3.967us 1.322us 3
+ Activity Buffer Request 89.13% 1.437ms 89.13% 1.437ms 1.437ms 1.312us 33.07% 1.312us 1.312us 1
+ aten::empty 1.22% 19.632us 1.22% 19.632us 6.544us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.56% 25.120us 1.56% 25.120us 8.373us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.33% 5.360us 0.33% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.615ms
-Self CUDA time total: 3.968us
+Self CPU time total: 1.612ms
+Self CUDA time total: 3.967us
@@ -4251,17 +4253,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.232us 1291.50% 63.232us 63.232us 1
- hf_kernels_swiglu 6.20% 101.121us 99.70% 1.627ms 1.627ms 0.000us 0.00% 6.528us 6.528us 1
- _activation_beeaae6::silu_and_mul 1.27% 20.780us 92.37% 1.507ms 502.489us 4.896us 100.00% 6.528us 2.176us 3
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.896us 100.00% 4.896us 1.632us 3
- Activity Buffer Request 89.54% 1.461ms 89.54% 1.461ms 1.461ms 1.632us 33.33% 1.632us 1.632us 1
- aten::empty 1.13% 18.440us 1.13% 18.440us 6.147us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 1.56% 25.391us 1.56% 25.391us 8.464us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.30% 4.970us 0.30% 4.970us 4.970us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.488us 1288.31% 63.488us 63.488us 1
+ hf_kernels_swiglu 6.89% 111.363us 99.67% 1.611ms 1.611ms 0.000us 0.00% 6.592us 6.592us 1
+ _activation_beeaae6::silu_and_mul 1.36% 22.028us 91.47% 1.479ms 492.912us 4.928us 100.00% 6.592us 2.197us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.928us 100.00% 4.928us 1.643us 3
+ Activity Buffer Request 88.52% 1.431ms 88.52% 1.431ms 1.431ms 1.664us 33.77% 1.664us 1.664us 1
+ aten::empty 1.30% 21.081us 1.30% 21.081us 7.027us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.59% 25.652us 1.59% 25.652us 8.551us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.33% 5.390us 0.33% 5.390us 5.390us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.632ms
-Self CUDA time total: 4.896us
+Self CPU time total: 1.617ms
+Self CUDA time total: 4.928us
@@ -4271,17 +4273,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.664us 1554.55% 65.664us 65.664us 1
- hf_kernels_swiglu 5.63% 101.442us 99.74% 1.798ms 1.798ms 0.000us 0.00% 5.632us 5.632us 1
- _activation_beeaae6::silu_and_mul 1.18% 21.341us 92.99% 1.677ms 558.850us 4.224us 100.00% 5.632us 1.877us 3
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.224us 100.00% 4.224us 1.408us 3
- Activity Buffer Request 79.26% 1.429ms 79.26% 1.429ms 1.429ms 1.408us 33.33% 1.408us 1.408us 1
- aten::empty 1.12% 20.239us 1.12% 20.239us 6.746us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 12.54% 226.164us 12.54% 226.164us 75.388us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.26% 4.649us 0.26% 4.649us 4.649us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.000us 1585.82% 68.000us 68.000us 1
+ hf_kernels_swiglu 5.97% 106.915us 99.70% 1.784ms 1.784ms 0.000us 0.00% 5.760us 5.760us 1
+ _activation_beeaae6::silu_and_mul 1.16% 20.770us 92.62% 1.658ms 552.564us 4.288us 100.00% 5.760us 1.920us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.288us 100.00% 4.288us 1.429us 3
+ Activity Buffer Request 80.58% 1.442ms 80.58% 1.442ms 1.442ms 1.472us 34.33% 1.472us 1.472us 1
+ aten::empty 1.10% 19.770us 1.10% 19.770us 6.590us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 10.88% 194.785us 10.88% 194.785us 64.928us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.30% 5.350us 0.30% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.803ms
-Self CUDA time total: 4.224us
+Self CPU time total: 1.790ms
+Self CUDA time total: 4.288us
@@ -4291,17 +4293,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.968us 1086.23% 63.968us 63.968us 1
- hf_kernels_swiglu 19.44% 85.062us 98.79% 432.257us 432.257us 0.000us 0.00% 7.874us 7.874us 1
- _activation_beeaae6::silu_and_mul 4.74% 20.731us 74.99% 328.126us 109.375us 5.889us 100.00% 7.874us 2.625us 3
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.889us 100.00% 5.889us 1.963us 3
- Activity Buffer Request 29.32% 128.302us 29.32% 128.302us 128.302us 1.985us 33.71% 1.985us 1.985us 1
- aten::empty 4.36% 19.069us 4.36% 19.069us 6.356us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 40.93% 179.093us 40.93% 179.093us 59.698us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.21% 5.289us 1.21% 5.289us 5.289us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.599us 1108.28% 65.599us 65.599us 1
+ hf_kernels_swiglu 18.75% 89.073us 98.88% 469.813us 469.813us 0.000us 0.00% 7.903us 7.903us 1
+ _activation_beeaae6::silu_and_mul 4.69% 22.280us 76.20% 362.069us 120.690us 5.919us 100.00% 7.903us 2.634us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 100.00% 5.919us 1.973us 3
+ Activity Buffer Request 38.23% 181.645us 38.23% 181.645us 181.645us 1.984us 33.52% 1.984us 1.984us 1
+ aten::empty 3.93% 18.671us 3.93% 18.671us 6.224us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 33.28% 158.144us 33.28% 158.144us 52.715us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.12% 5.330us 1.12% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 437.546us
-Self CUDA time total: 5.889us
+Self CPU time total: 475.143us
+Self CUDA time total: 5.919us
@@ -4311,17 +4313,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.167us 867.45% 67.167us 67.167us 1
- hf_kernels_swiglu 5.97% 103.951us 99.66% 1.736ms 1.736ms 0.000us 0.00% 10.335us 10.335us 1
- _activation_beeaae6::silu_and_mul 1.17% 20.451us 92.57% 1.612ms 537.363us 7.743us 100.00% 10.335us 3.445us 3
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.743us 100.00% 7.743us 2.581us 3
- Activity Buffer Request 82.03% 1.429ms 82.03% 1.429ms 1.429ms 2.592us 33.48% 2.592us 2.592us 1
- aten::empty 1.12% 19.510us 1.12% 19.510us 6.503us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 9.36% 162.983us 9.36% 162.983us 54.328us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.34% 5.970us 0.34% 5.970us 5.970us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.207us 906.60% 70.207us 70.207us 1
+ hf_kernels_swiglu 6.12% 106.261us 99.74% 1.733ms 1.733ms 0.000us 0.00% 10.336us 10.336us 1
+ _activation_beeaae6::silu_and_mul 1.25% 21.782us 92.41% 1.606ms 535.254us 7.744us 100.00% 10.336us 3.445us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 100.00% 7.744us 2.581us 3
+ Activity Buffer Request 82.36% 1.431ms 82.36% 1.431ms 1.431ms 2.592us 33.47% 2.592us 2.592us 1
+ aten::empty 1.21% 21.081us 1.21% 21.081us 7.027us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 8.80% 152.893us 8.80% 152.893us 50.964us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.26% 4.511us 0.26% 4.511us 4.511us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.742ms
-Self CUDA time total: 7.743us
+Self CPU time total: 1.738ms
+Self CUDA time total: 7.744us
@@ -4331,17 +4333,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.999us 1036.41% 67.999us 67.999us 1
- hf_kernels_swiglu 5.88% 101.172us 99.74% 1.716ms 1.716ms 0.000us 0.00% 8.769us 8.769us 1
- _activation_beeaae6::silu_and_mul 1.20% 20.670us 92.73% 1.596ms 531.873us 6.561us 100.00% 8.769us 2.923us 3
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.561us 100.00% 6.561us 2.187us 3
- Activity Buffer Request 82.56% 1.421ms 82.56% 1.421ms 1.421ms 2.208us 33.65% 2.208us 2.208us 1
- aten::empty 1.13% 19.490us 1.13% 19.490us 6.497us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 8.96% 154.233us 8.96% 154.233us 51.411us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.26% 4.490us 0.26% 4.490us 4.490us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.214us 1045.06% 69.214us 69.214us 1
+ hf_kernels_swiglu 7.00% 122.783us 99.73% 1.750ms 1.750ms 0.000us 0.00% 8.830us 8.830us 1
+ _activation_beeaae6::silu_and_mul 1.22% 21.430us 91.58% 1.607ms 535.694us 6.623us 100.00% 8.830us 2.943us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.623us 100.00% 6.623us 2.208us 3
+ Activity Buffer Request 81.74% 1.434ms 81.74% 1.434ms 1.434ms 2.207us 33.32% 2.207us 2.207us 1
+ aten::empty 1.15% 20.211us 1.15% 20.211us 6.737us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 8.62% 151.304us 8.62% 151.304us 50.435us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.27% 4.780us 0.27% 4.780us 4.780us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.721ms
-Self CUDA time total: 6.561us
+Self CPU time total: 1.755ms
+Self CUDA time total: 6.623us
@@ -4351,17 +4353,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.295us 670.43% 63.295us 63.295us 1
- hf_kernels_swiglu 23.24% 86.211us 98.67% 366.026us 366.026us 0.000us 0.00% 12.609us 12.609us 1
- _activation_beeaae6::silu_and_mul 5.71% 21.191us 70.40% 261.155us 87.052us 9.441us 100.00% 12.609us 4.203us 3
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.441us 100.00% 9.441us 3.147us 3
- Activity Buffer Request 23.85% 88.481us 23.85% 88.481us 88.481us 3.168us 33.56% 3.168us 3.168us 1
- aten::empty 5.03% 18.660us 5.03% 18.660us 6.220us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 40.84% 151.483us 40.84% 151.483us 50.494us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.33% 4.920us 1.33% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.152us 692.52% 65.152us 65.152us 1
+ hf_kernels_swiglu 21.62% 91.474us 98.93% 418.571us 418.571us 0.000us 0.00% 12.576us 12.576us 1
+ _activation_beeaae6::silu_and_mul 4.88% 20.631us 69.03% 292.067us 97.356us 9.408us 100.00% 12.576us 4.192us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.408us 100.00% 9.408us 3.136us 3
+ Activity Buffer Request 28.63% 121.143us 28.63% 121.143us 121.143us 3.168us 33.67% 3.168us 3.168us 1
+ aten::empty 8.28% 35.030us 8.28% 35.030us 11.677us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 35.52% 150.293us 35.52% 150.293us 50.098us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.07% 4.530us 1.07% 4.530us 4.530us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 370.946us
-Self CUDA time total: 9.441us
+Self CPU time total: 423.101us
+Self CUDA time total: 9.408us
@@ -4371,17 +4373,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.342us 500.47% 65.342us 65.342us 1
- hf_kernels_swiglu 22.94% 96.471us 98.88% 415.727us 415.727us 0.000us 0.00% 17.408us 17.408us 1
- _activation_beeaae6::silu_and_mul 5.11% 21.490us 71.29% 299.725us 99.908us 13.056us 100.00% 17.408us 5.803us 3
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.056us 100.00% 13.056us 4.352us 3
- Activity Buffer Request 30.59% 128.632us 30.59% 128.632us 128.632us 4.352us 33.33% 4.352us 4.352us 1
- aten::empty 4.65% 19.531us 4.65% 19.531us 6.510us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 35.58% 149.603us 35.58% 149.603us 49.868us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.12% 4.720us 1.12% 4.720us 4.720us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.197us 514.72% 67.197us 67.197us 1
+ hf_kernels_swiglu 22.39% 97.642us 98.93% 431.481us 431.481us 0.000us 0.00% 17.439us 17.439us 1
+ _activation_beeaae6::silu_and_mul 4.99% 21.781us 71.94% 313.789us 104.596us 13.055us 100.00% 17.439us 5.813us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.055us 100.00% 13.055us 4.352us 3
+ Activity Buffer Request 32.48% 141.684us 32.48% 141.684us 141.684us 4.384us 33.58% 4.384us 4.384us 1
+ aten::empty 4.60% 20.050us 4.60% 20.050us 6.683us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 34.47% 150.324us 34.47% 150.324us 50.108us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.07% 4.681us 1.07% 4.681us 4.681us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 420.447us
-Self CUDA time total: 13.056us
+Self CPU time total: 436.162us
+Self CUDA time total: 13.055us
impl wl p50(ms) ok
@@ -4398,12 +4400,12 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
-Installed 15 packages in 13ms
+Installed 15 packages in 15ms
Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
-Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 14.50it/s]
-Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 20.28it/s]
+Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 15.31it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 21.41it/s]
Artifacts:
activation.jsonl
diff --git a/activation/impls/torch_swiglu.html b/activation/impls/torch_swiglu.html
index 215b2799716ac41798e6372ba0e150a2bd6bd9c0..41f6e46a2626019e3e97d61016b7b71b844385d6 100644
--- a/activation/impls/torch_swiglu.html
+++ b/activation/impls/torch_swiglu.html
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
▼ output
▶ uv-logs
|
-Cell: nv | 0.23s
+Cell: nv | 0.26s
|
▶ run
Copy
Raw
-
GitHub
+
GitHub
@@ -4122,7 +4122,7 @@ Cell: nv | 0.23s
-
Thu Oct 30 15:52:16 2025
+Fri Oct 31 20:00:17 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4131,7 @@ Cell: nv | 0.23s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A 29C P0 86W / 350W | 0MiB / 46068MiB | 0% Default |
+| N/A 33C P0 108W / 350W | 0MiB / 46068MiB | 88% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
@@ -4155,11 +4155,11 @@ Cell: nv | 0.23s
▼ output
▶ uv-logs
|
-Cell: benchmark | 6.88s
+Cell: benchmark | 7.02s
| ▶ run
Copy
Raw
-GitHub
+GitHub
@@ -4205,20 +4205,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 179.327us 1411.47% 179.327us 179.327us 1
- torch_eager 11.22% 210.364us 99.57% 1.867ms 1.867ms 0.000us 0.00% 15.009us 15.009us 1
- aten::silu 3.37% 63.151us 82.30% 1.543ms 514.355us 6.497us 51.14% 8.801us 2.934us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.497us 51.14% 6.497us 2.166us 3
- aten::mul 1.76% 33.030us 2.90% 54.310us 18.103us 6.208us 48.86% 6.208us 2.069us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 48.86% 6.208us 2.069us 3
- Activity Buffer Request 76.72% 1.439ms 76.72% 1.439ms 1.439ms 2.304us 18.13% 2.304us 2.304us 1
- aten::slice 2.52% 47.241us 3.15% 59.052us 9.842us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.63% 11.811us 0.63% 11.811us 1.968us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 3.34% 62.690us 3.34% 62.690us 10.448us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.43% 8.120us 0.43% 8.120us 8.120us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 188.575us 1476.70% 188.575us 188.575us 1
+ torch_eager 11.13% 210.826us 99.56% 1.887ms 1.887ms 0.000us 0.00% 15.106us 15.106us 1
+ aten::silu 3.37% 63.781us 82.44% 1.562ms 520.736us 6.497us 50.88% 8.833us 2.944us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.497us 50.88% 6.497us 2.166us 3
+ aten::mul 1.86% 35.170us 2.95% 55.841us 18.614us 6.273us 49.12% 6.273us 2.091us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.273us 49.12% 6.273us 2.091us 3
+ Activity Buffer Request 76.78% 1.455ms 76.78% 1.455ms 1.455ms 2.336us 18.29% 2.336us 2.336us 1
+ aten::slice 2.45% 46.380us 3.05% 57.842us 9.640us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.60% 11.462us 0.60% 11.462us 1.910us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.38% 64.112us 3.38% 64.112us 10.685us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.44% 8.280us 0.44% 8.280us 8.280us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.875ms
-Self CUDA time total: 12.705us
+Self CPU time total: 1.895ms
+Self CUDA time total: 12.770us
@@ -4228,20 +4228,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.777us 1228.76% 151.777us 151.777us 1
- torch_eager 6.62% 113.831us 99.66% 1.713ms 1.713ms 0.000us 0.00% 14.496us 14.496us 1
- aten::silu 2.46% 42.260us 88.64% 1.523ms 507.722us 6.368us 51.55% 8.512us 2.837us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 51.55% 6.368us 2.123us 3
- aten::mul 1.53% 26.241us 2.60% 44.713us 14.904us 5.984us 48.45% 5.984us 1.995us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.45% 5.984us 1.995us 3
- Activity Buffer Request 84.63% 1.454ms 84.63% 1.454ms 1.454ms 2.144us 17.36% 2.144us 2.144us 1
- aten::slice 1.45% 24.880us 1.80% 30.920us 5.153us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.35% 6.040us 0.35% 6.040us 1.007us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 2.62% 45.062us 2.62% 45.062us 7.510us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.34% 5.800us 0.34% 5.800us 5.800us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.926us 1234.87% 152.926us 152.926us 1
+ torch_eager 6.55% 113.093us 99.67% 1.721ms 1.721ms 0.000us 0.00% 14.560us 14.560us 1
+ aten::silu 2.40% 41.391us 88.69% 1.532ms 510.609us 6.400us 51.68% 8.576us 2.859us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.400us 51.68% 6.400us 2.133us 3
+ aten::mul 1.50% 25.830us 2.63% 45.361us 15.120us 5.984us 48.32% 5.984us 1.995us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.32% 5.984us 1.995us 3
+ Activity Buffer Request 84.72% 1.463ms 84.72% 1.463ms 1.463ms 2.176us 17.57% 2.176us 2.176us 1
+ aten::slice 1.43% 24.741us 1.80% 31.062us 5.177us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.37% 6.321us 0.37% 6.321us 1.054us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 2.71% 46.721us 2.71% 46.721us 7.787us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.33% 5.741us 0.33% 5.741us 5.741us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.718ms
-Self CUDA time total: 12.352us
+Self CPU time total: 1.727ms
+Self CUDA time total: 12.384us
@@ -4251,20 +4251,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.422us 1145.66% 151.422us 151.422us 1
- torch_eager 6.39% 108.591us 99.69% 1.694ms 1.694ms 0.000us 0.00% 15.489us 15.489us 1
- aten::silu 2.42% 41.180us 88.84% 1.509ms 503.045us 6.784us 51.33% 9.056us 3.019us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.33% 6.784us 2.261us 3
- aten::mul 1.56% 26.573us 2.72% 46.263us 15.421us 6.433us 48.67% 6.433us 2.144us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.433us 48.67% 6.433us 2.144us 3
- Activity Buffer Request 84.90% 1.442ms 84.90% 1.442ms 1.442ms 2.272us 17.19% 2.272us 2.272us 1
- aten::slice 1.42% 24.110us 1.74% 29.570us 4.928us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.32% 5.460us 0.32% 5.460us 0.910us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 2.67% 45.420us 2.67% 45.420us 7.570us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.31% 5.240us 0.31% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.413us 1147.86% 152.413us 152.413us 1
+ torch_eager 6.17% 105.134us 99.68% 1.699ms 1.699ms 0.000us 0.00% 15.581us 15.581us 1
+ aten::silu 2.58% 43.990us 88.96% 1.517ms 505.533us 6.814us 51.32% 9.117us 3.039us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.814us 51.32% 6.814us 2.271us 3
+ aten::mul 1.63% 27.711us 2.72% 46.371us 15.457us 6.464us 48.68% 6.464us 2.155us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.68% 6.464us 2.155us 3
+ Activity Buffer Request 84.84% 1.446ms 84.84% 1.446ms 1.446ms 2.303us 17.34% 2.303us 2.303us 1
+ aten::slice 1.47% 24.990us 1.83% 31.250us 5.208us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.37% 6.260us 0.37% 6.260us 1.043us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 2.63% 44.871us 2.63% 44.871us 7.478us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.32% 5.431us 0.32% 5.431us 5.431us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.699ms
-Self CUDA time total: 13.217us
+Self CPU time total: 1.705ms
+Self CUDA time total: 13.278us
@@ -4274,20 +4274,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.159us 1197.73% 152.159us 152.159us 1
- torch_eager 7.49% 109.251us 99.65% 1.454ms 1.454ms 0.000us 0.00% 14.912us 14.912us 1
- aten::silu 2.87% 41.871us 86.91% 1.268ms 422.724us 6.560us 51.64% 8.768us 2.923us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.64% 6.560us 2.187us 3
- aten::mul 1.82% 26.542us 3.09% 45.132us 15.044us 6.144us 48.36% 6.144us 2.048us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.144us 48.36% 6.144us 2.048us 3
- Activity Buffer Request 71.19% 1.039ms 71.19% 1.039ms 1.039ms 2.208us 17.38% 2.208us 2.208us 1
- aten::slice 1.75% 25.480us 2.16% 31.560us 5.260us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.42% 6.080us 0.42% 6.080us 1.013us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 14.12% 206.043us 14.12% 206.043us 34.340us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.35% 5.050us 0.35% 5.050us 5.050us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.359us 1219.84% 155.359us 155.359us 1
+ torch_eager 6.31% 109.593us 99.71% 1.733ms 1.733ms 0.000us 0.00% 14.944us 14.944us 1
+ aten::silu 2.48% 43.021us 88.93% 1.545ms 515.160us 6.560us 51.51% 8.768us 2.923us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.51% 6.560us 2.187us 3
+ aten::mul 1.62% 28.091us 2.66% 46.261us 15.420us 6.176us 48.49% 6.176us 2.059us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
+ Activity Buffer Request 74.70% 1.298ms 74.70% 1.298ms 1.298ms 2.208us 17.34% 2.208us 2.208us 1
+ aten::slice 1.46% 25.370us 1.82% 31.631us 5.272us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.36% 6.261us 0.36% 6.261us 1.043us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 12.80% 222.405us 12.80% 222.405us 37.068us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.29% 4.960us 0.29% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.459ms
-Self CUDA time total: 12.704us
+Self CPU time total: 1.738ms
+Self CUDA time total: 12.736us
@@ -4297,20 +4297,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 147.295us 1117.14% 147.295us 147.295us 1
- torch_eager 5.91% 105.630us 99.72% 1.782ms 1.782ms 0.000us 0.00% 15.457us 15.457us 1
- aten::silu 2.35% 41.900us 89.64% 1.602ms 533.846us 6.752us 51.21% 9.024us 3.008us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 51.21% 6.752us 2.251us 3
- aten::mul 1.43% 25.502us 2.46% 43.882us 14.627us 6.433us 48.79% 6.433us 2.144us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.433us 48.79% 6.433us 2.144us 3
- Activity Buffer Request 78.53% 1.403ms 78.53% 1.403ms 1.403ms 2.272us 17.23% 2.272us 2.272us 1
- aten::slice 1.39% 24.781us 1.71% 30.582us 5.097us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.32% 5.801us 0.32% 5.801us 0.967us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.80% 175.053us 9.80% 175.053us 29.176us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.28% 4.969us 0.28% 4.969us 4.969us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.122us 1152.94% 153.122us 153.122us 1
+ torch_eager 5.95% 108.905us 99.72% 1.827ms 1.827ms 0.000us 0.00% 15.585us 15.585us 1
+ aten::silu 2.26% 41.441us 89.57% 1.641ms 546.874us 6.816us 51.32% 9.120us 3.040us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 51.32% 6.816us 2.272us 3
+ aten::mul 1.45% 26.581us 2.47% 45.261us 15.087us 6.465us 48.68% 6.465us 2.155us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.465us 48.68% 6.465us 2.155us 3
+ Activity Buffer Request 78.54% 1.439ms 78.54% 1.439ms 1.439ms 2.304us 17.35% 2.304us 2.304us 1
+ aten::slice 1.41% 25.869us 1.74% 31.870us 5.312us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.33% 6.001us 0.33% 6.001us 1.000us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.78% 179.164us 9.78% 179.164us 29.861us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.28% 5.090us 0.28% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.787ms
-Self CUDA time total: 13.185us
+Self CPU time total: 1.832ms
+Self CUDA time total: 13.281us
@@ -4320,20 +4320,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 143.964us 937.33% 143.964us 143.964us 1
- torch_eager 21.41% 103.402us 98.95% 477.918us 477.918us 0.000us 0.00% 18.047us 18.047us 1
- aten::silu 9.04% 43.640us 62.61% 302.394us 100.798us 7.872us 51.25% 10.560us 3.520us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 51.25% 7.872us 2.624us 3
- aten::mul 5.13% 24.761us 8.85% 42.722us 14.241us 7.487us 48.75% 7.487us 2.496us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.487us 48.75% 7.487us 2.496us 3
- Activity Buffer Request 22.09% 106.692us 22.09% 106.692us 106.692us 2.688us 17.50% 2.688us 2.688us 1
- aten::slice 4.94% 23.880us 6.09% 29.400us 4.900us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 1.14% 5.520us 1.14% 5.520us 0.920us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 35.20% 170.023us 35.20% 170.023us 28.337us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 1.05% 5.060us 1.05% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.877us 970.08% 150.877us 150.877us 1
+ torch_eager 20.61% 104.763us 99.03% 503.283us 503.283us 0.000us 0.00% 18.241us 18.241us 1
+ aten::silu 8.60% 43.701us 63.19% 321.148us 107.049us 7.969us 51.24% 10.657us 3.552us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.969us 51.24% 7.969us 2.656us 3
+ aten::mul 5.45% 27.720us 8.99% 45.690us 15.230us 7.584us 48.76% 7.584us 2.528us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.76% 7.584us 2.528us 3
+ Activity Buffer Request 24.24% 123.213us 24.24% 123.213us 123.213us 2.688us 17.28% 2.688us 2.688us 1
+ aten::slice 5.04% 25.603us 6.23% 31.682us 5.280us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 1.20% 6.079us 1.20% 6.079us 1.013us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 33.88% 172.204us 33.88% 172.204us 28.701us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.97% 4.940us 0.97% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 482.978us
-Self CUDA time total: 15.359us
+Self CPU time total: 508.223us
+Self CUDA time total: 15.553us
@@ -4343,20 +4343,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 154.301us 1078.65% 154.301us 154.301us 1
- torch_eager 5.96% 107.399us 99.74% 1.796ms 1.796ms 0.000us 0.00% 16.769us 16.769us 1
- aten::silu 2.38% 42.931us 89.51% 1.612ms 537.266us 7.328us 51.23% 9.792us 3.264us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 51.23% 7.328us 2.443us 3
- aten::mul 1.49% 26.893us 2.55% 45.883us 15.294us 6.977us 48.77% 6.977us 2.326us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.977us 48.77% 6.977us 2.326us 3
- Activity Buffer Request 78.67% 1.417ms 78.67% 1.417ms 1.417ms 2.464us 17.22% 2.464us 2.464us 1
- aten::slice 1.40% 25.140us 1.72% 31.031us 5.172us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.33% 5.891us 0.33% 5.891us 0.982us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.51% 171.283us 9.51% 171.283us 28.547us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.26% 4.600us 0.26% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.541us 1089.44% 156.541us 156.541us 1
+ torch_eager 6.81% 125.673us 99.72% 1.840ms 1.840ms 0.000us 0.00% 16.866us 16.866us 1
+ aten::silu 2.28% 42.101us 88.57% 1.634ms 544.654us 7.361us 51.23% 9.858us 3.286us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 51.23% 7.361us 2.454us 3
+ aten::mul 1.53% 28.200us 2.53% 46.622us 15.541us 7.008us 48.77% 7.008us 2.336us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.77% 7.008us 2.336us 3
+ Activity Buffer Request 77.96% 1.438ms 77.96% 1.438ms 1.438ms 2.497us 17.38% 2.497us 2.497us 1
+ aten::slice 1.46% 26.979us 1.81% 33.310us 5.552us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.34% 6.331us 0.34% 6.331us 1.055us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.33% 172.076us 9.33% 172.076us 28.679us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.28% 5.210us 0.28% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.801ms
-Self CUDA time total: 14.305us
+Self CPU time total: 1.845ms
+Self CUDA time total: 14.369us
@@ -4366,20 +4366,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 154.686us 1002.89% 154.686us 154.686us 1
- torch_eager 22.31% 107.382us 99.03% 476.668us 476.668us 0.000us 0.00% 18.080us 18.080us 1
- aten::silu 9.43% 45.390us 60.13% 289.404us 96.468us 7.872us 51.04% 10.528us 3.509us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 51.04% 7.872us 2.624us 3
- aten::mul 6.54% 31.461us 10.39% 50.022us 16.674us 7.552us 48.96% 7.552us 2.517us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.96% 7.552us 2.517us 3
- Activity Buffer Request 19.41% 93.401us 19.41% 93.401us 93.401us 2.656us 17.22% 2.656us 2.656us 1
- aten::slice 5.01% 24.090us 6.20% 29.860us 4.977us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 1.20% 5.770us 1.20% 5.770us 0.962us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 35.15% 169.174us 35.15% 169.174us 28.196us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.97% 4.650us 0.97% 4.650us 4.650us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.754us 962.92% 149.754us 149.754us 1
+ torch_eager 21.77% 106.163us 98.85% 481.952us 481.952us 0.000us 0.00% 18.240us 18.240us 1
+ aten::silu 8.65% 42.151us 61.90% 301.788us 100.596us 7.968us 51.23% 10.656us 3.552us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.23% 7.968us 2.656us 3
+ aten::mul 5.09% 24.801us 8.77% 42.752us 14.251us 7.584us 48.77% 7.584us 2.528us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.77% 7.584us 2.528us 3
+ Activity Buffer Request 21.73% 105.953us 21.73% 105.953us 105.953us 2.688us 17.28% 2.688us 2.688us 1
+ aten::slice 5.14% 25.050us 6.41% 31.249us 5.208us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 1.27% 6.199us 1.27% 6.199us 1.033us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 35.20% 171.635us 35.20% 171.635us 28.606us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 1.15% 5.600us 1.15% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 481.318us
-Self CUDA time total: 15.424us
+Self CPU time total: 487.552us
+Self CUDA time total: 15.552us
@@ -4389,20 +4389,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.678us 692.09% 155.678us 155.678us 1
- torch_eager 6.04% 109.222us 99.73% 1.805ms 1.805ms 0.000us 0.00% 26.365us 26.365us 1
- aten::silu 2.28% 41.351us 89.49% 1.620ms 539.866us 11.614us 51.63% 15.485us 5.162us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.614us 51.63% 11.614us 3.871us 3
- aten::mul 1.47% 26.681us 2.47% 44.641us 14.880us 10.880us 48.37% 10.880us 3.627us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.880us 48.37% 10.880us 3.627us 3
- Activity Buffer Request 78.73% 1.425ms 78.73% 1.425ms 1.425ms 3.871us 17.21% 3.871us 3.871us 1
- aten::slice 1.39% 25.188us 1.73% 31.390us 5.232us 0.000us 0.00% 0.000us 0.000us 6
- aten::as_strided 0.34% 6.202us 0.34% 6.202us 1.034us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.47% 171.352us 9.47% 171.352us 28.559us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 0.27% 4.900us 0.27% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 187.357us 834.00% 187.357us 187.357us 1
+ torch_eager 6.93% 128.860us 99.74% 1.856ms 1.856ms 0.000us 0.00% 26.369us 26.369us 1
+ aten::silu 2.32% 43.123us 88.23% 1.642ms 547.175us 11.616us 51.71% 15.520us 5.173us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.616us 51.71% 11.616us 3.872us 3
+ aten::mul 1.63% 30.312us 2.74% 50.922us 16.974us 10.849us 48.29% 10.849us 3.616us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.849us 48.29% 10.849us 3.616us 3
+ Activity Buffer Request 77.79% 1.447ms 77.79% 1.447ms 1.447ms 3.904us 17.38% 3.904us 3.904us 1
+ aten::slice 1.49% 27.691us 1.84% 34.251us 5.708us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.35% 6.560us 0.35% 6.560us 1.093us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 9.23% 171.734us 9.23% 171.734us 28.622us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.26% 4.930us 0.26% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.810ms
-Self CUDA time total: 22.494us
+Self CPU time total: 1.860ms
+Self CUDA time total: 22.465us
impl wl p50(ms) ok
@@ -4419,7 +4419,7 @@ torch_eager cuda_T512_D768 0.05 True
-Installed 37 packages in 216ms
+Installed 37 packages in 251ms
diff --git a/activation/results/artifacts/combine/latency.svg b/activation/results/artifacts/combine/latency.svg
index 961d35bc69df12d3f8c1e9441cc14de8f19fb723..c90094a9212ed4b3ea466620aa29c029e98de04f 100644
--- a/activation/results/artifacts/combine/latency.svg
+++ b/activation/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:49127439c8b28e18efed1525d57e9bb48bdb632034f2f84a60940f7d447aff24
-size 20647
+oid sha256:085b4a64bddea2955d6d074836121ec2e120fb1ca9140f3ccb75e8358e4526b3
+size 20644
diff --git a/activation/results/combined_results.html b/activation/results/combined_results.html
index 2ed3e05955eb7f6d843de731dbef9c8c20788b83..aefcf7c048ef413bda722db3be44aa8b9b9cef43 100644
--- a/activation/results/combined_results.html
+++ b/activation/results/combined_results.html
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
- 2025-10-30T15:53:40.869549
+ 2025-10-31T20:14:01.265668
image/svg+xml
@@ -4256,83 +4256,83 @@ body[data-tool="eraser"] .main-content {
-
+
-
+
- 0.025
+ 0.025
-
+
-
+
- 0.030
+ 0.030
-
+
-
+
- 0.035
+ 0.035
-
+
-
+
- 0.040
+ 0.040
-
+
-
+
- 0.045
+ 0.045
-
+
-
+
- 0.050
+ 0.050
@@ -4340,37 +4340,37 @@ body[data-tool="eraser"] .main-content {
-
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
+
-
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
@@ -4428,7 +4428,7 @@ body[data-tool="eraser"] .main-content {
▼ output
▶ uv-logs
|
-Cell: combine | 4.28s
+Cell: combine | 4.32s
| ▶ run
Copy
Raw
@@ -4554,7 +4554,7 @@ Implementations included:
-Installed 37 packages in 222ms
+Installed 37 packages in 213ms
@@ -4567,7 +4567,7 @@ Installed 37 packages in 222ms
- 2025-10-30T15:53:40.869549
+ 2025-10-31T20:14:01.265668
image/svg+xml
@@ -4716,83 +4716,83 @@ Installed 37 packages in 222ms
-
+
-
+
- 0.025
+ 0.025
-
+
-
+
- 0.030
+ 0.030
-
+
-
+
- 0.035
+ 0.035
-
+
-
+
- 0.040
+ 0.040
-
+
-
+
- 0.045
+ 0.045
-
+
-
+
- 0.050
+ 0.050
@@ -4800,37 +4800,37 @@ Installed 37 packages in 222ms
-
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
+
-
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
diff --git a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
index 4dbcd737042ccd89af4999232ce91680c8569342..7bfddcfb2c66ba429fccc98758725309b85f6780 100644
--- a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
+++ b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
@@ -1,24 +1,24 @@
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04394999996293336, "p50": 0.04566100000147344, "p90": 0.046750000024076144, "mean": 0.04579239999884521, "iqr": 0.0020500000346146408, "raw_times": [0.0446999999894615, 0.047901000016281614, 0.046750000024076144, 0.04566100000147344, 0.04394999996293336], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05609099997627709, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05193099997313766, "p50": 0.05449100001442275, "p90": 0.054510999973444996, "mean": 0.05559319998837964, "iqr": 0.0010200000133409048, "raw_times": [0.05349099996010409, 0.05449100001442275, 0.06354200002078869, 0.05193099997313766, 0.054510999973444996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.060221000012461445, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051560999963840004, "p50": 0.05184100001542902, "p90": 0.05310099999178419, "mean": 0.05230499999697713, "iqr": 0.0014099999816608033, "raw_times": [0.05184100001542902, 0.05333100000370905, 0.05310099999178419, 0.05169100001012339, 0.051560999963840004], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058330999991085264, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05121099997040801, "p50": 0.051831000007496186, "p90": 0.052310999990368146, "mean": 0.05185479999454401, "iqr": 0.0008799999591246888, "raw_times": [0.05121099997040801, 0.051831000007496186, 0.052310999990368146, 0.05248999997320425, 0.05143100003124346], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05627100000538121, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050751000003401714, "p50": 0.051640999970459234, "p90": 0.05217000000357075, "mean": 0.05161080000561924, "iqr": 0.0008689999617672584, "raw_times": [0.05219100000886101, 0.05217000000357075, 0.050751000003401714, 0.05130100004180349, 0.051640999970459234], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055421000013211597, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04883100001507046, "p50": 0.049950999994052836, "p90": 0.05039000001261229, "mean": 0.04992260001017712, "iqr": 0.0006600000119760807, "raw_times": [0.04883100001507046, 0.05071100002851381, 0.04973000000063621, 0.05039000001261229, 0.049950999994052836], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05684100000280523, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04895099999657759, "p50": 0.050181000005977694, "p90": 0.05176100000880979, "mean": 0.05066500000339147, "iqr": 0.0021600000081889448, "raw_times": [0.04960100000062084, 0.05176100000880979, 0.050181000005977694, 0.05283100000497143, 0.04895099999657759], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05629100002124687, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048800999991271965, "p50": 0.051240999994206504, "p90": 0.0513809999915793, "mean": 0.05085500000632237, "iqr": 0.00043999995114063495, "raw_times": [0.051240999994206504, 0.048800999991271965, 0.051911000014115416, 0.050941000040438666, 0.0513809999915793], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056131000008008414, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04940100001249448, "p50": 0.05085099996904319, "p90": 0.05221100002472667, "mean": 0.05112659999895186, "iqr": 0.0015410000742122065, "raw_times": [0.050669999950514466, 0.05221100002472667, 0.04940100001249448, 0.0525000000379805, 0.05085099996904319], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053861000026245165, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04947999997284569, "p50": 0.05073100004437947, "p90": 0.05098100001532657, "mean": 0.05063280001422754, "iqr": 0.0010900000120273035, "raw_times": [0.04947999997284569, 0.05098100001532657, 0.04989100000329927, 0.05073100004437947, 0.052081000035286706], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054841000007854745, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05039100000203689, "p50": 0.051160999987587275, "p90": 0.05154000001539316, "mean": 0.051364599994485616, "iqr": 0.00038000001723048626, "raw_times": [0.051160999987587275, 0.05257099996924808, 0.05039100000203689, 0.05154000001539316, 0.051159999998162675], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05513099995368975, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048071000037452905, "p50": 0.05178100002467545, "p90": 0.0526809999996658, "mean": 0.05150900001353875, "iqr": 0.0032599999713056604, "raw_times": [0.04942100002836014, 0.0526809999996658, 0.05178100002467545, 0.05559099997753947, 0.048071000037452905], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05527100000790597, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04963099996757592, "p50": 0.05203099999562255, "p90": 0.052549999963957816, "mean": 0.05276679999042244, "iqr": 0.0005189999683352653, "raw_times": [0.05759100002933337, 0.05203099999562255, 0.052549999963957816, 0.04963099996757592, 0.05203099999562255], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07661199998665325, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049701000023105735, "p50": 0.051581000036549085, "p90": 0.05290100000365783, "mean": 0.05255880001868718, "iqr": 0.002381000001605571, "raw_times": [0.05290100000365783, 0.058091000028070994, 0.051581000036549085, 0.05052000000205226, 0.049701000023105735], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054920000025049376, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0517009999612128, "p50": 0.05219999997052582, "p90": 0.05233100000623381, "mean": 0.05215079999061345, "iqr": 0.0001500000053056283, "raw_times": [0.05233100000623381, 0.05234100001416664, 0.05219999997052582, 0.0517009999612128, 0.05218100000092818], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055141000018466, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05047100000865612, "p50": 0.05349100001694751, "p90": 0.05691100000149163, "mean": 0.057148999997025385, "iqr": 0.004350000040176383, "raw_times": [0.05047100000865612, 0.05349100001694751, 0.07231099999671642, 0.05256099996131525, 0.05691100000149163], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05554099999471873, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049690000025748304, "p50": 0.050921000024573004, "p90": 0.051730999985011294, "mean": 0.051232800001344, "iqr": 0.0010800000040944724, "raw_times": [0.05065099998091682, 0.051730999985011294, 0.05317099999047059, 0.049690000025748304, 0.050921000024573004], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05373099997996178, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05013100002315696, "p50": 0.05073099998753605, "p90": 0.052470999946763186, "mean": 0.051448999988679134, "iqr": 0.001829999973779195, "raw_times": [0.05013100002315696, 0.05073099998753605, 0.05327100001295548, 0.052470999946763186, 0.05064099997298399], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05419999996547631, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04918100000850245, "p50": 0.050670999996782484, "p90": 0.05192099996520483, "mean": 0.050938799995492445, "iqr": 0.0013709999393540784, "raw_times": [0.04918100000850245, 0.05192099996520483, 0.05237099998112171, 0.05055000002585075, 0.050670999996782484], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05559099997753947, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049871000044277025, "p50": 0.05047100000865612, "p90": 0.05118100000345294, "mean": 0.050820800015571876, "iqr": 0.0007699999855503847, "raw_times": [0.049871000044277025, 0.05041100001790255, 0.05217000000357075, 0.05047100000865612, 0.05118100000345294], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05564100001720362, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05115100003649786, "p50": 0.052071000027353875, "p90": 0.05212100001017461, "mean": 0.05199700001412566, "iqr": 0.0006100000291553442, "raw_times": [0.05115100003649786, 0.052071000027353875, 0.053131000015582686, 0.05212100001017461, 0.05151099998101927], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05440099999987069, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04927099996621109, "p50": 0.051500999973086437, "p90": 0.05194099998107049, "mean": 0.05114499998626343, "iqr": 0.000919999990856013, "raw_times": [0.051500999973086437, 0.04927099996621109, 0.051991000020734646, 0.05194099998107049, 0.05102099999021448], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054591000036907644, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049930999978187174, "p50": 0.050361000035081815, "p90": 0.05102099999021448, "mean": 0.05066480000550655, "iqr": 0.0008009999987734773, "raw_times": [0.050219999991441, 0.050361000035081815, 0.05179100003260828, 0.049930999978187174, 0.05102099999021448], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05545099998016667, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:36Z", "run": "a6ca6031a2be44b8852eef7e2bbddc9e", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0500799999940682, "p50": 0.05195099998900332, "p90": 0.051991000020734646, "mean": 0.05318280000210507, "iqr": 0.0014600000213249587, "raw_times": [0.0500799999940682, 0.05195099998900332, 0.051991000020734646, 0.05053099999940969, 0.061361000007309485], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05489099999067548, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.06906199996592477, "p50": 0.07093199997143529, "p90": 0.07169200000589626, "mean": 0.07107379998387842, "iqr": 0.0011000000199601345, "raw_times": [0.07093199997143529, 0.07309099999019963, 0.07059199998593613, 0.07169200000589626, 0.06906199996592477], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07642200000645971, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08730199999718025, "p50": 0.08879199998546028, "p90": 0.08886199998414668, "mean": 0.0890762000040013, "iqr": 0.00037899997096246807, "raw_times": [0.08730199999718025, 0.08879199998546028, 0.08848300001318421, 0.08886199998414668, 0.09194200004003505], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.091862999965997, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08465199999818651, "p50": 0.08821300002637145, "p90": 0.08871199997884105, "mean": 0.08770840000806857, "iqr": 0.0007599999776175537, "raw_times": [0.08465199999818651, 0.0879520000012235, 0.08821300002637145, 0.08901300003572032, 0.08871199997884105], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09156300001222917, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08501199999955134, "p50": 0.08710200000905388, "p90": 0.08719199996676252, "mean": 0.08665020000080403, "iqr": 0.001349999934063817, "raw_times": [0.08501199999955134, 0.08710200000905388, 0.08719199996676252, 0.0858420000326987, 0.08810299999595372], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09103200000026845, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08575200001814665, "p50": 0.08690200002092752, "p90": 0.08706200003416598, "mean": 0.08684220001669019, "iqr": 0.00029900002118665725, "raw_times": [0.08773199999723147, 0.08676300001297932, 0.08690200002092752, 0.08706200003416598, 0.08575200001814665], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09036199998035954, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08490200002597703, "p50": 0.08731200000511308, "p90": 0.0877829999694768, "mean": 0.08806820000017979, "iqr": 0.001451000002816727, "raw_times": [0.09401200003367194, 0.08731200000511308, 0.08633199996666008, 0.08490200002597703, 0.0877829999694768], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0907329999790818, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0847820000444699, "p50": 0.08513199998105847, "p90": 0.08660200001031626, "mean": 0.08566600000676772, "iqr": 0.0016600000094513234, "raw_times": [0.08494200000086494, 0.0847820000444699, 0.08687199999712902, 0.08660200001031626, 0.08513199998105847], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0911219999579771, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08356199998615921, "p50": 0.0846430000365217, "p90": 0.08576199996923606, "mean": 0.08508039999242101, "iqr": 0.0011189999895577785, "raw_times": [0.08356199998615921, 0.0867919999905098, 0.08464299997967828, 0.08576199996923606, 0.0846430000365217], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08955300000934585, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08469199997307442, "p50": 0.08614199998646654, "p90": 0.08723299998791845, "mean": 0.08654439999418173, "iqr": 0.0011309999763398082, "raw_times": [0.08469199997307442, 0.08610200001157864, 0.08614199998646654, 0.08855300001187061, 0.08723299998791845], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09115300002804361, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08576300001550408, "p50": 0.08703200001036748, "p90": 0.08823299998539369, "mean": 0.09075460000076419, "iqr": 0.0015310000094359566, "raw_times": [0.10604300001659794, 0.08823299998539369, 0.08703200001036748, 0.08670199997595773, 0.08576300001550408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08985199997368909, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.14525299997103502, "p50": 0.1457439999512644, "p90": 0.1459139999724357, "mean": 0.1457395999750588, "iqr": 0.00044099999740865314, "raw_times": [0.14525299997103502, 0.14547299997502705, 0.1457439999512644, 0.14631400000553185, 0.1459139999724357], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1472430000148961, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16037399996093882, "p50": 0.16231400002197915, "p90": 0.16309400001546237, "mean": 0.1622881999992387, "iqr": 0.0012190000120426703, "raw_times": [0.16309400001546237, 0.16231400002197915, 0.16378399999439353, 0.1618750000034197, 0.16037399996093882], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.16341399998509587, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08445299999948475, "p50": 0.08518200002072263, "p90": 0.08666200000106983, "mean": 0.08572240001285536, "iqr": 0.0017899999988912896, "raw_times": [0.08445299999948475, 0.08744300004082106, 0.08518200002072263, 0.08666200000106983, 0.08487200000217854], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0890119999894523, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08437200000344092, "p50": 0.08463200003916427, "p90": 0.08609200000364581, "mean": 0.08522400000856578, "iqr": 0.0015900000107649248, "raw_times": [0.08463200003916427, 0.08609200000364581, 0.08652200000369703, 0.08437200000344092, 0.08450199999288088], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08977199996706986, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08375199996635274, "p50": 0.08519199997181204, "p90": 0.08627200003274993, "mean": 0.08607399998936671, "iqr": 0.0020100000597267353, "raw_times": [0.08375199996635274, 0.0842619999730232, 0.08627200003274993, 0.08519199997181204, 0.09089200000289566], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08821199998010343, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08470200003785067, "p50": 0.08566200000359458, "p90": 0.08573299999170558, "mean": 0.08566220001284819, "iqr": 0.0006109999617365247, "raw_times": [0.08470200003785067, 0.08709200000112105, 0.08512200002996906, 0.08566200000359458, 0.08573299999170558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08864200003699807, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08451200000081371, "p50": 0.08525300000883362, "p90": 0.08580199994412396, "mean": 0.08525219999455658, "iqr": 0.0009299999419454252, "raw_times": [0.08580199994412396, 0.08525300000883362, 0.08451200000081371, 0.08487200000217854, 0.08582200001683304], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08942300001990588, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08533199996918484, "p50": 0.08693199998788259, "p90": 0.09015199998430035, "mean": 0.08883799998784525, "iqr": 0.0043200000163778896, "raw_times": [0.08533199996918484, 0.09015199998430035, 0.08583199996792246, 0.08693199998788259, 0.09594200002993603], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09176200001093093, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08384200003774822, "p50": 0.08611200001951147, "p90": 0.08663199997727133, "mean": 0.08570400000280642, "iqr": 0.001730000008137722, "raw_times": [0.08384200003774822, 0.08611200001951147, 0.08703200001036748, 0.08663199997727133, 0.08490199996913361], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08941200002254845, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08507300003657292, "p50": 0.0865819999944506, "p90": 0.08741199997075455, "mean": 0.09195439998848087, "iqr": 0.0020300000187489786, "raw_times": [0.11532299998862072, 0.0865819999944506, 0.08741199997075455, 0.08538199995200557, 0.08507300003657292], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08733200002097874, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09419299999535724, "p50": 0.09539199999153425, "p90": 0.09730299996135727, "mean": 0.09678459998667677, "iqr": 0.002380999944762152, "raw_times": [0.10211299996853995, 0.09730299996135727, 0.09492200001659512, 0.09539199999153425, 0.09419299999535724], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09651299995994123, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.10080199996309602, "p50": 0.10192199999892182, "p90": 0.1026219999857858, "mean": 0.10294419998899684, "iqr": 0.0008999999749903509, "raw_times": [0.10765299998638511, 0.10172200001079545, 0.1026219999857858, 0.10192199999892182, 0.10080199996309602], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10299199999508346, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4861929999719905, "p50": 0.4890019999947981, "p90": 0.48961200002395344, "mean": 0.48862639999924795, "iqr": 0.001079000014669873, "raw_times": [0.48979199999621414, 0.4861929999719905, 0.48961200002395344, 0.4890019999947981, 0.48853300000928357], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.48705300002893637, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.49736299996538946, "p50": 0.49848299994437184, "p90": 0.49918199999865465, "mean": 0.4987367999774506, "iqr": 0.0007590000450363732, "raw_times": [0.4984229999536183, 0.49848299994437184, 0.49918199999865465, 0.5002330000252186, 0.49736299996538946], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4985730000157673, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
diff --git a/causal_conv1d/impls/cells/benchmark.py b/causal_conv1d/impls/cells/benchmark.py
index 725b12c4018e4eec05c5ddccb0c88a8eae6f150d..2e38669a505cbdf181a93e97f31ed1e67ecf4883 100644
--- a/causal_conv1d/impls/cells/benchmark.py
+++ b/causal_conv1d/impls/cells/benchmark.py
@@ -4,28 +4,37 @@
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
-# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
+import torch.nn.functional as F
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-from kernels import get_kernel
-# Load the causal conv1d kernel
-causal_conv1d = get_kernel("kernels-community/causal-conv1d")
+def torch_causal_conv1d(input_tensor, weight, bias):
+ # Convert to weight dtype for computation
+ x = input_tensor.to(weight.dtype)
+ dim = weight.shape[0]
+ width = weight.shape[1]
+ seqlen = input_tensor.shape[-1]
-def hf_kernels_causal_conv1d(input_tensor, weight, bias):
- return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)
+ # Depthwise causal conv1d using PyTorch
+ out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
+
+ # Truncate to original sequence length
+ out = out[..., :seqlen]
+
+ # Convert back to original dtype
+ return out.to(input_tensor.dtype)
run_benchmark(
kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
- impl_name="hf_kernels_causal_conv1d",
- impl_tags={"family": "hf-kernels", "backend": "cuda"},
- impl_func=hf_kernels_causal_conv1d,
+ impl_name="torch_eager",
+ impl_tags={"family": "pytorch", "backend": "eager"},
+ impl_func=torch_causal_conv1d,
)
\ No newline at end of file
diff --git a/causal_conv1d/impls/hf_kernels_causal_conv1d.html b/causal_conv1d/impls/hf_kernels_causal_conv1d.html
index e161062d07cab205d4d881403fd3310ed83e20ca..cb1bde40be01c47bdde38e8da86912f92e3be9c0 100644
--- a/causal_conv1d/impls/hf_kernels_causal_conv1d.html
+++ b/causal_conv1d/impls/hf_kernels_causal_conv1d.html
@@ -4106,11 +4106,12 @@ body[data-tool="eraser"] .main-content {
▼ output
▶ uv-logs
|
-Cell: nv | 0.28s
+Cell: nv | 0.21s
| ▶ run
Copy
Raw
-GitHub
+GitHub
+🤗 HF
@@ -4122,7 +4123,7 @@ Cell: nv | 0.28s
-
Thu Oct 30 15:51:43 2025
+Fri Oct 31 20:00:25 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4132,7 @@ Cell: nv | 0.28s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A 27C P8 22W / 350W | 0MiB / 46068MiB | 0% Default |
+| N/A 33C P0 79W / 350W | 0MiB / 46068MiB | 11% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
@@ -4155,11 +4156,12 @@ Cell: nv | 0.28s
▼ output
▶ uv-logs
|
-Cell: benchmark | 5.66s
+Cell: benchmark | 9.11s
| ▶ run
Copy
Raw
-GitHub
+GitHub
+🤗 HF
@@ -4208,19 +4210,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 148.031us 3643.39% 148.031us 148.031us 1
- hf_kernels_causal_conv1d 8.90% 165.322us 99.57% 1.851ms 1.851ms 0.000us 0.00% 5.503us 5.503us 1
- CausalConv1dFn 5.85% 108.724us 90.68% 1.685ms 561.740us 0.000us 0.00% 5.503us 1.834us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.35% 25.159us 81.18% 1.509ms 502.865us 4.063us 100.00% 5.503us 1.834us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3
- Activity Buffer Request 77.32% 1.437ms 77.32% 1.437ms 1.437ms 1.440us 35.44% 1.440us 1.440us 1
- aten::empty_like 0.95% 17.630us 3.65% 67.900us 22.633us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 2.70% 50.270us 2.70% 50.270us 16.757us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 2.50% 46.532us 2.50% 46.532us 15.511us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.43% 7.900us 0.43% 7.900us 7.900us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 180.703us 4446.43% 180.703us 180.703us 1
+ hf_kernels_causal_conv1d 8.48% 160.534us 99.62% 1.886ms 1.886ms 0.000us 0.00% 5.504us 5.504us 1
+ CausalConv1dFn 6.47% 122.423us 91.15% 1.726ms 575.261us 0.000us 0.00% 5.504us 1.835us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.51% 28.612us 80.84% 1.531ms 510.207us 4.064us 100.00% 5.504us 1.835us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.064us 100.00% 4.064us 1.355us 3
+ Activity Buffer Request 76.71% 1.452ms 76.71% 1.452ms 1.452ms 1.440us 35.43% 1.440us 1.440us 1
+ aten::empty_like 1.07% 20.220us 3.84% 72.741us 24.247us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 2.77% 52.521us 2.77% 52.521us 17.507us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 2.62% 49.571us 2.62% 49.571us 16.524us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.38% 7.101us 0.38% 7.101us 7.101us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.858ms
-Self CUDA time total: 4.063us
+Self CPU time total: 1.893ms
+Self CUDA time total: 4.064us
@@ -4230,19 +4232,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.926us 3229.86% 120.926us 120.926us 1
- hf_kernels_causal_conv1d 5.72% 96.561us 99.68% 1.683ms 1.683ms 0.000us 0.00% 4.992us 4.992us 1
- CausalConv1dFn 4.27% 72.072us 93.97% 1.587ms 528.936us 0.000us 0.00% 4.992us 1.664us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.50% 25.350us 87.84% 1.483ms 494.459us 3.744us 100.00% 4.992us 1.664us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.744us 100.00% 3.744us 1.248us 3
- Activity Buffer Request 84.49% 1.427ms 84.49% 1.427ms 1.427ms 1.248us 33.33% 1.248us 1.248us 1
- aten::empty_like 0.48% 8.160us 1.86% 31.360us 10.453us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.37% 23.200us 1.37% 23.200us 7.733us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 1.85% 31.292us 1.85% 31.292us 10.431us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.32% 5.320us 0.32% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 125.791us 3331.33% 125.791us 125.791us 1
+ hf_kernels_causal_conv1d 5.58% 96.392us 99.64% 1.721ms 1.721ms 0.000us 0.00% 5.056us 5.056us 1
+ CausalConv1dFn 4.40% 76.074us 94.06% 1.625ms 541.671us 0.000us 0.00% 5.056us 1.685us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.52% 26.231us 87.95% 1.519ms 506.473us 3.776us 100.00% 5.056us 1.685us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.776us 100.00% 3.776us 1.259us 3
+ Activity Buffer Request 84.56% 1.461ms 84.56% 1.461ms 1.461ms 1.280us 33.90% 1.280us 1.280us 1
+ aten::empty_like 0.44% 7.590us 1.71% 29.520us 9.840us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.27% 21.930us 1.27% 21.930us 7.310us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.87% 32.290us 1.87% 32.290us 10.763us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.36% 6.200us 0.36% 6.200us 6.200us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.689ms
-Self CUDA time total: 3.744us
+Self CPU time total: 1.728ms
+Self CUDA time total: 3.776us
@@ -4252,18 +4254,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.942us 3255.88% 122.942us 122.942us 1
- hf_kernels_causal_conv1d 6.02% 102.400us 99.66% 1.696ms 1.696ms 0.000us 0.00% 5.023us 5.023us 1
- CausalConv1dFn 4.37% 74.304us 93.64% 1.594ms 531.323us 0.000us 0.00% 5.023us 1.674us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.51% 25.778us 87.51% 1.490ms 496.532us 3.776us 100.00% 5.023us 1.674us 3
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 125.758us 3330.46% 125.758us 125.758us 1
+ hf_kernels_causal_conv1d 5.23% 90.742us 99.66% 1.729ms 1.729ms 0.000us 0.00% 5.056us 5.056us 1
+ CausalConv1dFn 4.39% 76.092us 94.43% 1.638ms 546.081us 0.000us 0.00% 5.056us 1.685us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.50% 26.031us 88.31% 1.532ms 510.660us 3.776us 100.00% 5.056us 1.685us 3
void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.776us 100.00% 3.776us 1.259us 3
- Activity Buffer Request 84.19% 1.433ms 84.19% 1.433ms 1.433ms 1.247us 33.02% 1.247us 1.247us 1
- aten::empty_like 0.48% 8.219us 1.77% 30.070us 10.023us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.28% 21.851us 1.28% 21.851us 7.284us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 1.81% 30.742us 1.81% 30.742us 10.247us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.34% 5.821us 0.34% 5.821us 5.821us 0.000us 0.00% 0.000us 0.000us 1
+ Activity Buffer Request 84.98% 1.474ms 84.98% 1.474ms 1.474ms 1.280us 33.90% 1.280us 1.280us 1
+ aten::empty_like 0.47% 8.201us 1.74% 30.171us 10.057us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.27% 21.970us 1.27% 21.970us 7.323us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.83% 31.671us 1.83% 31.671us 10.557us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.34% 5.850us 0.34% 5.850us 5.850us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.702ms
+Self CPU time total: 1.735ms
Self CUDA time total: 3.776us
@@ -4274,19 +4276,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 154.975us 4105.30% 154.975us 154.975us 1
- hf_kernels_causal_conv1d 5.10% 97.113us 99.71% 1.897ms 1.897ms 0.000us 0.00% 5.022us 5.022us 1
- CausalConv1dFn 5.06% 96.320us 94.60% 1.800ms 599.880us 0.000us 0.00% 5.022us 1.674us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.32% 25.153us 87.78% 1.670ms 556.640us 3.775us 100.00% 5.022us 1.674us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.775us 100.00% 3.775us 1.258us 3
- Activity Buffer Request 75.43% 1.435ms 75.43% 1.435ms 1.435ms 1.247us 33.03% 1.247us 1.247us 1
- aten::empty_like 0.48% 9.119us 1.76% 33.400us 11.133us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.28% 24.281us 1.28% 24.281us 8.094us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 11.03% 209.783us 11.03% 209.783us 69.928us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.29% 5.600us 0.29% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 127.584us 3350.42% 127.584us 127.584us 1
+ hf_kernels_causal_conv1d 4.53% 88.983us 99.75% 1.962ms 1.962ms 0.000us 0.00% 5.088us 5.088us 1
+ CausalConv1dFn 3.93% 77.252us 95.23% 1.873ms 624.219us 0.000us 0.00% 5.088us 1.696us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.36% 26.710us 89.83% 1.766ms 588.805us 3.808us 100.00% 5.088us 1.696us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.808us 100.00% 3.808us 1.269us 3
+ Activity Buffer Request 74.34% 1.462ms 74.34% 1.462ms 1.462ms 1.280us 33.61% 1.280us 1.280us 1
+ aten::empty_like 0.41% 8.060us 1.47% 28.990us 9.663us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.06% 20.930us 1.06% 20.930us 6.977us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 14.13% 277.777us 14.13% 277.777us 92.592us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.25% 4.831us 0.25% 4.831us 4.831us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.902ms
-Self CUDA time total: 3.775us
+Self CPU time total: 1.966ms
+Self CUDA time total: 3.808us
@@ -4296,19 +4298,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 127.520us 2656.67% 127.520us 127.520us 1
- hf_kernels_causal_conv1d 5.48% 101.023us 99.67% 1.838ms 1.838ms 0.000us 0.00% 6.400us 6.400us 1
- CausalConv1dFn 4.02% 74.081us 94.20% 1.737ms 579.070us 0.000us 0.00% 6.400us 2.133us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.41% 25.982us 88.51% 1.632ms 544.113us 4.800us 100.00% 6.400us 2.133us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.800us 100.00% 4.800us 1.600us 3
- Activity Buffer Request 78.02% 1.439ms 78.02% 1.439ms 1.439ms 1.600us 33.33% 1.600us 1.600us 1
- aten::empty_like 0.45% 8.310us 1.67% 30.790us 10.263us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.22% 22.480us 1.22% 22.480us 7.493us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 9.08% 167.462us 9.08% 167.462us 55.821us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.33% 6.020us 0.33% 6.020us 6.020us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 126.686us 2639.84% 126.686us 126.686us 1
+ hf_kernels_causal_conv1d 4.55% 87.622us 99.73% 1.920ms 1.920ms 0.000us 0.00% 6.430us 6.430us 1
+ CausalConv1dFn 3.92% 75.482us 95.18% 1.832ms 610.789us 0.000us 0.00% 6.430us 2.143us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.44% 27.663us 89.66% 1.726ms 575.372us 4.799us 100.00% 6.430us 2.143us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.799us 100.00% 4.799us 1.600us 3
+ Activity Buffer Request 74.49% 1.434ms 74.49% 1.434ms 1.434ms 1.631us 33.99% 1.631us 1.631us 1
+ aten::empty_like 0.42% 8.140us 1.60% 30.770us 10.257us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.18% 22.630us 1.18% 22.630us 7.543us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 13.74% 264.526us 13.74% 264.526us 88.175us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.27% 5.120us 0.27% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.844ms
-Self CUDA time total: 4.800us
+Self CPU time total: 1.925ms
+Self CUDA time total: 4.799us
@@ -4318,19 +4320,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 118.208us 2446.36% 118.208us 118.208us 1
- hf_kernels_causal_conv1d 14.10% 77.840us 98.97% 546.449us 546.449us 0.000us 0.00% 6.464us 6.464us 1
- CausalConv1dFn 13.03% 71.942us 84.87% 468.609us 156.203us 0.000us 0.00% 6.464us 2.155us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 4.50% 24.830us 66.59% 367.636us 122.545us 4.832us 100.00% 6.464us 2.155us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.832us 100.00% 4.832us 1.611us 3
- Activity Buffer Request 33.64% 185.743us 33.64% 185.743us 185.743us 1.632us 33.77% 1.632us 1.632us 1
- aten::empty_like 1.44% 7.931us 5.26% 29.031us 9.677us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 3.82% 21.100us 3.82% 21.100us 7.033us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 28.45% 157.063us 28.45% 157.063us 52.354us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.03% 5.680us 1.03% 5.680us 5.680us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 117.083us 2423.58% 117.083us 117.083us 1
+ hf_kernels_causal_conv1d 12.24% 83.203us 99.28% 674.957us 674.957us 0.000us 0.00% 6.463us 6.463us 1
+ CausalConv1dFn 10.43% 70.911us 87.04% 591.754us 197.251us 0.000us 0.00% 6.463us 2.154us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 3.93% 26.710us 72.18% 490.682us 163.561us 4.831us 100.00% 6.463us 2.154us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.831us 100.00% 4.831us 1.610us 3
+ Activity Buffer Request 32.42% 220.416us 32.42% 220.416us 220.416us 1.632us 33.78% 1.632us 1.632us 1
+ aten::empty_like 1.07% 7.270us 4.44% 30.161us 10.054us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.37% 22.891us 3.37% 22.891us 7.630us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 35.83% 243.556us 35.83% 243.556us 81.185us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.72% 4.870us 0.72% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 552.129us
-Self CUDA time total: 4.832us
+Self CPU time total: 679.827us
+Self CUDA time total: 4.831us
@@ -4340,19 +4342,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 129.887us 1226.27% 129.887us 129.887us 1
- hf_kernels_causal_conv1d 5.23% 95.772us 99.69% 1.826ms 1.826ms 0.000us 0.00% 14.144us 14.144us 1
- CausalConv1dFn 4.13% 75.612us 94.46% 1.730ms 576.726us 0.000us 0.00% 14.144us 4.715us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.41% 25.780us 88.71% 1.625ms 541.586us 10.592us 100.00% 14.144us 4.715us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.592us 100.00% 10.592us 3.531us 3
- Activity Buffer Request 78.55% 1.439ms 78.55% 1.439ms 1.439ms 3.552us 33.53% 3.552us 3.552us 1
- aten::empty_like 0.48% 8.780us 1.63% 29.810us 9.937us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.15% 21.030us 1.15% 21.030us 7.010us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 8.75% 160.332us 8.75% 160.332us 53.444us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.31% 5.650us 0.31% 5.650us 5.650us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.381us 1167.35% 124.381us 124.381us 1
+ hf_kernels_causal_conv1d 4.48% 85.542us 99.75% 1.904ms 1.904ms 0.000us 0.00% 14.271us 14.271us 1
+ CausalConv1dFn 3.83% 73.182us 95.27% 1.819ms 606.282us 0.000us 0.00% 14.271us 4.757us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.41% 26.960us 89.88% 1.716ms 571.988us 10.655us 100.00% 14.271us 4.757us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.655us 100.00% 10.655us 3.552us 3
+ Activity Buffer Request 76.01% 1.451ms 76.01% 1.451ms 1.451ms 3.616us 33.94% 3.616us 3.616us 1
+ aten::empty_like 0.43% 8.120us 1.56% 29.700us 9.900us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.13% 21.580us 1.13% 21.580us 7.193us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 12.45% 237.787us 12.45% 237.787us 79.262us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.25% 4.860us 0.25% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.832ms
-Self CUDA time total: 10.592us
+Self CPU time total: 1.909ms
+Self CUDA time total: 10.655us
@@ -4362,19 +4364,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 119.356us 1093.80% 119.356us 119.356us 1
- hf_kernels_causal_conv1d 19.79% 94.221us 98.72% 469.928us 469.928us 0.000us 0.00% 14.592us 14.592us 1
- CausalConv1dFn 14.74% 70.172us 78.93% 375.707us 125.236us 0.000us 0.00% 14.592us 4.864us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.30% 25.240us 58.06% 276.375us 92.125us 10.912us 100.00% 14.592us 4.864us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.912us 100.00% 10.912us 3.637us 3
- Activity Buffer Request 19.79% 94.192us 19.79% 94.192us 94.192us 3.680us 33.72% 3.680us 3.680us 1
- aten::empty_like 1.68% 7.980us 6.13% 29.160us 9.720us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.45% 21.180us 4.45% 21.180us 7.060us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 32.97% 156.943us 32.97% 156.943us 52.314us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.28% 6.090us 1.28% 6.090us 6.090us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.652us 1120.72% 122.652us 122.652us 1
+ hf_kernels_causal_conv1d 12.91% 86.303us 99.27% 663.588us 663.588us 0.000us 0.00% 14.624us 14.624us 1
+ CausalConv1dFn 10.74% 71.821us 86.36% 577.285us 192.428us 0.000us 0.00% 14.624us 4.875us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 3.81% 25.480us 71.21% 476.023us 158.674us 10.944us 100.00% 14.624us 4.875us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.944us 100.00% 10.944us 3.648us 3
+ Activity Buffer Request 32.82% 219.426us 32.82% 219.426us 219.426us 3.680us 33.63% 3.680us 3.680us 1
+ aten::empty_like 1.14% 7.591us 4.40% 29.441us 9.814us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.27% 21.850us 3.27% 21.850us 7.283us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 34.57% 231.117us 34.57% 231.117us 77.039us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.73% 4.900us 0.73% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 476.018us
-Self CUDA time total: 10.912us
+Self CPU time total: 668.488us
+Self CUDA time total: 10.944us
@@ -4384,19 +4386,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 129.375us 1178.71% 129.375us 129.375us 1
- hf_kernels_causal_conv1d 5.38% 99.351us 99.70% 1.840ms 1.840ms 0.000us 0.00% 14.656us 14.656us 1
- CausalConv1dFn 4.01% 73.942us 94.32% 1.740ms 580.087us 0.000us 0.00% 14.656us 4.885us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.38% 25.552us 88.67% 1.636ms 545.346us 10.976us 100.00% 14.656us 4.885us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.976us 100.00% 10.976us 3.659us 3
- Activity Buffer Request 78.64% 1.451ms 78.64% 1.451ms 1.451ms 3.680us 33.53% 3.680us 3.680us 1
- aten::empty_like 0.48% 8.800us 1.64% 30.280us 10.093us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.16% 21.480us 1.16% 21.480us 7.160us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 8.64% 159.392us 8.64% 159.392us 53.131us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.30% 5.531us 0.30% 5.531us 5.531us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 130.430us 1181.43% 130.430us 130.430us 1
+ hf_kernels_causal_conv1d 4.23% 79.341us 99.73% 1.871ms 1.871ms 0.000us 0.00% 14.784us 14.784us 1
+ CausalConv1dFn 4.03% 75.521us 95.50% 1.792ms 597.206us 0.000us 0.00% 14.784us 4.928us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.43% 26.810us 89.82% 1.685ms 561.675us 11.040us 100.00% 14.784us 4.928us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 11.040us 100.00% 11.040us 3.680us 3
+ Activity Buffer Request 77.07% 1.446ms 77.07% 1.446ms 1.446ms 3.744us 33.91% 3.744us 3.744us 1
+ aten::empty_like 0.44% 8.272us 1.66% 31.072us 10.357us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.22% 22.800us 1.22% 22.800us 7.600us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 11.32% 212.286us 11.32% 212.286us 70.762us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.27% 5.130us 0.27% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.845ms
-Self CUDA time total: 10.976us
+Self CPU time total: 1.876ms
+Self CUDA time total: 11.040us
@@ -4406,19 +4408,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 123.679us 1104.47% 123.679us 123.679us 1
- hf_kernels_causal_conv1d 17.75% 87.860us 98.92% 489.618us 489.618us 0.000us 0.00% 14.974us 14.974us 1
- CausalConv1dFn 14.77% 73.091us 81.17% 401.758us 133.919us 0.000us 0.00% 14.974us 4.991us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.42% 26.830us 60.45% 299.195us 99.732us 11.198us 100.00% 14.974us 4.991us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 11.198us 100.00% 11.198us 3.733us 3
- Activity Buffer Request 20.28% 100.392us 20.28% 100.392us 100.392us 3.776us 33.72% 3.776us 3.776us 1
- aten::empty_like 1.69% 8.381us 5.95% 29.472us 9.824us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.26% 21.091us 4.26% 21.091us 7.030us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 34.75% 171.973us 34.75% 171.973us 57.324us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.08% 5.331us 1.08% 5.331us 5.331us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.097us 1060.18% 120.097us 120.097us 1
+ hf_kernels_causal_conv1d 13.35% 76.301us 99.17% 566.674us 566.674us 0.000us 0.00% 15.168us 15.168us 1
+ CausalConv1dFn 12.80% 73.153us 85.81% 490.373us 163.458us 0.000us 0.00% 15.168us 5.056us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.71% 26.911us 68.00% 388.569us 129.523us 11.328us 100.00% 15.168us 5.056us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 11.328us 100.00% 11.328us 3.776us 3
+ Activity Buffer Request 34.49% 197.075us 34.49% 197.075us 197.075us 3.840us 33.90% 3.840us 3.840us 1
+ aten::empty_like 1.29% 7.379us 5.01% 28.651us 9.550us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.72% 21.272us 3.72% 21.272us 7.091us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 28.80% 164.583us 28.80% 164.583us 54.861us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.83% 4.760us 0.83% 4.760us 4.760us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 494.949us
-Self CUDA time total: 11.198us
+Self CPU time total: 571.434us
+Self CUDA time total: 11.328us
@@ -4428,19 +4430,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 132.959us 264.31% 132.959us 132.959us 1
- hf_kernels_causal_conv1d 5.33% 97.801us 99.71% 1.830ms 1.830ms 0.000us 0.00% 83.968us 83.968us 1
- CausalConv1dFn 4.03% 73.903us 94.38% 1.732ms 577.264us 0.000us 0.00% 83.968us 27.989us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.44% 26.339us 88.71% 1.628ms 542.606us 50.304us 100.00% 83.968us 27.989us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 50.304us 100.00% 50.304us 16.768us 3
- Activity Buffer Request 78.52% 1.441ms 78.52% 1.441ms 1.441ms 33.664us 66.92% 33.664us 33.664us 1
- aten::empty_like 0.46% 8.510us 1.64% 30.070us 10.023us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.17% 21.560us 1.17% 21.560us 7.187us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 8.75% 160.594us 8.75% 160.594us 53.531us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.29% 5.400us 0.29% 5.400us 5.400us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 133.919us 265.71% 133.919us 133.919us 1
+ hf_kernels_causal_conv1d 4.38% 80.552us 99.73% 1.836ms 1.836ms 0.000us 0.00% 83.873us 83.873us 1
+ CausalConv1dFn 4.09% 75.353us 95.35% 1.755ms 585.145us 0.000us 0.00% 83.873us 27.958us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.33% 24.410us 89.50% 1.648ms 549.264us 50.401us 100.00% 83.873us 27.958us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 50.401us 100.00% 50.401us 16.800us 3
+ Activity Buffer Request 79.01% 1.455ms 79.01% 1.455ms 1.455ms 33.472us 66.41% 33.472us 33.472us 1
+ aten::empty_like 0.45% 8.369us 1.75% 32.290us 10.763us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.30% 23.921us 1.30% 23.921us 7.974us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 9.17% 168.764us 9.17% 168.764us 56.255us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.27% 5.020us 0.27% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.835ms
-Self CUDA time total: 50.304us
+Self CPU time total: 1.841ms
+Self CUDA time total: 50.401us
@@ -4450,18 +4452,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 125.085us 244.46% 125.085us 125.085us 1
- hf_kernels_causal_conv1d 15.91% 74.080us 98.78% 459.898us 459.898us 0.000us 0.00% 85.694us 85.694us 1
- CausalConv1dFn 15.58% 72.521us 82.87% 385.818us 128.606us 0.000us 0.00% 85.694us 28.565us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.92% 27.572us 61.05% 284.236us 94.745us 51.167us 100.00% 85.694us 28.565us 3
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 131.005us 256.03% 131.005us 131.005us 1
+ hf_kernels_causal_conv1d 11.69% 77.241us 99.25% 655.717us 655.717us 0.000us 0.00% 85.534us 85.534us 1
+ CausalConv1dFn 10.97% 72.503us 87.56% 578.476us 192.825us 0.000us 0.00% 85.534us 28.511us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 3.89% 25.692us 71.76% 474.103us 158.034us 51.167us 100.00% 85.534us 28.511us 3
void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 51.167us 100.00% 51.167us 17.056us 3
- Activity Buffer Request 21.78% 101.412us 21.78% 101.412us 101.412us 34.527us 67.48% 34.527us 34.527us 1
- aten::empty_like 1.68% 7.830us 6.24% 29.061us 9.687us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.56% 21.231us 4.56% 21.231us 7.077us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 33.35% 155.252us 33.35% 155.252us 51.751us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.22% 5.680us 1.22% 5.680us 5.680us 0.000us 0.00% 0.000us 0.000us 1
+ Activity Buffer Request 43.08% 284.587us 43.08% 284.587us 284.587us 34.367us 67.17% 34.367us 34.367us 1
+ aten::empty_like 1.14% 7.549us 4.82% 31.870us 10.623us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.68% 24.321us 3.68% 24.321us 8.107us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 24.80% 163.824us 24.80% 163.824us 54.608us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.75% 4.929us 0.75% 4.929us 4.929us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 465.578us
+Self CPU time total: 660.646us
Self CUDA time total: 51.167us
@@ -4472,19 +4474,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 123.583us 3164.74% 123.583us 123.583us 1
- hf_kernels_causal_conv1d 8.70% 75.560us 99.36% 863.215us 863.215us 0.000us 0.00% 5.153us 5.153us 1
- CausalConv1dFn 8.33% 72.353us 90.66% 787.655us 262.552us 0.000us 0.00% 5.153us 1.718us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 2.88% 25.000us 78.85% 685.062us 228.354us 3.905us 100.00% 5.153us 1.718us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.905us 100.00% 3.905us 1.302us 3
- Activity Buffer Request 57.61% 500.499us 57.61% 500.499us 500.499us 1.248us 31.96% 1.248us 1.248us 1
- aten::empty_like 0.96% 8.370us 3.48% 30.240us 10.080us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 2.52% 21.870us 2.52% 21.870us 7.290us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 18.37% 159.563us 18.37% 159.563us 53.188us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.64% 5.560us 0.64% 5.560us 5.560us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 118.686us 3040.89% 118.686us 118.686us 1
+ hf_kernels_causal_conv1d 11.60% 73.750us 99.24% 631.216us 631.216us 0.000us 0.00% 5.183us 5.183us 1
+ CausalConv1dFn 11.30% 71.845us 87.65% 557.466us 185.822us 0.000us 0.00% 5.183us 1.728us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.22% 26.861us 71.87% 457.101us 152.367us 3.903us 100.00% 5.183us 1.728us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.903us 100.00% 3.903us 1.301us 3
+ Activity Buffer Request 42.38% 269.577us 42.38% 269.577us 269.577us 1.280us 32.80% 1.280us 1.280us 1
+ aten::empty_like 1.23% 7.810us 4.48% 28.520us 9.507us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.26% 20.710us 3.26% 20.710us 6.903us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 25.26% 160.663us 25.26% 160.663us 53.554us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.76% 4.821us 0.76% 4.821us 4.821us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 868.775us
-Self CUDA time total: 3.905us
+Self CPU time total: 636.037us
+Self CUDA time total: 3.903us
@@ -4494,19 +4496,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 118.845us 3044.19% 118.845us 118.845us 1
- hf_kernels_causal_conv1d 16.55% 74.260us 98.76% 443.077us 443.077us 0.000us 0.00% 5.152us 5.152us 1
- CausalConv1dFn 15.87% 71.182us 82.21% 368.817us 122.939us 0.000us 0.00% 5.152us 1.717us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.48% 24.591us 59.34% 266.204us 88.735us 3.904us 100.00% 5.152us 1.717us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.904us 100.00% 3.904us 1.301us 3
- Activity Buffer Request 18.72% 83.961us 18.72% 83.961us 83.961us 1.248us 31.97% 1.248us 1.248us 1
- aten::empty_like 1.83% 8.189us 7.01% 31.431us 10.477us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 5.18% 23.242us 5.18% 23.242us 7.747us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 35.14% 157.652us 35.14% 157.652us 52.551us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.24% 5.551us 1.24% 5.551us 5.551us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.221us 3029.76% 120.221us 120.221us 1
+ hf_kernels_causal_conv1d 13.01% 75.082us 99.09% 571.775us 571.775us 0.000us 0.00% 5.248us 5.248us 1
+ CausalConv1dFn 12.35% 71.241us 86.08% 496.693us 165.564us 0.000us 0.00% 5.248us 1.749us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.88% 28.181us 68.58% 395.720us 131.907us 3.968us 100.00% 5.248us 1.749us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3
+ Activity Buffer Request 36.26% 209.246us 36.26% 209.246us 209.246us 1.280us 32.26% 1.280us 1.280us 1
+ aten::empty_like 1.42% 8.172us 5.15% 29.732us 9.911us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.74% 21.560us 3.74% 21.560us 7.187us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 27.43% 158.293us 27.43% 158.293us 52.764us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.91% 5.270us 0.91% 5.270us 5.270us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 448.628us
-Self CUDA time total: 3.904us
+Self CPU time total: 577.045us
+Self CUDA time total: 3.968us
@@ -4516,19 +4518,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.816us 3046.03% 122.816us 122.816us 1
- hf_kernels_causal_conv1d 8.66% 75.390us 99.38% 865.505us 865.505us 0.000us 0.00% 5.376us 5.376us 1
- CausalConv1dFn 8.40% 73.201us 90.72% 790.115us 263.372us 0.000us 0.00% 5.376us 1.792us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 3.02% 26.261us 78.90% 687.193us 229.064us 4.032us 100.00% 5.376us 1.792us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.032us 100.00% 4.032us 1.344us 3
- Activity Buffer Request 57.07% 497.089us 57.07% 497.089us 497.089us 1.344us 33.33% 1.344us 1.344us 1
- aten::empty_like 0.93% 8.130us 3.41% 29.721us 9.907us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 2.48% 21.591us 2.48% 21.591us 7.197us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 18.81% 163.843us 18.81% 163.843us 54.614us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.62% 5.440us 0.62% 5.440us 5.440us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 117.374us 2843.36% 117.374us 117.374us 1
+ hf_kernels_causal_conv1d 14.38% 74.792us 98.97% 514.843us 514.843us 0.000us 0.00% 5.504us 5.504us 1
+ CausalConv1dFn 13.25% 68.940us 84.59% 440.051us 146.684us 0.000us 0.00% 5.504us 1.835us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.99% 25.981us 65.51% 340.779us 113.593us 4.128us 100.00% 5.504us 1.835us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.128us 100.00% 4.128us 1.376us 3
+ Activity Buffer Request 29.84% 155.214us 29.84% 155.214us 155.214us 1.376us 33.33% 1.376us 1.376us 1
+ aten::empty_like 1.55% 8.080us 5.83% 30.332us 10.111us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.28% 22.252us 4.28% 22.252us 7.417us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 30.68% 159.584us 30.68% 159.584us 53.195us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.03% 5.380us 1.03% 5.380us 5.380us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 870.945us
-Self CUDA time total: 4.032us
+Self CPU time total: 520.223us
+Self CUDA time total: 4.128us
@@ -4538,18 +4540,18 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 116.446us 2866.01% 116.446us 116.446us 1
- hf_kernels_causal_conv1d 16.24% 74.671us 98.84% 454.378us 454.378us 0.000us 0.00% 5.407us 5.407us 1
- CausalConv1dFn 15.28% 70.221us 82.60% 379.707us 126.569us 0.000us 0.00% 5.407us 1.802us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.99% 27.540us 61.00% 280.405us 93.468us 4.063us 100.00% 5.407us 1.802us 3
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 116.831us 2875.49% 116.831us 116.831us 1
+ hf_kernels_causal_conv1d 13.78% 75.282us 99.09% 541.484us 541.484us 0.000us 0.00% 5.439us 5.439us 1
+ CausalConv1dFn 12.58% 68.741us 85.32% 466.202us 155.401us 0.000us 0.00% 5.439us 1.813us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.76% 26.021us 67.34% 367.980us 122.660us 4.063us 100.00% 5.439us 1.813us 3
void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3
- Activity Buffer Request 21.14% 97.192us 21.14% 97.192us 97.192us 1.344us 33.08% 1.344us 1.344us 1
- aten::empty_like 1.73% 7.931us 6.33% 29.081us 9.694us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.60% 21.150us 4.60% 21.150us 7.050us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 33.86% 155.673us 33.86% 155.673us 51.891us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.16% 5.330us 1.16% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1
+ Activity Buffer Request 33.52% 183.175us 33.52% 183.175us 183.175us 1.376us 33.87% 1.376us 1.376us 1
+ aten::empty_like 1.37% 7.489us 5.40% 29.481us 9.827us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.02% 21.992us 4.02% 21.992us 7.331us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 29.06% 158.784us 29.06% 158.784us 52.928us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.91% 4.951us 0.91% 4.951us 4.951us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 459.708us
+Self CPU time total: 546.435us
Self CUDA time total: 4.063us
@@ -4560,19 +4562,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.895us 2262.26% 120.895us 120.895us 1
- hf_kernels_causal_conv1d 10.03% 75.040us 99.26% 742.432us 742.432us 0.000us 0.00% 7.136us 7.136us 1
- CausalConv1dFn 9.57% 71.601us 89.23% 667.392us 222.464us 0.000us 0.00% 7.136us 2.379us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 3.57% 26.722us 75.60% 565.480us 188.493us 5.344us 100.00% 7.136us 2.379us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.344us 100.00% 5.344us 1.781us 3
- Activity Buffer Request 50.95% 381.056us 50.95% 381.056us 381.056us 1.792us 33.53% 1.792us 1.792us 1
- aten::empty_like 1.09% 8.161us 4.05% 30.311us 10.104us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 2.96% 22.150us 2.96% 22.150us 7.383us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 21.08% 157.702us 21.08% 157.702us 52.567us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.74% 5.510us 0.74% 5.510us 5.510us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 119.806us 2228.53% 119.806us 119.806us 1
+ hf_kernels_causal_conv1d 11.93% 76.073us 99.21% 632.507us 632.507us 0.000us 0.00% 7.200us 7.200us 1
+ CausalConv1dFn 11.21% 71.480us 87.28% 556.434us 185.478us 0.000us 0.00% 7.200us 2.400us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.13% 26.361us 71.46% 455.612us 151.871us 5.376us 100.00% 7.200us 2.400us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.376us 100.00% 5.376us 1.792us 3
+ Activity Buffer Request 42.49% 270.867us 42.49% 270.867us 270.867us 1.824us 33.93% 1.824us 1.824us 1
+ aten::empty_like 1.24% 7.892us 4.60% 29.342us 9.781us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.36% 21.450us 3.36% 21.450us 7.150us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 24.84% 158.384us 24.84% 158.384us 52.795us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.79% 5.050us 0.79% 5.050us 5.050us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 747.942us
-Self CUDA time total: 5.344us
+Self CPU time total: 637.557us
+Self CUDA time total: 5.376us
@@ -4582,19 +4584,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 114.428us 2091.54% 114.428us 114.428us 1
- hf_kernels_causal_conv1d 15.93% 72.612us 98.81% 450.477us 450.477us 0.000us 0.00% 7.327us 7.327us 1
- CausalConv1dFn 15.28% 69.671us 82.88% 377.865us 125.955us 0.000us 0.00% 7.327us 2.442us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.81% 26.480us 61.42% 279.994us 93.331us 5.471us 100.00% 7.327us 2.442us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.471us 100.00% 5.471us 1.824us 3
- Activity Buffer Request 21.45% 97.772us 21.45% 97.772us 97.772us 1.856us 33.92% 1.856us 1.856us 1
- aten::empty_like 1.75% 7.980us 6.19% 28.200us 9.400us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.44% 20.220us 4.44% 20.220us 6.740us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 34.16% 155.742us 34.16% 155.742us 51.914us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.19% 5.420us 1.19% 5.420us 5.420us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 119.676us 2174.35% 119.676us 119.676us 1
+ hf_kernels_causal_conv1d 14.25% 74.352us 99.01% 516.513us 516.513us 0.000us 0.00% 7.392us 7.392us 1
+ CausalConv1dFn 14.02% 73.122us 84.76% 442.161us 147.387us 0.000us 0.00% 7.392us 2.464us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.04% 26.281us 65.18% 340.038us 113.346us 5.504us 100.00% 7.392us 2.464us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.504us 100.00% 5.504us 1.835us 3
+ Activity Buffer Request 30.19% 157.524us 30.19% 157.524us 157.524us 1.888us 34.30% 1.888us 1.888us 1
+ aten::empty_like 1.50% 7.800us 5.56% 29.001us 9.667us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.06% 21.201us 4.06% 21.201us 7.067us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 29.95% 156.233us 29.95% 156.233us 52.078us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.99% 5.180us 0.99% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 455.897us
-Self CUDA time total: 5.471us
+Self CPU time total: 521.693us
+Self CUDA time total: 5.504us
@@ -4604,19 +4606,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.251us 717.80% 124.251us 124.251us 1
- hf_kernels_causal_conv1d 10.05% 75.520us 99.24% 745.563us 745.563us 0.000us 0.00% 23.101us 23.101us 1
- CausalConv1dFn 9.33% 70.111us 89.19% 670.043us 223.348us 0.000us 0.00% 23.101us 7.700us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 3.43% 25.770us 75.92% 570.342us 190.114us 17.310us 100.00% 23.101us 7.700us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.310us 100.00% 17.310us 5.770us 3
- Activity Buffer Request 51.18% 384.497us 51.18% 384.497us 384.497us 5.791us 33.45% 5.791us 5.791us 1
- aten::empty_like 1.14% 8.540us 3.94% 29.590us 9.863us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 2.80% 21.050us 2.80% 21.050us 7.017us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 21.31% 160.075us 21.31% 160.075us 53.358us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.76% 5.680us 0.76% 5.680us 5.680us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.798us 715.63% 124.798us 124.798us 1
+ hf_kernels_causal_conv1d 11.85% 75.293us 99.15% 630.167us 630.167us 0.000us 0.00% 23.295us 23.295us 1
+ CausalConv1dFn 11.06% 70.310us 87.30% 554.874us 184.958us 0.000us 0.00% 23.295us 7.765us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.18% 26.540us 71.39% 453.732us 151.244us 17.439us 100.00% 23.295us 7.765us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.439us 100.00% 17.439us 5.813us 3
+ Activity Buffer Request 42.20% 268.237us 42.20% 268.237us 268.237us 5.856us 33.58% 5.856us 5.856us 1
+ aten::empty_like 1.25% 7.951us 4.85% 30.832us 10.277us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.60% 22.881us 3.60% 22.881us 7.627us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 25.01% 158.955us 25.01% 158.955us 52.985us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.85% 5.410us 0.85% 5.410us 5.410us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 751.243us
-Self CUDA time total: 17.310us
+Self CPU time total: 635.577us
+Self CUDA time total: 17.439us
@@ -4626,19 +4628,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 121.596us 682.20% 121.596us 121.596us 1
- hf_kernels_causal_conv1d 16.81% 75.551us 98.76% 443.797us 443.797us 0.000us 0.00% 23.808us 23.808us 1
- CausalConv1dFn 15.22% 68.400us 81.95% 368.246us 122.749us 0.000us 0.00% 23.808us 7.936us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.83% 26.181us 60.07% 269.934us 89.978us 17.824us 100.00% 23.808us 7.936us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.824us 100.00% 17.824us 5.941us 3
- Activity Buffer Request 19.24% 86.441us 19.24% 86.441us 86.441us 5.984us 33.57% 5.984us 5.984us 1
- aten::empty_like 1.76% 7.900us 6.66% 29.912us 9.971us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.90% 22.012us 4.90% 22.012us 7.337us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 35.01% 157.312us 35.01% 157.312us 52.437us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.24% 5.550us 1.24% 5.550us 5.550us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.252us 695.89% 124.252us 124.252us 1
+ hf_kernels_causal_conv1d 15.28% 76.213us 99.04% 494.053us 494.053us 0.000us 0.00% 23.839us 23.839us 1
+ CausalConv1dFn 14.60% 72.841us 83.76% 417.840us 139.280us 0.000us 0.00% 23.839us 7.946us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.38% 26.851us 63.27% 315.607us 105.202us 17.855us 100.00% 23.839us 7.946us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.855us 100.00% 17.855us 5.952us 3
+ Activity Buffer Request 26.40% 131.703us 26.40% 131.703us 131.703us 5.984us 33.51% 5.984us 5.984us 1
+ aten::empty_like 1.62% 8.090us 5.89% 29.392us 9.797us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.27% 21.302us 4.27% 21.302us 7.101us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 31.48% 157.053us 31.48% 157.053us 52.351us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.96% 4.810us 0.96% 4.810us 4.810us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 449.347us
-Self CUDA time total: 17.824us
+Self CPU time total: 498.863us
+Self CUDA time total: 17.855us
@@ -4648,19 +4650,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.077us 686.13% 122.077us 122.077us 1
- hf_kernels_causal_conv1d 12.00% 91.181us 99.29% 754.243us 754.243us 0.000us 0.00% 23.808us 23.808us 1
- CausalConv1dFn 9.45% 71.802us 87.29% 663.062us 221.021us 0.000us 0.00% 23.808us 7.936us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 3.27% 24.831us 73.88% 561.180us 187.060us 17.792us 100.00% 23.808us 7.936us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.792us 100.00% 17.792us 5.931us 3
- Activity Buffer Request 49.89% 378.947us 49.89% 378.947us 378.947us 6.016us 33.81% 6.016us 6.016us 1
- aten::empty_like 1.06% 8.020us 3.96% 30.080us 10.027us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 2.90% 22.060us 2.90% 22.060us 7.353us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 20.72% 157.402us 20.72% 157.402us 52.467us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.71% 5.381us 0.71% 5.381us 5.381us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.253us 695.94% 124.253us 124.253us 1
+ hf_kernels_causal_conv1d 14.09% 92.581us 99.22% 652.096us 652.096us 0.000us 0.00% 23.838us 23.838us 1
+ CausalConv1dFn 11.45% 75.254us 85.13% 559.515us 186.505us 0.000us 0.00% 23.838us 7.946us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 3.84% 25.251us 69.30% 455.481us 151.827us 17.854us 100.00% 23.838us 7.946us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.854us 100.00% 17.854us 5.951us 3
+ Activity Buffer Request 41.42% 272.247us 41.42% 272.247us 272.247us 5.984us 33.52% 5.984us 5.984us 1
+ aten::empty_like 1.19% 7.849us 4.38% 28.780us 9.593us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.18% 20.931us 3.18% 20.931us 6.977us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 24.04% 157.983us 24.04% 157.983us 52.661us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.78% 5.140us 0.78% 5.140us 5.140us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 759.624us
-Self CUDA time total: 17.792us
+Self CPU time total: 657.236us
+Self CUDA time total: 17.854us
@@ -4670,19 +4672,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 124.351us 671.15% 124.351us 124.351us 1
- hf_kernels_causal_conv1d 19.13% 92.321us 98.80% 476.748us 476.748us 0.000us 0.00% 24.736us 24.736us 1
- CausalConv1dFn 14.83% 71.551us 79.67% 384.427us 128.142us 0.000us 0.00% 24.736us 8.245us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.89% 28.409us 58.58% 282.676us 94.225us 18.528us 100.00% 24.736us 8.245us 3
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 18.528us 100.00% 18.528us 6.176us 3
- Activity Buffer Request 20.26% 97.782us 20.26% 97.782us 97.782us 6.208us 33.51% 6.208us 6.208us 1
- aten::empty_like 1.73% 8.360us 6.26% 30.200us 10.067us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.53% 21.840us 4.53% 21.840us 7.280us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 32.43% 156.485us 32.43% 156.485us 52.162us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.20% 5.770us 1.20% 5.770us 5.770us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 121.982us 651.61% 121.982us 121.982us 1
+ hf_kernels_causal_conv1d 16.26% 76.273us 99.00% 464.343us 464.343us 0.000us 0.00% 25.088us 25.088us 1
+ CausalConv1dFn 15.20% 71.302us 82.74% 388.070us 129.357us 0.000us 0.00% 25.088us 8.363us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.49% 25.750us 61.15% 286.808us 95.603us 18.720us 100.00% 25.088us 8.363us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 18.720us 100.00% 18.720us 6.240us 3
+ Activity Buffer Request 22.13% 103.813us 22.13% 103.813us 103.813us 6.368us 34.02% 6.368us 6.368us 1
+ aten::empty_like 1.75% 8.210us 6.39% 29.960us 9.987us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.64% 21.750us 4.64% 21.750us 7.250us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 33.53% 157.245us 33.53% 157.245us 52.415us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.00% 4.680us 1.00% 4.680us 4.680us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 482.518us
-Self CUDA time total: 18.528us
+Self CPU time total: 469.023us
+Self CUDA time total: 18.720us
@@ -4692,19 +4694,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 5.47% 101.271us 99.69% 1.845ms 1.845ms 0.000us 0.00% 162.913us 162.913us 1
- CausalConv1dFn 4.05% 75.021us 94.22% 1.743ms 581.104us 0.000us 0.00% 162.913us 54.304us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 1.32% 24.372us 88.46% 1.637ms 545.603us 97.697us 100.00% 162.913us 54.304us 3
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 139.807us 143.10% 139.807us 139.807us 1
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 97.697us 100.00% 97.697us 32.566us 3
- Activity Buffer Request 78.43% 1.451ms 78.43% 1.451ms 1.451ms 65.216us 66.75% 65.216us 65.216us 1
- aten::empty_like 0.45% 8.320us 1.70% 31.480us 10.493us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 1.25% 23.160us 1.25% 23.160us 7.720us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 8.71% 161.192us 8.71% 161.192us 53.731us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 0.31% 5.721us 0.31% 5.721us 5.721us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 4.40% 80.973us 99.73% 1.837ms 1.837ms 0.000us 0.00% 162.749us 162.749us 1
+ CausalConv1dFn 4.14% 76.301us 95.33% 1.756ms 585.285us 0.000us 0.00% 162.749us 54.250us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.45% 26.730us 89.50% 1.648ms 549.474us 97.918us 100.00% 162.749us 54.250us 3
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 141.950us 144.97% 141.950us 141.950us 1
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 97.918us 100.00% 97.918us 32.639us 3
+ Activity Buffer Request 78.99% 1.455ms 78.99% 1.455ms 1.455ms 64.831us 66.21% 64.831us 64.831us 1
+ aten::empty_like 0.45% 8.340us 1.69% 31.131us 10.377us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.24% 22.791us 1.24% 22.791us 7.597us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 9.06% 166.885us 9.06% 166.885us 55.628us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.27% 4.980us 0.27% 4.980us 4.980us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.850ms
-Self CUDA time total: 97.697us
+Self CPU time total: 1.842ms
+Self CUDA time total: 97.918us
@@ -4714,19 +4716,19 @@ PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_causal_conv1d 19.60% 95.701us 98.90% 482.848us 482.848us 0.000us 0.00% 163.744us 163.744us 1
- CausalConv1dFn 15.21% 74.281us 79.29% 387.147us 129.049us 0.000us 0.00% 163.744us 54.581us 3
- _causal_conv1d_90f5a60::causal_conv1d_fwd 5.67% 27.701us 57.93% 282.846us 94.282us 98.688us 100.00% 163.744us 54.581us 3
- hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 139.968us 141.83% 139.968us 139.968us 1
-void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 98.688us 100.00% 98.688us 32.896us 3
- Activity Buffer Request 19.94% 97.362us 19.94% 97.362us 97.362us 65.056us 65.92% 65.056us 65.056us 1
- aten::empty_like 1.68% 8.190us 6.15% 30.020us 10.007us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 4.47% 21.830us 4.47% 21.830us 7.277us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 32.32% 157.783us 32.32% 157.783us 52.594us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 1.10% 5.391us 1.10% 5.391us 5.391us 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_causal_conv1d 16.07% 76.871us 98.94% 473.172us 473.172us 0.000us 0.00% 163.803us 163.803us 1
+ CausalConv1dFn 14.96% 71.532us 82.87% 396.301us 132.100us 0.000us 0.00% 163.803us 54.601us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.75% 27.501us 61.56% 294.418us 98.139us 98.685us 100.00% 163.803us 54.601us 3
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 133.180us 134.95% 133.180us 133.180us 1
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 98.685us 100.00% 98.685us 32.895us 3
+ Activity Buffer Request 21.65% 103.543us 21.65% 103.543us 103.543us 65.118us 65.99% 65.118us 65.118us 1
+ aten::empty_like 1.52% 7.251us 6.35% 30.351us 10.117us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.83% 23.100us 4.83% 23.100us 7.700us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 34.16% 163.374us 34.16% 163.374us 54.458us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.06% 5.061us 1.06% 5.061us 5.061us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 488.239us
-Self CUDA time total: 98.688us
+Self CPU time total: 478.233us
+Self CUDA time total: 98.685us
impl wl p50(ms) ok
@@ -4758,13 +4760,13 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
-Installed 15 packages in 14ms
+Installed 52 packages in 240ms
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
-Fetching 11 files: 9%|▉ | 1/11 [00:00<00:01, 6.41it/s]
-Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:00, 4.26it/s]
-Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 6.78it/s]
+Fetching 11 files: 9%|▉ | 1/11 [00:00<00:01, 9.42it/s]
+Fetching 11 files: 64%|██████▎ | 7/11 [00:01<00:00, 4.98it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 7.98it/s]
Artifacts:
causal_conv1d.jsonl
diff --git a/causal_conv1d/impls/torch_causal_conv1d.html b/causal_conv1d/impls/torch_causal_conv1d.html
index afa77a96b2a763e56f05f85b5cc1eef91c17fd17..6358d2b943cf22bb9f31aeb2e669932f13397132 100644
--- a/causal_conv1d/impls/torch_causal_conv1d.html
+++ b/causal_conv1d/impls/torch_causal_conv1d.html
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
▼ output
▶ uv-logs
|
-Cell: nv | 0.28s
+Cell: nv | 0.21s
|
▶ run
Copy
Raw
-
GitHub
+
GitHub
@@ -4122,7 +4122,7 @@ Cell: nv | 0.28s
-
Thu Oct 30 15:51:43 2025
+Fri Oct 31 20:00:25 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
@@ -4131,7 +4131,7 @@ Cell: nv | 0.28s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A 27C P8 22W / 350W | 0MiB / 46068MiB | 0% Default |
+| N/A 33C P0 79W / 350W | 0MiB / 46068MiB | 11% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
@@ -4153,13 +4153,13 @@ Cell: nv | 0.28s
▼ code
▼ output
- ▶ uv-logs
+ ▶ uv-logs
|
-Cell: benchmark | 32.46s
+Cell: benchmark | 3.68s
| ▶ run
Copy
Raw
-GitHub
+GitHub
@@ -4217,29 +4217,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 467.230us 2421.38% 467.230us 467.230us 1
- torch_eager 10.72% 231.062us 99.69% 2.148ms 2.148ms 0.000us 0.00% 21.632us 21.632us 1
- aten::to 0.58% 12.480us 78.88% 1.700ms 283.277us 0.000us 0.00% 14.336us 2.389us 6
- aten::_to_copy 2.05% 44.092us 78.31% 1.687ms 281.197us 0.000us 0.00% 14.336us 2.389us 6
- aten::copy_ 3.07% 66.050us 73.46% 1.583ms 263.783us 12.000us 62.19% 14.336us 2.389us 6
- aten::conv1d 0.49% 10.600us 7.90% 170.164us 56.721us 0.000us 0.00% 7.296us 2.432us 3
- aten::convolution 0.77% 16.490us 7.41% 159.564us 53.188us 0.000us 0.00% 7.296us 2.432us 3
- aten::_convolution 1.64% 35.301us 6.64% 143.074us 47.691us 0.000us 0.00% 7.296us 2.432us 3
- aten::_conv_depthwise2d 1.69% 36.381us 4.00% 86.271us 28.757us 7.296us 37.81% 7.296us 2.432us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.296us 37.81% 7.296us 2.432us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.67% 6.304us 2.101us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 29.52% 5.696us 1.899us 3
- Activity Buffer Request 66.85% 1.440ms 66.85% 1.440ms 1.440ms 2.336us 12.11% 2.336us 2.336us 1
- aten::empty_strided 2.80% 60.390us 2.80% 60.390us 10.065us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 4.73% 101.823us 4.73% 101.823us 11.314us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 1.46% 31.451us 1.84% 39.731us 4.415us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.62% 13.289us 0.62% 13.289us 0.886us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.58% 12.560us 0.58% 12.560us 4.187us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.54% 11.740us 0.54% 11.740us 3.913us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.42% 8.963us 0.49% 10.602us 3.534us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 439.324us 2269.12% 439.324us 439.324us 1
+ torch_eager 10.31% 220.478us 99.69% 2.131ms 2.131ms 0.000us 0.00% 21.729us 21.729us 1
+ aten::to 0.50% 10.770us 79.87% 1.707ms 284.530us 0.000us 0.00% 14.369us 2.395us 6
+ aten::_to_copy 1.71% 36.499us 79.36% 1.696ms 282.735us 0.000us 0.00% 14.369us 2.395us 6
+ aten::copy_ 2.77% 59.234us 75.21% 1.608ms 267.930us 12.001us 61.99% 14.369us 2.395us 6
+ aten::conv1d 0.36% 7.590us 7.34% 156.883us 52.294us 0.000us 0.00% 7.360us 2.453us 3
+ aten::convolution 0.66% 14.070us 6.98% 149.293us 49.764us 0.000us 0.00% 7.360us 2.453us 3
+ aten::_convolution 1.51% 32.210us 6.33% 135.223us 45.074us 0.000us 0.00% 7.360us 2.453us 3
+ aten::_conv_depthwise2d 1.61% 34.371us 4.00% 85.463us 28.488us 7.360us 38.01% 7.360us 2.453us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 38.01% 7.360us 2.453us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.337us 32.73% 6.337us 2.112us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.25% 5.664us 1.888us 3
+ Activity Buffer Request 69.37% 1.483ms 69.37% 1.483ms 1.483ms 2.368us 12.23% 2.368us 2.368us 1
+ aten::empty_strided 2.45% 52.331us 2.45% 52.331us 8.722us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 4.26% 91.032us 4.26% 91.032us 10.115us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.32% 28.311us 1.71% 36.491us 4.055us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.64% 13.700us 0.64% 13.700us 0.913us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.60% 12.790us 0.60% 12.790us 4.263us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.59% 12.710us 0.59% 12.710us 4.237us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.31% 6.640us 0.38% 8.090us 2.697us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.155ms
-Self CUDA time total: 19.296us
+Self CPU time total: 2.138ms
+Self CUDA time total: 19.361us
@@ -4249,29 +4249,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 337.566us 1726.42% 337.566us 337.566us 1
- torch_eager 6.86% 130.161us 99.69% 1.893ms 1.893ms 0.000us 0.00% 21.665us 21.665us 1
- aten::to 0.32% 6.060us 85.13% 1.616ms 269.375us 0.000us 0.00% 13.729us 2.288us 6
- aten::_to_copy 1.27% 24.100us 84.81% 1.610ms 268.365us 0.000us 0.00% 13.729us 2.288us 6
- aten::copy_ 2.69% 51.011us 81.95% 1.556ms 259.305us 11.617us 59.41% 13.729us 2.288us 6
- aten::conv1d 0.30% 5.740us 6.23% 118.253us 39.418us 0.000us 0.00% 7.936us 2.645us 3
- aten::convolution 0.52% 9.902us 5.93% 112.513us 37.504us 0.000us 0.00% 7.936us 2.645us 3
- aten::_convolution 1.21% 22.959us 5.40% 102.611us 34.204us 0.000us 0.00% 7.936us 2.645us 3
- aten::_conv_depthwise2d 1.18% 22.461us 3.33% 63.161us 21.054us 7.936us 40.59% 7.936us 2.645us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 40.59% 7.936us 2.645us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.080us 31.09% 6.080us 2.027us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.537us 28.32% 5.537us 1.846us 3
- Activity Buffer Request 76.56% 1.454ms 76.56% 1.454ms 1.454ms 2.112us 10.80% 2.112us 2.112us 1
- aten::empty_strided 1.59% 30.260us 1.59% 30.260us 5.043us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 3.84% 72.993us 3.84% 72.993us 8.110us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.96% 18.220us 1.27% 24.051us 2.672us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.50% 9.451us 0.50% 9.451us 0.630us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.52% 9.960us 0.52% 9.960us 3.320us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.48% 9.030us 0.48% 9.030us 3.010us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.31% 5.890us 0.39% 7.340us 2.447us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.789us 1742.49% 341.789us 341.789us 1
+ torch_eager 7.86% 151.082us 99.71% 1.916ms 1.916ms 0.000us 0.00% 21.695us 21.695us 1
+ aten::to 0.35% 6.661us 83.96% 1.614ms 268.966us 0.000us 0.00% 13.695us 2.282us 6
+ aten::_to_copy 1.29% 24.781us 83.61% 1.607ms 267.856us 0.000us 0.00% 13.695us 2.282us 6
+ aten::copy_ 2.59% 49.784us 80.72% 1.552ms 258.589us 11.615us 59.21% 13.695us 2.282us 6
+ aten::conv1d 0.32% 6.220us 6.35% 122.113us 40.704us 0.000us 0.00% 8.000us 2.667us 3
+ aten::convolution 0.53% 10.120us 6.03% 115.893us 38.631us 0.000us 0.00% 8.000us 2.667us 3
+ aten::_convolution 1.20% 23.080us 5.50% 105.773us 35.258us 0.000us 0.00% 8.000us 2.667us 3
+ aten::_conv_depthwise2d 1.19% 22.952us 3.39% 65.123us 21.708us 8.000us 40.79% 8.000us 2.667us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.000us 40.79% 8.000us 2.667us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.047us 30.83% 6.047us 2.016us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 28.39% 5.568us 1.856us 3
+ Activity Buffer Request 75.54% 1.452ms 75.54% 1.452ms 1.452ms 2.080us 10.60% 2.080us 2.080us 1
+ aten::empty_strided 1.60% 30.820us 1.60% 30.820us 5.137us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.74% 71.953us 3.74% 71.953us 7.995us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.98% 18.881us 1.29% 24.750us 2.750us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.50% 9.609us 0.50% 9.609us 0.641us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.56% 10.750us 0.56% 10.750us 3.583us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.49% 9.339us 0.49% 9.339us 3.113us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.34% 6.630us 0.42% 8.000us 2.667us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.899ms
-Self CUDA time total: 19.553us
+Self CPU time total: 1.922ms
+Self CUDA time total: 19.615us
@@ -4281,29 +4281,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 333.726us 1795.19% 333.726us 333.726us 1
- torch_eager 6.76% 126.472us 99.71% 1.865ms 1.865ms 0.000us 0.00% 20.510us 20.510us 1
- aten::to 0.32% 5.970us 85.12% 1.592ms 265.378us 0.000us 0.00% 13.598us 2.266us 6
- aten::_to_copy 1.26% 23.561us 84.80% 1.586ms 264.383us 0.000us 0.00% 13.598us 2.266us 6
- aten::copy_ 2.75% 51.371us 81.92% 1.532ms 255.399us 11.678us 62.82% 13.598us 2.266us 6
- aten::conv1d 0.31% 5.850us 6.37% 119.083us 39.694us 0.000us 0.00% 6.912us 2.304us 3
- aten::convolution 0.54% 10.170us 6.05% 113.233us 37.744us 0.000us 0.00% 6.912us 2.304us 3
- aten::_convolution 1.25% 23.320us 5.51% 103.063us 34.354us 0.000us 0.00% 6.912us 2.304us 3
- aten::_conv_depthwise2d 1.20% 22.402us 3.41% 63.713us 21.238us 6.912us 37.18% 6.912us 2.304us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.912us 37.18% 6.912us 2.304us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.951us 32.01% 5.951us 1.984us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.727us 30.81% 5.727us 1.909us 3
- Activity Buffer Request 76.63% 1.433ms 76.63% 1.433ms 1.433ms 1.920us 10.33% 1.920us 1.920us 1
- aten::empty_strided 1.62% 30.340us 1.62% 30.340us 5.057us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 3.76% 70.302us 3.76% 70.302us 7.811us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.94% 17.590us 1.23% 22.950us 2.550us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.48% 8.970us 0.48% 8.970us 0.598us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.54% 10.051us 0.54% 10.051us 3.350us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.46% 8.519us 0.46% 8.519us 2.840us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.32% 5.980us 0.39% 7.380us 2.460us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 343.328us 1837.45% 343.328us 343.328us 1
+ torch_eager 7.88% 151.015us 99.69% 1.911ms 1.911ms 0.000us 0.00% 20.605us 20.605us 1
+ aten::to 0.33% 6.409us 84.02% 1.611ms 268.468us 0.000us 0.00% 13.662us 2.277us 6
+ aten::_to_copy 1.32% 25.354us 83.68% 1.604ms 267.400us 0.000us 0.00% 13.662us 2.277us 6
+ aten::copy_ 2.65% 50.770us 80.80% 1.549ms 258.170us 11.742us 62.84% 13.662us 2.277us 6
+ aten::conv1d 0.33% 6.290us 6.34% 121.483us 40.494us 0.000us 0.00% 6.943us 2.314us 3
+ aten::convolution 0.54% 10.430us 6.01% 115.193us 38.398us 0.000us 0.00% 6.943us 2.314us 3
+ aten::_convolution 1.17% 22.439us 5.46% 104.763us 34.921us 0.000us 0.00% 6.943us 2.314us 3
+ aten::_conv_depthwise2d 1.17% 22.412us 3.43% 65.843us 21.948us 6.943us 37.16% 6.943us 2.314us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.943us 37.16% 6.943us 2.314us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.982us 32.01% 5.982us 1.994us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.760us 30.83% 5.760us 1.920us 3
+ Activity Buffer Request 75.50% 1.448ms 75.50% 1.448ms 1.448ms 1.920us 10.28% 1.920us 1.920us 1
+ aten::empty_strided 1.57% 30.029us 1.57% 30.029us 5.005us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.90% 74.680us 3.90% 74.680us 8.298us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.93% 17.782us 1.21% 23.252us 2.584us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.48% 9.281us 0.48% 9.281us 0.619us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.57% 10.910us 0.57% 10.910us 3.637us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.44% 8.531us 0.44% 8.531us 2.844us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.32% 6.170us 0.39% 7.570us 2.523us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.871ms
-Self CUDA time total: 18.590us
+Self CPU time total: 1.917ms
+Self CUDA time total: 18.685us
@@ -4313,29 +4313,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.229us 1732.17% 339.229us 339.229us 1
- torch_eager 6.09% 126.194us 99.75% 2.066ms 2.066ms 0.000us 0.00% 21.729us 21.729us 1
- aten::to 0.29% 6.100us 86.58% 1.793ms 298.900us 0.000us 0.00% 14.018us 2.336us 6
- aten::_to_copy 1.16% 23.990us 86.28% 1.787ms 297.883us 0.000us 0.00% 14.018us 2.336us 6
- aten::copy_ 2.58% 53.448us 83.67% 1.733ms 288.850us 11.873us 60.63% 14.018us 2.336us 6
- aten::conv1d 0.32% 6.580us 5.73% 118.763us 39.588us 0.000us 0.00% 7.711us 2.570us 3
- aten::convolution 0.48% 9.870us 5.42% 112.183us 37.394us 0.000us 0.00% 7.711us 2.570us 3
- aten::_convolution 1.09% 22.580us 4.94% 102.313us 34.104us 0.000us 0.00% 7.711us 2.570us 3
- aten::_conv_depthwise2d 1.08% 22.411us 3.09% 64.033us 21.344us 7.711us 39.37% 7.711us 2.570us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.711us 39.37% 7.711us 2.570us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.145us 31.38% 6.145us 2.048us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.25% 5.728us 1.909us 3
- Activity Buffer Request 69.66% 1.443ms 69.66% 1.443ms 1.443ms 2.145us 10.95% 2.145us 2.145us 1
- aten::empty_strided 1.46% 30.210us 1.46% 30.210us 5.035us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 12.49% 258.686us 12.49% 258.686us 28.743us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.87% 18.050us 1.12% 23.200us 2.578us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.42% 8.720us 0.42% 8.720us 0.581us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.49% 10.140us 0.49% 10.140us 3.380us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.46% 9.442us 0.46% 9.442us 3.147us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.28% 5.830us 0.35% 7.220us 2.407us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.280us 1734.88% 340.280us 340.280us 1
+ torch_eager 6.89% 141.563us 99.72% 2.049ms 2.049ms 0.000us 0.00% 21.726us 21.726us 1
+ aten::to 0.30% 6.132us 85.38% 1.755ms 292.424us 0.000us 0.00% 13.982us 2.330us 6
+ aten::_to_copy 1.19% 24.439us 85.08% 1.748ms 291.402us 0.000us 0.00% 13.982us 2.330us 6
+ aten::copy_ 2.50% 51.302us 82.39% 1.693ms 282.182us 11.870us 60.52% 13.982us 2.330us 6
+ aten::conv1d 0.29% 5.930us 5.97% 122.723us 40.908us 0.000us 0.00% 7.744us 2.581us 3
+ aten::convolution 0.50% 10.300us 5.68% 116.793us 38.931us 0.000us 0.00% 7.744us 2.581us 3
+ aten::_convolution 1.17% 23.960us 5.18% 106.493us 35.498us 0.000us 0.00% 7.744us 2.581us 3
+ aten::_conv_depthwise2d 1.08% 22.141us 3.19% 65.452us 21.817us 7.744us 39.48% 7.744us 2.581us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 39.48% 7.744us 2.581us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.143us 31.32% 6.143us 2.048us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.727us 29.20% 5.727us 1.909us 3
+ Activity Buffer Request 70.00% 1.438ms 70.00% 1.438ms 1.438ms 2.112us 10.77% 2.112us 2.112us 1
+ aten::empty_strided 1.50% 30.881us 1.50% 30.881us 5.147us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 11.01% 226.194us 11.01% 226.194us 25.133us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.89% 18.302us 1.19% 24.432us 2.715us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.49% 9.981us 0.49% 9.981us 0.665us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.55% 11.260us 0.55% 11.260us 3.753us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.45% 9.171us 0.45% 9.171us 3.057us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.32% 6.620us 0.39% 8.030us 2.677us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.071ms
-Self CUDA time total: 19.584us
+Self CPU time total: 2.055ms
+Self CUDA time total: 19.614us
@@ -4345,29 +4345,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 342.208us 1399.74% 342.208us 342.208us 1
- torch_eager 6.21% 125.160us 99.74% 2.012ms 2.012ms 0.000us 0.00% 26.720us 26.720us 1
- aten::to 0.29% 5.910us 86.35% 1.742ms 290.270us 0.000us 0.00% 15.168us 2.528us 6
- aten::_to_copy 1.25% 25.122us 86.06% 1.736ms 289.285us 0.000us 0.00% 15.168us 2.528us 6
- aten::copy_ 2.93% 59.190us 83.27% 1.679ms 279.905us 12.896us 52.75% 15.168us 2.528us 6
- aten::conv1d 0.28% 5.620us 5.81% 117.132us 39.044us 0.000us 0.00% 11.552us 3.851us 3
- aten::convolution 0.49% 9.910us 5.53% 111.512us 37.171us 0.000us 0.00% 11.552us 3.851us 3
- aten::_convolution 1.15% 23.280us 5.04% 101.602us 33.867us 0.000us 0.00% 11.552us 3.851us 3
- aten::_conv_depthwise2d 1.09% 21.990us 3.08% 62.201us 20.734us 11.552us 47.25% 11.552us 3.851us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 47.25% 11.552us 3.851us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 27.09% 6.624us 2.208us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 25.65% 6.272us 2.091us 3
- Activity Buffer Request 71.09% 1.434ms 71.09% 1.434ms 1.434ms 2.272us 9.29% 2.272us 2.272us 1
- aten::empty_strided 1.55% 31.162us 1.55% 31.162us 5.194us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 10.29% 207.543us 10.29% 207.543us 23.060us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.90% 18.220us 1.17% 23.681us 2.631us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.44% 8.971us 0.44% 8.971us 0.598us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.49% 9.951us 0.49% 9.951us 3.317us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.46% 9.230us 0.46% 9.230us 3.077us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.29% 5.780us 0.35% 7.150us 2.383us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 379.964us 1548.03% 379.964us 379.964us 1
+ torch_eager 7.69% 160.944us 99.76% 2.089ms 2.089ms 0.000us 0.00% 26.817us 26.817us 1
+ aten::to 0.33% 7.000us 83.76% 1.754ms 292.349us 0.000us 0.00% 15.265us 2.544us 6
+ aten::_to_copy 1.23% 25.779us 83.43% 1.747ms 291.183us 0.000us 0.00% 15.265us 2.544us 6
+ aten::copy_ 2.49% 52.100us 80.65% 1.689ms 281.484us 12.993us 52.94% 15.265us 2.544us 6
+ aten::conv1d 0.31% 6.410us 6.85% 143.364us 47.788us 0.000us 0.00% 11.552us 3.851us 3
+ aten::convolution 1.48% 31.021us 6.54% 136.954us 45.651us 0.000us 0.00% 11.552us 3.851us 3
+ aten::_convolution 1.13% 23.621us 5.06% 105.933us 35.311us 0.000us 0.00% 11.552us 3.851us 3
+ aten::_conv_depthwise2d 1.06% 22.209us 3.13% 65.632us 21.877us 11.552us 47.06% 11.552us 3.851us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 47.06% 11.552us 3.851us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.625us 26.99% 6.625us 2.208us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 25.94% 6.368us 2.123us 3
+ Activity Buffer Request 68.76% 1.440ms 68.76% 1.440ms 1.440ms 2.272us 9.26% 2.272us 2.272us 1
+ aten::empty_strided 1.55% 32.413us 1.55% 32.413us 5.402us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.50% 219.817us 10.50% 219.817us 24.424us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.87% 18.301us 1.15% 24.061us 2.673us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.50% 10.530us 0.50% 10.530us 0.702us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.50% 10.490us 0.50% 10.490us 3.497us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.47% 9.872us 0.47% 9.872us 3.291us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.30% 6.220us 0.37% 7.740us 2.580us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.017ms
-Self CUDA time total: 24.448us
+Self CPU time total: 2.094ms
+Self CUDA time total: 24.545us
@@ -4377,29 +4377,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 360.702us 1391.60% 360.702us 360.702us 1
- torch_eager 7.02% 142.940us 99.74% 2.030ms 2.030ms 0.000us 0.00% 28.128us 28.128us 1
- aten::to 0.30% 6.030us 85.23% 1.734ms 289.050us 0.000us 0.00% 15.136us 2.523us 6
- aten::_to_copy 1.18% 23.913us 84.93% 1.728ms 288.045us 0.000us 0.00% 15.136us 2.523us 6
- aten::copy_ 2.60% 52.858us 82.24% 1.673ms 278.911us 12.928us 49.88% 15.136us 2.523us 6
- aten::conv1d 0.29% 5.931us 6.05% 123.062us 41.021us 0.000us 0.00% 12.992us 4.331us 3
- aten::convolution 0.49% 10.049us 5.76% 117.131us 39.044us 0.000us 0.00% 12.992us 4.331us 3
- aten::_convolution 1.15% 23.381us 5.26% 107.082us 35.694us 0.000us 0.00% 12.992us 4.331us 3
- aten::_conv_depthwise2d 1.11% 22.652us 3.33% 67.801us 22.600us 12.992us 50.12% 12.992us 4.331us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 12.992us 50.12% 12.992us 4.331us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 25.43% 6.592us 2.197us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 24.44% 6.336us 2.112us 3
- Activity Buffer Request 70.88% 1.442ms 70.88% 1.442ms 1.442ms 2.208us 8.52% 2.208us 2.208us 1
- aten::empty_strided 1.52% 30.891us 1.52% 30.891us 5.148us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 10.00% 203.394us 10.00% 203.394us 22.599us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.92% 18.741us 1.20% 24.361us 2.707us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.46% 9.330us 0.46% 9.330us 0.622us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.51% 10.450us 0.51% 10.450us 3.483us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.47% 9.490us 0.47% 9.490us 3.163us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.29% 5.900us 0.36% 7.380us 2.460us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 351.133us 1341.43% 351.133us 351.133us 1
+ torch_eager 7.55% 157.812us 99.73% 2.084ms 2.084ms 0.000us 0.00% 28.416us 28.416us 1
+ aten::to 0.31% 6.571us 84.80% 1.772ms 295.318us 0.000us 0.00% 15.264us 2.544us 6
+ aten::_to_copy 1.22% 25.450us 84.49% 1.765ms 294.223us 0.000us 0.00% 15.264us 2.544us 6
+ aten::copy_ 2.31% 48.301us 81.82% 1.710ms 284.947us 13.024us 49.76% 15.264us 2.544us 6
+ aten::conv1d 0.32% 6.640us 5.96% 124.543us 41.514us 0.000us 0.00% 13.152us 4.384us 3
+ aten::convolution 0.50% 10.360us 5.64% 117.903us 39.301us 0.000us 0.00% 13.152us 4.384us 3
+ aten::_convolution 1.16% 24.330us 5.15% 107.543us 35.848us 0.000us 0.00% 13.152us 4.384us 3
+ aten::_conv_depthwise2d 1.06% 22.241us 3.14% 65.623us 21.874us 13.152us 50.24% 13.152us 4.384us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.152us 50.24% 13.152us 4.384us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 25.43% 6.656us 2.219us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 24.33% 6.368us 2.123us 3
+ Activity Buffer Request 70.10% 1.465ms 70.10% 1.465ms 1.465ms 2.240us 8.56% 2.240us 2.240us 1
+ aten::empty_strided 1.45% 30.202us 1.45% 30.202us 5.034us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.51% 219.677us 10.51% 219.677us 24.409us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.90% 18.881us 1.17% 24.421us 2.713us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.46% 9.580us 0.46% 9.580us 0.639us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.55% 11.471us 0.55% 11.471us 3.824us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.43% 8.890us 0.43% 8.890us 2.963us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.33% 6.950us 0.40% 8.400us 2.800us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.035ms
-Self CUDA time total: 25.920us
+Self CPU time total: 2.089ms
+Self CUDA time total: 26.176us
@@ -4409,29 +4409,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 369.628us 962.57% 369.628us 369.628us 1
- torch_eager 7.12% 161.009us 99.76% 2.255ms 2.255ms 0.000us 0.00% 40.960us 40.960us 1
- aten::conv1d 0.32% 7.222us 5.82% 131.613us 43.871us 0.000us 0.00% 22.528us 7.509us 3
- aten::convolution 0.54% 12.229us 5.50% 124.391us 41.464us 0.000us 0.00% 22.528us 7.509us 3
- aten::_convolution 1.15% 26.031us 4.96% 112.162us 37.387us 0.000us 0.00% 22.528us 7.509us 3
- aten::_conv_depthwise2d 1.09% 24.630us 3.00% 67.820us 22.607us 22.528us 58.67% 22.528us 7.509us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.528us 58.67% 22.528us 7.509us 3
- aten::to 0.34% 7.671us 85.42% 1.931ms 321.787us 0.000us 0.00% 18.432us 3.072us 6
- aten::_to_copy 1.41% 31.890us 85.08% 1.923ms 320.509us 0.000us 0.00% 18.432us 3.072us 6
- aten::copy_ 2.64% 59.711us 82.13% 1.856ms 309.384us 15.872us 41.33% 18.432us 3.072us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.544us 22.25% 8.544us 2.848us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 19.08% 7.328us 2.443us 3
- Activity Buffer Request 64.20% 1.451ms 64.20% 1.451ms 1.451ms 2.560us 6.67% 2.560us 2.560us 1
- aten::empty_strided 1.54% 34.861us 1.54% 34.861us 5.810us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 16.32% 368.786us 16.32% 368.786us 40.976us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.93% 20.991us 1.15% 26.100us 2.900us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.41% 9.319us 0.41% 9.319us 0.621us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.44% 9.850us 0.44% 9.850us 3.283us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.44% 9.970us 0.44% 9.970us 3.323us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.31% 7.041us 0.38% 8.701us 2.900us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 349.627us 908.24% 349.627us 349.627us 1
+ torch_eager 7.45% 152.992us 99.76% 2.049ms 2.049ms 0.000us 0.00% 41.086us 41.086us 1
+ aten::conv1d 0.32% 6.640us 6.06% 124.413us 41.471us 0.000us 0.00% 22.561us 7.520us 3
+ aten::convolution 0.50% 10.370us 5.73% 117.773us 39.258us 0.000us 0.00% 22.561us 7.520us 3
+ aten::_convolution 1.14% 23.411us 5.23% 107.403us 35.801us 0.000us 0.00% 22.561us 7.520us 3
+ aten::_conv_depthwise2d 1.15% 23.650us 3.29% 67.532us 22.511us 22.561us 58.61% 22.561us 7.520us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.561us 58.61% 22.561us 7.520us 3
+ aten::to 0.33% 6.780us 84.82% 1.743ms 290.446us 0.000us 0.00% 18.525us 3.087us 6
+ aten::_to_copy 1.29% 26.502us 84.49% 1.736ms 289.316us 0.000us 0.00% 18.525us 3.087us 6
+ aten::copy_ 2.40% 49.251us 81.74% 1.679ms 279.869us 15.934us 41.39% 18.525us 3.087us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.543us 22.19% 8.543us 2.848us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.391us 19.20% 7.391us 2.464us 3
+ Activity Buffer Request 69.84% 1.435ms 69.84% 1.435ms 1.435ms 2.591us 6.73% 2.591us 2.591us 1
+ aten::empty_strided 1.47% 30.182us 1.47% 30.182us 5.030us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.64% 218.664us 10.64% 218.664us 24.296us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.89% 18.281us 1.17% 24.011us 2.668us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.47% 9.739us 0.47% 9.739us 0.649us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.53% 10.991us 0.53% 10.991us 3.664us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.46% 9.421us 0.46% 9.421us 3.140us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.29% 5.970us 0.36% 7.320us 2.440us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.260ms
-Self CUDA time total: 38.400us
+Self CPU time total: 2.054ms
+Self CUDA time total: 38.495us
@@ -4441,29 +4441,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 343.007us 838.09% 343.007us 343.007us 1
- torch_eager 6.47% 141.163us 99.73% 2.175ms 2.175ms 0.000us 0.00% 43.487us 43.487us 1
- aten::conv1d 0.27% 5.870us 5.52% 120.313us 40.104us 0.000us 0.00% 25.376us 8.459us 3
- aten::convolution 0.46% 10.120us 5.25% 114.443us 38.148us 0.000us 0.00% 25.376us 8.459us 3
- aten::_convolution 1.12% 24.490us 4.78% 104.323us 34.774us 0.000us 0.00% 25.376us 8.459us 3
- aten::_conv_depthwise2d 1.00% 21.702us 2.89% 62.963us 20.988us 25.376us 62.00% 25.376us 8.459us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.376us 62.00% 25.376us 8.459us 3
- aten::to 0.28% 6.129us 86.46% 1.885ms 314.232us 0.000us 0.00% 18.111us 3.018us 6
- aten::_to_copy 1.13% 24.640us 86.18% 1.879ms 313.211us 0.000us 0.00% 18.111us 3.018us 6
- aten::copy_ 2.51% 54.672us 83.58% 1.823ms 303.754us 15.551us 38.00% 18.111us 3.018us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.224us 20.09% 8.224us 2.741us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.327us 17.90% 7.327us 2.442us 3
- Activity Buffer Request 66.59% 1.452ms 66.59% 1.452ms 1.452ms 2.560us 6.26% 2.560us 2.560us 1
- aten::empty_strided 1.47% 32.100us 1.47% 32.100us 5.350us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 15.50% 338.007us 15.50% 338.007us 37.556us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.84% 18.320us 1.10% 24.070us 2.674us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.43% 9.420us 0.43% 9.420us 0.628us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.46% 10.080us 0.46% 10.080us 3.360us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.42% 9.080us 0.42% 9.080us 3.027us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.27% 5.960us 0.34% 7.390us 2.463us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 345.054us 837.81% 345.054us 345.054us 1
+ torch_eager 7.39% 151.695us 99.75% 2.049ms 2.049ms 0.000us 0.00% 43.810us 43.810us 1
+ aten::conv1d 0.32% 6.620us 6.03% 123.883us 41.294us 0.000us 0.00% 25.375us 8.458us 3
+ aten::convolution 0.50% 10.320us 5.71% 117.263us 39.088us 0.000us 0.00% 25.375us 8.458us 3
+ aten::_convolution 1.20% 24.592us 5.21% 106.943us 35.648us 0.000us 0.00% 25.375us 8.458us 3
+ aten::_conv_depthwise2d 1.13% 23.150us 3.19% 65.451us 21.817us 25.375us 61.61% 25.375us 8.458us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.375us 61.61% 25.375us 8.458us 3
+ aten::to 0.31% 6.440us 84.93% 1.744ms 290.716us 0.000us 0.00% 18.435us 3.072us 6
+ aten::_to_copy 1.24% 25.501us 84.61% 1.738ms 289.642us 0.000us 0.00% 18.435us 3.072us 6
+ aten::copy_ 2.41% 49.431us 81.91% 1.682ms 280.380us 15.810us 38.39% 18.435us 3.072us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.386us 20.36% 8.386us 2.795us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.424us 18.03% 7.424us 2.475us 3
+ Activity Buffer Request 70.32% 1.444ms 70.32% 1.444ms 1.444ms 2.625us 6.37% 2.625us 2.625us 1
+ aten::empty_strided 1.46% 30.070us 1.46% 30.070us 5.012us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.28% 211.144us 10.28% 211.144us 23.460us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.92% 18.949us 1.19% 24.411us 2.712us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 9.313us 0.45% 9.313us 0.621us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.52% 10.601us 0.52% 10.601us 3.534us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.44% 9.110us 0.44% 9.110us 3.037us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.29% 5.930us 0.36% 7.410us 2.470us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.181ms
-Self CUDA time total: 40.927us
+Self CPU time total: 2.054ms
+Self CUDA time total: 41.185us
@@ -4473,29 +4473,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 367.004us 357.73% 367.004us 367.004us 1
- torch_eager 6.17% 126.763us 99.73% 2.049ms 2.049ms 0.000us 0.00% 108.512us 108.512us 1
- aten::conv1d 0.28% 5.761us 5.81% 119.372us 39.791us 0.000us 0.00% 70.432us 23.477us 3
- aten::convolution 0.48% 9.820us 5.53% 113.611us 37.870us 0.000us 0.00% 70.432us 23.477us 3
- aten::_convolution 1.11% 22.788us 5.05% 103.791us 34.597us 0.000us 0.00% 70.432us 23.477us 3
- aten::_conv_depthwise2d 1.12% 22.910us 3.14% 64.601us 21.534us 70.432us 68.65% 70.432us 23.477us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.432us 68.65% 70.432us 23.477us 3
- aten::to 0.30% 6.130us 86.37% 1.774ms 295.680us 0.000us 0.00% 38.080us 6.347us 6
- aten::_to_copy 2.18% 44.819us 86.07% 1.768ms 294.658us 0.000us 0.00% 38.080us 6.347us 6
- aten::copy_ 2.56% 52.622us 82.32% 1.691ms 281.815us 32.160us 31.35% 38.080us 6.347us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.504us 17.06% 17.504us 5.835us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.656us 14.29% 14.656us 4.885us 3
- Activity Buffer Request 69.77% 1.433ms 69.77% 1.433ms 1.433ms 5.920us 5.77% 5.920us 5.920us 1
- aten::empty_strided 1.57% 32.241us 1.57% 32.241us 5.373us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 11.08% 227.645us 11.08% 227.645us 25.294us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.87% 17.849us 1.12% 23.070us 2.563us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.44% 9.030us 0.44% 9.030us 0.602us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.49% 10.050us 0.49% 10.050us 3.350us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.44% 9.040us 0.44% 9.040us 3.013us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.30% 6.163us 0.38% 7.782us 2.594us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 348.348us 338.39% 348.348us 348.348us 1
+ torch_eager 7.21% 148.863us 99.73% 2.059ms 2.059ms 0.000us 0.00% 108.926us 108.926us 1
+ aten::conv1d 0.31% 6.430us 5.95% 122.893us 40.964us 0.000us 0.00% 70.592us 23.531us 3
+ aten::convolution 0.50% 10.290us 5.64% 116.463us 38.821us 0.000us 0.00% 70.592us 23.531us 3
+ aten::_convolution 1.17% 24.211us 5.14% 106.173us 35.391us 0.000us 0.00% 70.592us 23.531us 3
+ aten::_conv_depthwise2d 1.12% 23.052us 3.16% 65.282us 21.761us 70.592us 68.57% 70.592us 23.531us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.592us 68.57% 70.592us 23.531us 3
+ aten::to 0.31% 6.372us 85.15% 1.758ms 292.949us 0.000us 0.00% 38.334us 6.389us 6
+ aten::_to_copy 1.20% 24.680us 84.84% 1.751ms 291.887us 0.000us 0.00% 38.334us 6.389us 6
+ aten::copy_ 2.47% 51.072us 82.20% 1.697ms 282.787us 32.350us 31.43% 38.334us 6.389us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.695us 17.19% 17.695us 5.898us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.655us 14.24% 14.655us 4.885us 3
+ Activity Buffer Request 70.59% 1.457ms 70.59% 1.457ms 1.457ms 5.984us 5.81% 5.984us 5.984us 1
+ aten::empty_strided 1.45% 29.921us 1.45% 29.921us 4.987us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.23% 211.264us 10.23% 211.264us 23.474us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.89% 18.462us 1.17% 24.111us 2.679us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.47% 9.709us 0.47% 9.709us 0.647us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.47% 9.780us 0.47% 9.780us 3.260us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.47% 9.740us 0.47% 9.740us 3.247us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.28% 5.880us 0.35% 7.260us 2.420us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.054ms
-Self CUDA time total: 102.592us
+Self CPU time total: 2.064ms
+Self CUDA time total: 102.942us
@@ -4505,29 +4505,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.959us 299.49% 336.959us 336.959us 1
- torch_eager 6.25% 125.522us 99.75% 2.004ms 2.004ms 0.000us 0.00% 118.493us 118.493us 1
- aten::conv1d 0.38% 7.700us 5.98% 120.223us 40.074us 0.000us 0.00% 80.479us 26.826us 3
- aten::convolution 0.49% 9.780us 5.60% 112.523us 37.508us 0.000us 0.00% 80.479us 26.826us 3
- aten::_convolution 1.13% 22.669us 5.11% 102.743us 34.248us 0.000us 0.00% 80.479us 26.826us 3
- aten::_conv_depthwise2d 1.12% 22.452us 3.19% 64.073us 21.358us 80.479us 71.53% 80.479us 26.826us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.479us 71.53% 80.479us 26.826us 3
- aten::to 0.29% 5.910us 86.14% 1.731ms 288.442us 0.000us 0.00% 38.014us 6.336us 6
- aten::_to_copy 1.19% 24.001us 85.85% 1.725ms 287.457us 0.000us 0.00% 38.014us 6.336us 6
- aten::copy_ 2.56% 51.481us 83.17% 1.671ms 278.473us 32.031us 28.47% 38.014us 6.336us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.567us 15.61% 17.567us 5.856us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.464us 12.86% 14.464us 4.821us 3
- Activity Buffer Request 71.72% 1.441ms 71.72% 1.441ms 1.441ms 5.983us 5.32% 5.983us 5.983us 1
- aten::empty_strided 1.49% 29.901us 1.49% 29.901us 4.983us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 10.00% 200.814us 10.00% 200.814us 22.313us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.89% 17.861us 1.15% 23.111us 2.568us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.45% 8.970us 0.45% 8.970us 0.598us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.50% 10.050us 0.50% 10.050us 3.350us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.46% 9.169us 0.46% 9.169us 3.056us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.30% 6.030us 0.38% 7.560us 2.520us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 344.181us 304.53% 344.181us 344.181us 1
+ torch_eager 14.98% 124.863us 99.35% 828.302us 828.302us 0.000us 0.00% 119.036us 119.036us 1
+ aten::conv1d 0.70% 5.870us 14.55% 121.343us 40.448us 0.000us 0.00% 80.669us 26.890us 3
+ aten::convolution 1.17% 9.720us 13.85% 115.473us 38.491us 0.000us 0.00% 80.669us 26.890us 3
+ aten::_convolution 2.96% 24.691us 12.68% 105.753us 35.251us 0.000us 0.00% 80.669us 26.890us 3
+ aten::_conv_depthwise2d 2.65% 22.121us 7.65% 63.762us 21.254us 80.669us 71.38% 80.669us 26.890us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.669us 71.38% 80.669us 26.890us 3
+ aten::to 0.77% 6.429us 66.53% 554.705us 92.451us 0.000us 0.00% 38.367us 6.394us 6
+ aten::_to_copy 3.01% 25.101us 65.76% 548.276us 91.379us 0.000us 0.00% 38.367us 6.394us 6
+ aten::copy_ 6.16% 51.352us 59.05% 492.343us 82.057us 32.351us 28.62% 38.367us 6.394us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.696us 15.66% 17.696us 5.899us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.655us 12.97% 14.655us 4.885us 3
+ Activity Buffer Request 28.81% 240.197us 28.81% 240.197us 240.197us 6.016us 5.32% 6.016us 6.016us 1
+ aten::empty_strided 3.70% 30.832us 3.70% 30.832us 5.139us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 26.65% 222.174us 26.65% 222.174us 24.686us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.09% 17.401us 2.70% 22.541us 2.505us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.05% 8.790us 1.05% 8.790us 0.586us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.34% 11.151us 1.34% 11.151us 3.717us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.09% 9.110us 1.09% 9.110us 3.037us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.89% 7.450us 1.05% 8.790us 2.930us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.009ms
-Self CUDA time total: 112.510us
+Self CPU time total: 833.752us
+Self CUDA time total: 113.020us
@@ -4537,29 +4537,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 5.98% 122.945us 97.76% 2.011ms 2.011ms 0.000us 0.00% 433.437us 433.437us 1
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 423.709us 107.83% 423.709us 423.709us 1
- aten::conv1d 0.28% 5.760us 5.73% 117.851us 39.284us 0.000us 0.00% 250.941us 83.647us 3
- aten::convolution 0.48% 9.830us 5.45% 112.091us 37.364us 0.000us 0.00% 250.941us 83.647us 3
- aten::_convolution 1.12% 23.111us 4.97% 102.261us 34.087us 0.000us 0.00% 250.941us 83.647us 3
- aten::_conv_depthwise2d 1.03% 21.200us 3.03% 62.360us 20.787us 250.941us 63.86% 250.941us 83.647us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 250.941us 63.86% 250.941us 83.647us 3
- aten::to 0.28% 5.851us 84.70% 1.742ms 290.313us 0.000us 0.00% 182.496us 30.416us 6
- aten::_to_copy 1.16% 23.919us 84.41% 1.736ms 289.338us 0.000us 0.00% 182.496us 30.416us 6
- aten::copy_ 2.53% 51.981us 81.78% 1.682ms 280.333us 142.016us 36.14% 182.496us 30.416us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 101.952us 25.94% 101.952us 33.984us 3
- Activity Buffer Request 70.64% 1.453ms 70.64% 1.453ms 1.453ms 40.480us 10.30% 40.480us 40.480us 1
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.064us 10.20% 40.064us 13.355us 3
- aten::empty_strided 1.46% 30.112us 1.46% 30.112us 5.019us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.67% 198.853us 9.67% 198.853us 22.095us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.91% 18.669us 1.18% 24.270us 2.697us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.44% 9.151us 0.44% 9.151us 0.610us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.48% 9.870us 0.48% 9.870us 3.290us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.47% 9.710us 0.47% 9.710us 3.237us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.29% 5.960us 0.36% 7.350us 2.450us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 14.21% 122.455us 95.83% 825.681us 825.681us 0.000us 0.00% 433.339us 433.339us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 419.771us 106.59% 419.771us 419.771us 1
+ aten::conv1d 0.75% 6.429us 14.10% 121.522us 40.507us 0.000us 0.00% 251.453us 83.818us 3
+ aten::convolution 1.15% 9.929us 13.36% 115.093us 38.364us 0.000us 0.00% 251.453us 83.818us 3
+ aten::_convolution 2.67% 23.042us 12.21% 105.164us 35.055us 0.000us 0.00% 251.453us 83.818us 3
+ aten::_conv_depthwise2d 2.60% 22.440us 7.52% 64.810us 21.603us 251.453us 63.85% 251.453us 83.818us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 251.453us 63.85% 251.453us 83.818us 3
+ aten::to 0.70% 6.001us 64.14% 552.672us 92.112us 0.000us 0.00% 181.886us 30.314us 6
+ aten::_to_copy 2.73% 23.540us 63.45% 546.671us 91.112us 0.000us 0.00% 181.886us 30.314us 6
+ aten::copy_ 5.94% 51.140us 57.36% 494.211us 82.368us 142.367us 36.15% 181.886us 30.314us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 102.367us 25.99% 102.367us 34.122us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.000us 10.16% 40.000us 13.333us 3
+ Activity Buffer Request 29.04% 250.247us 29.04% 250.247us 250.247us 39.519us 10.03% 39.519us 39.519us 1
+ aten::empty_strided 3.36% 28.920us 3.36% 28.920us 4.820us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 24.89% 214.494us 24.89% 214.494us 23.833us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.98% 17.062us 2.59% 22.273us 2.475us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.09% 9.391us 1.09% 9.391us 0.626us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.24% 10.660us 1.24% 10.660us 3.553us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.17% 10.040us 1.17% 10.040us 3.347us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.86% 7.370us 1.02% 8.800us 2.933us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.057ms
-Self CUDA time total: 392.957us
+Self CPU time total: 861.602us
+Self CUDA time total: 393.820us
@@ -4569,29 +4569,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 5.86% 122.119us 95.18% 1.984ms 1.984ms 0.000us 0.00% 485.373us 485.373us 1
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 475.549us 106.61% 475.549us 475.549us 1
- aten::conv1d 0.29% 6.020us 5.58% 116.291us 38.764us 0.000us 0.00% 298.429us 99.476us 3
- aten::convolution 0.46% 9.580us 5.29% 110.271us 36.757us 0.000us 0.00% 298.429us 99.476us 3
- aten::_convolution 1.07% 22.391us 4.83% 100.691us 33.564us 0.000us 0.00% 298.429us 99.476us 3
- aten::_conv_depthwise2d 1.02% 21.160us 3.01% 62.730us 20.910us 298.429us 66.91% 298.429us 99.476us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 298.429us 66.91% 298.429us 99.476us 3
- aten::to 0.28% 5.929us 82.40% 1.718ms 286.300us 0.000us 0.00% 186.944us 31.157us 6
- aten::_to_copy 1.13% 23.472us 82.12% 1.712ms 285.312us 0.000us 0.00% 186.944us 31.157us 6
- aten::copy_ 2.45% 51.061us 79.57% 1.659ms 276.443us 147.616us 33.09% 186.944us 31.157us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 107.712us 24.15% 107.712us 35.904us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.904us 8.95% 39.904us 13.301us 3
- Activity Buffer Request 68.65% 1.431ms 68.65% 1.431ms 1.431ms 39.328us 8.82% 39.328us 39.328us 1
- aten::empty_strided 1.43% 29.742us 1.43% 29.742us 4.957us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.54% 198.903us 9.54% 198.903us 22.100us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.85% 17.731us 1.11% 23.210us 2.579us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.44% 9.210us 0.44% 9.210us 0.614us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.47% 9.850us 0.47% 9.850us 3.283us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.45% 9.320us 0.45% 9.320us 3.107us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.28% 5.850us 0.35% 7.270us 2.423us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 15.32% 134.312us 91.67% 803.971us 803.971us 0.000us 0.00% 487.924us 487.924us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 476.501us 106.34% 476.501us 476.501us 1
+ aten::conv1d 0.67% 5.860us 13.82% 121.173us 40.391us 0.000us 0.00% 299.161us 99.720us 3
+ aten::convolution 1.17% 10.220us 13.15% 115.313us 38.438us 0.000us 0.00% 299.161us 99.720us 3
+ aten::_convolution 2.67% 23.450us 11.98% 105.093us 35.031us 0.000us 0.00% 299.161us 99.720us 3
+ aten::_conv_depthwise2d 2.56% 22.451us 7.48% 65.623us 21.874us 299.161us 66.76% 299.161us 99.720us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 299.161us 66.76% 299.161us 99.720us 3
+ aten::to 0.69% 6.051us 59.17% 518.906us 86.484us 0.000us 0.00% 188.763us 31.460us 6
+ aten::_to_copy 2.71% 23.771us 58.48% 512.855us 85.476us 0.000us 0.00% 188.763us 31.460us 6
+ aten::copy_ 5.69% 49.880us 52.31% 458.742us 76.457us 148.924us 33.24% 188.763us 31.460us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 108.861us 24.29% 108.861us 36.287us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.063us 8.94% 40.063us 13.354us 3
+ Activity Buffer Request 25.01% 219.366us 25.01% 219.366us 219.366us 39.839us 8.89% 39.839us 39.839us 1
+ aten::empty_strided 3.46% 30.342us 3.46% 30.342us 5.057us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 24.34% 213.439us 24.34% 213.439us 23.715us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.98% 17.400us 2.59% 22.720us 2.524us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.09% 9.540us 1.09% 9.540us 0.636us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.14% 10.010us 1.14% 10.010us 3.337us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.05% 9.219us 1.05% 9.219us 3.073us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.66% 5.750us 0.82% 7.210us 2.403us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 2.085ms
-Self CUDA time total: 446.045us
+Self CPU time total: 876.983us
+Self CUDA time total: 448.085us
@@ -4601,29 +4601,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.833us 1729.88% 323.833us 323.833us 1
- torch_eager 14.51% 116.191us 99.37% 795.884us 795.884us 0.000us 0.00% 20.608us 20.608us 1
- aten::to 0.75% 6.009us 67.15% 537.870us 89.645us 0.000us 0.00% 13.376us 2.229us 6
- aten::_to_copy 2.93% 23.471us 66.40% 531.861us 88.644us 0.000us 0.00% 13.376us 2.229us 6
- aten::copy_ 6.32% 50.599us 59.65% 477.769us 79.628us 11.488us 61.37% 13.376us 2.229us 6
- aten::conv1d 0.81% 6.510us 14.38% 115.173us 38.391us 0.000us 0.00% 7.232us 2.411us 3
- aten::convolution 1.28% 10.221us 13.57% 108.663us 36.221us 0.000us 0.00% 7.232us 2.411us 3
- aten::_convolution 2.73% 21.890us 12.29% 98.442us 32.814us 0.000us 0.00% 7.232us 2.411us 3
- aten::_conv_depthwise2d 2.76% 22.080us 7.70% 61.700us 20.567us 7.232us 38.63% 7.232us 2.411us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.232us 38.63% 7.232us 2.411us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 31.45% 5.888us 1.963us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.600us 29.91% 5.600us 1.867us 3
- Activity Buffer Request 31.20% 249.924us 31.20% 249.924us 249.924us 1.888us 10.09% 1.888us 1.888us 1
- aten::empty_strided 3.82% 30.621us 3.82% 30.621us 5.103us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 24.75% 198.236us 24.75% 198.236us 22.026us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.09% 16.762us 2.71% 21.692us 2.410us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.04% 8.330us 1.04% 8.330us 0.555us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.15% 9.220us 1.15% 9.220us 3.073us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.17% 9.410us 1.17% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.72% 5.800us 0.89% 7.160us 2.387us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.392us 1804.85% 338.392us 338.392us 1
+ torch_eager 18.33% 161.236us 99.35% 873.703us 873.703us 0.000us 0.00% 20.637us 20.637us 1
+ aten::to 0.69% 6.070us 63.71% 560.224us 93.371us 0.000us 0.00% 13.406us 2.234us 6
+ aten::_to_copy 2.78% 24.471us 63.02% 554.154us 92.359us 0.000us 0.00% 13.406us 2.234us 6
+ aten::copy_ 5.94% 52.212us 56.85% 499.953us 83.325us 11.518us 61.43% 13.406us 2.234us 6
+ aten::conv1d 0.64% 5.659us 14.02% 123.282us 41.094us 0.000us 0.00% 7.231us 2.410us 3
+ aten::convolution 1.14% 9.999us 13.38% 117.623us 39.208us 0.000us 0.00% 7.231us 2.410us 3
+ aten::_convolution 2.72% 23.952us 12.24% 107.624us 35.875us 0.000us 0.00% 7.231us 2.410us 3
+ aten::_conv_depthwise2d 2.67% 23.519us 7.63% 67.130us 22.377us 7.231us 38.57% 7.231us 2.410us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.231us 38.57% 7.231us 2.410us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.854us 31.22% 5.854us 1.951us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 30.21% 5.664us 1.888us 3
+ Activity Buffer Request 29.52% 259.596us 29.52% 259.596us 259.596us 1.888us 10.07% 1.888us 1.888us 1
+ aten::empty_strided 3.38% 29.730us 3.38% 29.730us 4.955us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 23.99% 210.946us 23.99% 210.946us 23.438us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.07% 18.190us 2.71% 23.871us 2.652us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.11% 9.761us 1.11% 9.761us 0.651us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.24% 10.890us 1.24% 10.890us 3.630us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.13% 9.920us 1.13% 9.920us 3.307us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.68% 5.972us 0.85% 7.452us 2.484us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 800.944us
-Self CUDA time total: 18.720us
+Self CPU time total: 879.393us
+Self CUDA time total: 18.749us
@@ -4633,29 +4633,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 324.666us 1676.91% 324.666us 324.666us 1
- torch_eager 15.17% 119.302us 99.37% 781.483us 781.483us 0.000us 0.00% 21.249us 21.249us 1
- aten::to 0.72% 5.648us 65.85% 517.928us 86.321us 0.000us 0.00% 13.345us 2.224us 6
- aten::_to_copy 2.87% 22.611us 65.14% 512.280us 85.380us 0.000us 0.00% 13.345us 2.224us 6
- aten::copy_ 6.22% 48.900us 58.49% 460.037us 76.673us 11.457us 59.18% 13.345us 2.224us 6
- aten::conv1d 0.87% 6.869us 14.99% 117.911us 39.304us 0.000us 0.00% 7.904us 2.635us 3
- aten::convolution 1.27% 10.002us 14.12% 111.042us 37.014us 0.000us 0.00% 7.904us 2.635us 3
- aten::_convolution 2.89% 22.710us 12.85% 101.040us 33.680us 0.000us 0.00% 7.904us 2.635us 3
- aten::_conv_depthwise2d 2.75% 21.590us 8.00% 62.920us 20.973us 7.904us 40.82% 7.904us 2.635us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 40.82% 7.904us 2.635us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.825us 30.09% 5.825us 1.942us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.632us 29.09% 5.632us 1.877us 3
- Activity Buffer Request 30.25% 237.875us 30.25% 237.875us 237.875us 1.888us 9.75% 1.888us 1.888us 1
- aten::empty_strided 3.77% 29.632us 3.77% 29.632us 4.939us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 24.87% 195.612us 24.87% 195.612us 21.735us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.14% 16.821us 2.78% 21.881us 2.431us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.08% 8.481us 1.08% 8.481us 0.565us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.22% 9.600us 1.22% 9.600us 3.200us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.19% 9.380us 1.19% 9.380us 3.127us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.75% 5.869us 0.93% 7.280us 2.427us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.934us 1741.87% 338.934us 338.934us 1
+ torch_eager 16.71% 145.362us 99.29% 863.592us 863.592us 0.000us 0.00% 21.314us 21.314us 1
+ aten::to 0.71% 6.200us 65.36% 568.524us 94.754us 0.000us 0.00% 13.282us 2.214us 6
+ aten::_to_copy 2.85% 24.831us 64.65% 562.324us 93.721us 0.000us 0.00% 13.282us 2.214us 6
+ aten::copy_ 5.81% 50.550us 58.39% 507.883us 84.647us 11.426us 58.72% 13.282us 2.214us 6
+ aten::conv1d 0.78% 6.753us 14.06% 122.315us 40.772us 0.000us 0.00% 8.032us 2.677us 3
+ aten::convolution 1.19% 10.380us 13.29% 115.562us 38.521us 0.000us 0.00% 8.032us 2.677us 3
+ aten::_convolution 2.63% 22.841us 12.09% 105.182us 35.061us 0.000us 0.00% 8.032us 2.677us 3
+ aten::_conv_depthwise2d 2.65% 23.042us 7.65% 66.512us 22.171us 8.032us 41.28% 8.032us 2.677us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.032us 41.28% 8.032us 2.677us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.825us 29.94% 5.825us 1.942us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.601us 28.79% 5.601us 1.867us 3
+ Activity Buffer Request 30.62% 266.307us 30.62% 266.307us 266.307us 1.856us 9.54% 1.856us 1.856us 1
+ aten::empty_strided 3.40% 29.610us 3.40% 29.610us 4.935us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 24.61% 214.076us 24.61% 214.076us 23.786us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.02% 17.612us 2.63% 22.841us 2.538us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.02% 8.840us 1.02% 8.840us 0.589us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.22% 10.630us 1.22% 10.630us 3.543us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.13% 9.790us 1.13% 9.790us 3.263us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.67% 5.798us 0.82% 7.109us 2.370us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 786.473us
-Self CUDA time total: 19.361us
+Self CPU time total: 869.783us
+Self CUDA time total: 19.458us
@@ -4665,29 +4665,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.865us 1704.41% 328.865us 328.865us 1
- torch_eager 14.92% 117.622us 99.37% 783.184us 783.184us 0.000us 0.00% 21.439us 21.439us 1
- aten::to 0.74% 5.810us 66.49% 524.079us 87.347us 0.000us 0.00% 14.207us 2.368us 6
- aten::_to_copy 3.01% 23.701us 65.75% 518.269us 86.378us 0.000us 0.00% 14.207us 2.368us 6
- aten::copy_ 6.49% 51.190us 58.71% 462.718us 77.120us 12.063us 62.52% 14.207us 2.368us 6
- aten::conv1d 0.75% 5.890us 14.60% 115.093us 38.364us 0.000us 0.00% 7.232us 2.411us 3
- aten::convolution 1.22% 9.630us 13.86% 109.203us 36.401us 0.000us 0.00% 7.232us 2.411us 3
- aten::_convolution 2.83% 22.270us 12.63% 99.573us 33.191us 0.000us 0.00% 7.232us 2.411us 3
- aten::_conv_depthwise2d 2.80% 22.070us 7.82% 61.673us 20.558us 7.232us 37.48% 7.232us 2.411us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.232us 37.48% 7.232us 2.411us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.240us 32.34% 6.240us 2.080us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.823us 30.18% 5.823us 1.941us 3
- Activity Buffer Request 29.70% 234.095us 29.70% 234.095us 234.095us 2.144us 11.11% 2.144us 2.144us 1
- aten::empty_strided 4.04% 31.850us 4.04% 31.850us 5.308us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 25.25% 199.015us 25.25% 199.015us 22.113us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.15% 16.950us 2.78% 21.920us 2.436us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.05% 8.280us 1.05% 8.280us 0.552us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.22% 9.600us 1.22% 9.600us 3.200us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.07% 8.421us 1.07% 8.421us 2.807us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.76% 5.960us 0.92% 7.270us 2.423us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.862us 1751.78% 340.862us 340.862us 1
+ torch_eager 8.44% 173.073us 99.74% 2.045ms 2.045ms 0.000us 0.00% 21.635us 21.635us 1
+ aten::to 0.33% 6.670us 84.06% 1.723ms 287.196us 0.000us 0.00% 14.307us 2.385us 6
+ aten::_to_copy 1.21% 24.883us 83.74% 1.717ms 286.084us 0.000us 0.00% 14.307us 2.385us 6
+ aten::copy_ 2.36% 48.471us 81.06% 1.662ms 276.949us 12.130us 62.34% 14.307us 2.385us 6
+ aten::conv1d 0.29% 5.970us 5.84% 119.613us 39.871us 0.000us 0.00% 7.328us 2.443us 3
+ aten::convolution 0.48% 9.780us 5.54% 113.643us 37.881us 0.000us 0.00% 7.328us 2.443us 3
+ aten::_convolution 1.14% 23.420us 5.07% 103.863us 34.621us 0.000us 0.00% 7.328us 2.443us 3
+ aten::_conv_depthwise2d 1.10% 22.512us 3.15% 64.503us 21.501us 7.328us 37.66% 7.328us 2.443us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 37.66% 7.328us 2.443us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.241us 32.07% 6.241us 2.080us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.889us 30.27% 5.889us 1.963us 3
+ Activity Buffer Request 69.34% 1.421ms 69.34% 1.421ms 1.421ms 2.177us 11.19% 2.177us 2.177us 1
+ aten::empty_strided 1.46% 29.930us 1.46% 29.930us 4.988us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.50% 215.256us 10.50% 215.256us 23.917us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.86% 17.669us 1.13% 23.180us 2.576us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.47% 9.581us 0.47% 9.581us 0.639us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.48% 9.759us 0.48% 9.759us 3.253us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.43% 8.742us 0.43% 8.742us 2.914us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.28% 5.760us 0.35% 7.110us 2.370us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 788.184us
-Self CUDA time total: 19.295us
+Self CPU time total: 2.050ms
+Self CUDA time total: 19.458us
@@ -4697,29 +4697,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.745us 1665.90% 334.745us 334.745us 1
- torch_eager 14.26% 118.712us 99.40% 827.395us 827.395us 0.000us 0.00% 22.270us 22.270us 1
- aten::to 0.70% 5.840us 67.41% 561.119us 93.520us 0.000us 0.00% 14.335us 2.389us 6
- aten::_to_copy 2.86% 23.780us 66.71% 555.279us 92.546us 0.000us 0.00% 14.335us 2.389us 6
- aten::copy_ 6.22% 51.741us 60.26% 501.588us 83.598us 12.159us 60.51% 14.335us 2.389us 6
- aten::conv1d 0.81% 6.751us 14.52% 120.873us 40.291us 0.000us 0.00% 7.935us 2.645us 3
- aten::convolution 1.20% 9.989us 13.71% 114.122us 38.041us 0.000us 0.00% 7.935us 2.645us 3
- aten::_convolution 2.78% 23.181us 12.51% 104.133us 34.711us 0.000us 0.00% 7.935us 2.645us 3
- aten::_conv_depthwise2d 2.64% 22.000us 7.72% 64.243us 21.414us 7.935us 39.49% 7.935us 2.645us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.935us 39.49% 7.935us 2.645us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.239us 31.05% 6.239us 2.080us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 29.46% 5.920us 1.973us 3
- Activity Buffer Request 32.59% 271.245us 32.59% 271.245us 271.245us 2.176us 10.83% 2.176us 2.176us 1
- aten::empty_strided 3.59% 29.911us 3.59% 29.911us 4.985us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 24.22% 201.614us 24.22% 201.614us 22.402us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.06% 17.131us 2.68% 22.291us 2.477us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.07% 8.900us 1.07% 8.900us 0.593us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.16% 9.640us 1.16% 9.640us 3.213us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.15% 9.591us 1.15% 9.591us 3.197us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.79% 6.549us 0.97% 8.109us 2.703us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 367.067us 1820.95% 367.067us 367.067us 1
+ torch_eager 17.50% 145.595us 99.30% 826.111us 826.111us 0.000us 0.00% 22.366us 22.366us 1
+ aten::to 0.75% 6.199us 63.72% 530.082us 88.347us 0.000us 0.00% 14.431us 2.405us 6
+ aten::_to_copy 2.95% 24.573us 62.97% 523.883us 87.314us 0.000us 0.00% 14.431us 2.405us 6
+ aten::copy_ 6.31% 52.521us 56.15% 467.170us 77.862us 12.223us 60.64% 14.431us 2.405us 6
+ aten::conv1d 0.69% 5.760us 14.59% 121.354us 40.451us 0.000us 0.00% 7.935us 2.645us 3
+ aten::convolution 1.24% 10.281us 13.89% 115.594us 38.531us 0.000us 0.00% 7.935us 2.645us 3
+ aten::_convolution 2.68% 22.269us 12.66% 105.313us 35.104us 0.000us 0.00% 7.935us 2.645us 3
+ aten::_conv_depthwise2d 2.73% 22.701us 8.02% 66.711us 22.237us 7.935us 39.36% 7.935us 2.645us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.935us 39.36% 7.935us 2.645us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 31.27% 6.304us 2.101us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 29.36% 5.919us 1.973us 3
+ Activity Buffer Request 27.00% 224.665us 27.00% 224.665us 224.665us 2.208us 10.95% 2.208us 2.208us 1
+ aten::empty_strided 3.86% 32.140us 3.86% 32.140us 5.357us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.71% 213.894us 25.71% 213.894us 23.766us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.05% 17.041us 2.71% 22.553us 2.506us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.14% 9.503us 1.14% 9.503us 0.634us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.31% 10.920us 1.31% 10.920us 3.640us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.10% 9.180us 1.10% 9.180us 3.060us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.81% 6.740us 0.98% 8.160us 2.720us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 832.395us
-Self CUDA time total: 20.094us
+Self CPU time total: 831.951us
+Self CUDA time total: 20.158us
@@ -4729,29 +4729,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.142us 918.64% 330.142us 330.142us 1
- torch_eager 14.68% 120.212us 99.34% 813.674us 813.674us 0.000us 0.00% 38.530us 38.530us 1
- aten::conv1d 0.79% 6.500us 14.15% 115.923us 38.641us 0.000us 0.00% 20.161us 6.720us 3
- aten::convolution 1.18% 9.650us 13.36% 109.423us 36.474us 0.000us 0.00% 20.161us 6.720us 3
- aten::_convolution 2.75% 22.509us 12.18% 99.773us 33.258us 0.000us 0.00% 20.161us 6.720us 3
- aten::_conv_depthwise2d 2.55% 20.922us 7.56% 61.883us 20.628us 20.161us 56.10% 20.161us 6.720us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.161us 56.10% 20.161us 6.720us 3
- aten::to 0.72% 5.880us 67.15% 549.969us 91.661us 0.000us 0.00% 18.369us 3.061us 6
- aten::_to_copy 2.82% 23.099us 66.43% 544.089us 90.682us 0.000us 0.00% 18.369us 3.061us 6
- aten::copy_ 6.44% 52.723us 59.97% 491.160us 81.860us 15.777us 43.90% 18.369us 3.061us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 23.51% 8.448us 2.816us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.329us 20.39% 7.329us 2.443us 3
- Activity Buffer Request 32.20% 263.764us 32.20% 263.764us 263.764us 2.592us 7.21% 2.592us 2.592us 1
- aten::empty_strided 3.64% 29.830us 3.64% 29.830us 4.972us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 24.00% 196.543us 24.00% 196.543us 21.838us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.14% 17.540us 2.77% 22.711us 2.523us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.07% 8.761us 1.07% 8.761us 0.584us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.21% 9.871us 1.21% 9.871us 3.290us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.13% 9.220us 1.13% 9.220us 3.073us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.68% 5.610us 0.85% 7.000us 2.333us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 363.100us 1005.93% 363.100us 363.100us 1
+ torch_eager 14.77% 122.163us 99.35% 821.971us 821.971us 0.000us 0.00% 38.688us 38.688us 1
+ aten::conv1d 0.72% 5.951us 17.29% 143.024us 47.675us 0.000us 0.00% 20.160us 6.720us 3
+ aten::convolution 1.22% 10.110us 16.57% 137.073us 45.691us 0.000us 0.00% 20.160us 6.720us 3
+ aten::_convolution 3.04% 25.151us 15.35% 126.963us 42.321us 0.000us 0.00% 20.160us 6.720us 3
+ aten::_conv_depthwise2d 4.80% 39.711us 10.31% 85.271us 28.424us 20.160us 55.85% 20.160us 6.720us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.160us 55.85% 20.160us 6.720us 3
+ aten::to 0.75% 6.172us 63.79% 527.804us 87.967us 0.000us 0.00% 18.528us 3.088us 6
+ aten::_to_copy 2.99% 24.751us 63.05% 521.632us 86.939us 0.000us 0.00% 18.528us 3.088us 6
+ aten::copy_ 6.14% 50.790us 56.45% 467.021us 77.837us 15.936us 44.15% 18.528us 3.088us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.512us 23.58% 8.512us 2.837us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.424us 20.57% 7.424us 2.475us 3
+ Activity Buffer Request 27.93% 231.066us 27.93% 231.066us 231.066us 2.592us 7.18% 2.592us 2.592us 1
+ aten::empty_strided 3.61% 29.860us 3.61% 29.860us 4.977us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.33% 209.585us 25.33% 209.585us 23.287us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.11% 17.441us 2.75% 22.791us 2.532us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.15% 9.501us 1.15% 9.501us 0.633us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.26% 10.400us 1.26% 10.400us 3.467us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.30% 10.740us 1.30% 10.740us 3.580us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.76% 6.269us 0.93% 7.730us 2.577us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 819.054us
-Self CUDA time total: 35.938us
+Self CPU time total: 827.381us
+Self CUDA time total: 36.096us
@@ -4761,29 +4761,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 330.946us 872.79% 330.946us 330.946us 1
- torch_eager 6.07% 120.841us 99.75% 1.987ms 1.987ms 0.000us 0.00% 40.478us 40.478us 1
- aten::conv1d 0.33% 6.510us 5.92% 117.833us 39.278us 0.000us 0.00% 22.271us 7.424us 3
- aten::convolution 0.49% 9.850us 5.59% 111.323us 37.108us 0.000us 0.00% 22.271us 7.424us 3
- aten::_convolution 1.11% 22.181us 5.10% 101.473us 33.824us 0.000us 0.00% 22.271us 7.424us 3
- aten::_conv_depthwise2d 1.10% 21.811us 3.17% 63.042us 21.014us 22.271us 58.73% 22.271us 7.424us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.271us 58.73% 22.271us 7.424us 3
- aten::to 0.30% 5.981us 86.38% 1.720ms 286.727us 0.000us 0.00% 18.207us 3.034us 6
- aten::_to_copy 1.18% 23.522us 86.08% 1.714ms 285.730us 0.000us 0.00% 18.207us 3.034us 6
- aten::copy_ 2.55% 50.829us 83.41% 1.661ms 276.860us 15.647us 41.27% 18.207us 3.034us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.320us 21.94% 8.320us 2.773us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.327us 19.32% 7.327us 2.442us 3
- Activity Buffer Request 72.02% 1.434ms 72.02% 1.434ms 1.434ms 2.560us 6.75% 2.560us 2.560us 1
- aten::empty_strided 1.49% 29.700us 1.49% 29.700us 4.950us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 9.93% 197.835us 9.93% 197.835us 21.982us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.90% 17.980us 1.17% 23.390us 2.599us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.44% 8.840us 0.44% 8.840us 0.589us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.50% 9.970us 0.50% 9.970us 3.323us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.47% 9.410us 0.47% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.31% 6.110us 0.38% 7.490us 2.497us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.025us 883.88% 336.025us 336.025us 1
+ torch_eager 14.70% 120.902us 99.36% 817.351us 817.351us 0.000us 0.00% 40.610us 40.610us 1
+ aten::conv1d 0.71% 5.820us 14.44% 118.823us 39.608us 0.000us 0.00% 22.304us 7.435us 3
+ aten::convolution 1.12% 9.190us 13.74% 113.003us 37.668us 0.000us 0.00% 22.304us 7.435us 3
+ aten::_convolution 2.83% 23.270us 12.62% 103.813us 34.604us 0.000us 0.00% 22.304us 7.435us 3
+ aten::_conv_depthwise2d 2.83% 23.309us 7.79% 64.072us 21.357us 22.304us 58.67% 22.304us 7.435us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.304us 58.67% 22.304us 7.435us 3
+ aten::to 0.73% 5.990us 66.75% 549.075us 91.513us 0.000us 0.00% 18.306us 3.051us 6
+ aten::_to_copy 2.91% 23.953us 66.02% 543.085us 90.514us 0.000us 0.00% 18.306us 3.051us 6
+ aten::copy_ 6.07% 49.902us 59.57% 490.042us 81.674us 15.713us 41.33% 18.306us 3.051us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.353us 21.97% 8.353us 2.784us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 19.36% 7.360us 2.453us 3
+ Activity Buffer Request 30.85% 253.806us 30.85% 253.806us 253.806us 2.593us 6.82% 2.593us 2.593us 1
+ aten::empty_strided 3.54% 29.090us 3.54% 29.090us 4.848us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.29% 208.074us 25.29% 208.074us 23.119us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.19% 18.051us 2.84% 23.371us 2.597us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.11% 9.160us 1.11% 9.160us 0.611us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.21% 9.961us 1.21% 9.961us 3.320us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.10% 9.062us 1.10% 9.062us 3.021us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.80% 6.580us 0.96% 7.920us 2.640us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.992ms
-Self CUDA time total: 37.918us
+Self CPU time total: 822.611us
+Self CUDA time total: 38.017us
@@ -4793,29 +4793,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 385.308us 602.34% 385.308us 385.308us 1
- torch_eager 14.42% 123.450us 99.41% 851.045us 851.045us 0.000us 0.00% 68.065us 68.065us 1
- aten::conv1d 0.67% 5.711us 13.49% 115.513us 38.504us 0.000us 0.00% 41.633us 13.878us 3
- aten::convolution 1.22% 10.470us 12.83% 109.802us 36.601us 0.000us 0.00% 41.633us 13.878us 3
- aten::_convolution 2.63% 22.491us 11.60% 99.332us 33.111us 0.000us 0.00% 41.633us 13.878us 3
- aten::_conv_depthwise2d 2.49% 21.351us 7.22% 61.852us 20.617us 41.633us 65.08% 41.633us 13.878us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.633us 65.08% 41.633us 13.878us 3
- aten::to 0.71% 6.120us 68.08% 582.862us 97.144us 0.000us 0.00% 26.432us 4.405us 6
- aten::_to_copy 2.87% 24.611us 67.37% 576.742us 96.124us 0.000us 0.00% 26.432us 4.405us 6
- aten::copy_ 6.21% 53.173us 60.75% 520.070us 86.678us 22.336us 34.92% 26.432us 4.405us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.936us 18.66% 11.936us 3.979us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.400us 16.26% 10.400us 3.467us 3
- Activity Buffer Request 28.33% 242.554us 28.33% 242.554us 242.554us 4.096us 6.40% 4.096us 4.096us 1
- aten::empty_strided 3.74% 32.061us 3.74% 32.061us 5.344us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 28.79% 246.523us 28.79% 246.523us 27.391us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.02% 17.269us 2.63% 22.529us 2.503us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.08% 9.240us 1.08% 9.240us 0.616us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.11% 9.521us 1.11% 9.521us 3.174us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.03% 8.800us 1.03% 8.800us 2.933us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.68% 5.830us 0.84% 7.230us 2.410us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 335.486us 522.89% 335.486us 335.486us 1
+ torch_eager 15.29% 123.163us 99.38% 800.491us 800.491us 0.000us 0.00% 68.256us 68.256us 1
+ aten::conv1d 0.73% 5.840us 14.87% 119.763us 39.921us 0.000us 0.00% 41.760us 13.920us 3
+ aten::convolution 1.21% 9.761us 14.14% 113.923us 37.974us 0.000us 0.00% 41.760us 13.920us 3
+ aten::_convolution 2.84% 22.911us 12.93% 104.162us 34.721us 0.000us 0.00% 41.760us 13.920us 3
+ aten::_conv_depthwise2d 2.80% 22.570us 8.02% 64.572us 21.524us 41.760us 65.09% 41.760us 13.920us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.760us 65.09% 41.760us 13.920us 3
+ aten::to 0.73% 5.842us 65.67% 528.904us 88.151us 0.000us 0.00% 26.496us 4.416us 6
+ aten::_to_copy 2.94% 23.712us 64.94% 523.062us 87.177us 0.000us 0.00% 26.496us 4.416us 6
+ aten::copy_ 6.02% 48.492us 58.29% 469.521us 78.253us 22.400us 34.91% 26.496us 4.416us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.968us 18.65% 11.968us 3.989us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 16.26% 10.432us 3.477us 3
+ Activity Buffer Request 29.33% 236.206us 29.33% 236.206us 236.206us 4.096us 6.38% 4.096us 4.096us 1
+ aten::empty_strided 3.70% 29.829us 3.70% 29.829us 4.971us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.91% 208.693us 25.91% 208.693us 23.188us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.18% 17.569us 2.86% 23.069us 2.563us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.14% 9.222us 1.14% 9.222us 0.615us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.20% 9.631us 1.20% 9.631us 3.210us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.06% 8.501us 1.06% 8.501us 2.834us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.83% 6.660us 0.99% 7.990us 2.663us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 856.136us
-Self CUDA time total: 63.969us
+Self CPU time total: 805.451us
+Self CUDA time total: 64.160us
@@ -4825,29 +4825,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 357.859us 513.70% 357.859us 357.859us 1
- torch_eager 20.53% 180.503us 99.40% 873.955us 873.955us 0.000us 0.00% 73.695us 73.695us 1
- aten::conv1d 0.63% 5.530us 15.78% 138.703us 46.234us 0.000us 0.00% 47.359us 15.786us 3
- aten::convolution 1.12% 9.840us 15.15% 133.173us 44.391us 0.000us 0.00% 47.359us 15.786us 3
- aten::_convolution 2.65% 23.331us 14.03% 123.333us 41.111us 0.000us 0.00% 47.359us 15.786us 3
- aten::_conv_depthwise2d 2.63% 23.161us 9.53% 83.782us 27.927us 47.359us 67.98% 47.359us 15.786us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.359us 67.98% 47.359us 15.786us 3
- aten::to 0.72% 6.308us 59.85% 526.239us 87.707us 0.000us 0.00% 26.336us 4.389us 6
- aten::_to_copy 2.80% 24.578us 59.14% 519.931us 86.655us 0.000us 0.00% 26.336us 4.389us 6
- aten::copy_ 6.12% 53.792us 52.84% 464.590us 77.432us 22.304us 32.02% 26.336us 4.389us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.840us 17.00% 11.840us 3.947us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.464us 15.02% 10.464us 3.488us 3
- Activity Buffer Request 26.53% 233.244us 26.53% 233.244us 233.244us 4.032us 5.79% 4.032us 4.032us 1
- aten::empty_strided 3.50% 30.763us 3.50% 30.763us 5.127us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 22.92% 201.494us 22.92% 201.494us 22.388us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.03% 17.891us 2.67% 23.440us 2.604us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.06% 9.339us 1.06% 9.339us 0.623us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 2.95% 25.971us 2.95% 25.971us 8.657us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.22% 10.710us 1.22% 10.710us 3.570us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.71% 6.240us 0.88% 7.780us 2.593us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.218us 487.48% 340.218us 340.218us 1
+ torch_eager 15.18% 124.853us 99.38% 817.682us 817.682us 0.000us 0.00% 73.887us 73.887us 1
+ aten::conv1d 0.72% 5.910us 14.57% 119.903us 39.968us 0.000us 0.00% 47.328us 15.776us 3
+ aten::convolution 1.21% 9.960us 13.86% 113.993us 37.998us 0.000us 0.00% 47.328us 15.776us 3
+ aten::_convolution 2.81% 23.101us 12.64% 104.033us 34.678us 0.000us 0.00% 47.328us 15.776us 3
+ aten::_conv_depthwise2d 2.62% 21.561us 7.83% 64.432us 21.477us 47.328us 67.81% 47.328us 15.776us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.328us 67.81% 47.328us 15.776us 3
+ aten::to 0.75% 6.180us 66.30% 545.475us 90.913us 0.000us 0.00% 26.559us 4.426us 6
+ aten::_to_copy 2.97% 24.459us 65.55% 539.295us 89.882us 0.000us 0.00% 26.559us 4.426us 6
+ aten::copy_ 6.14% 50.491us 58.93% 484.862us 80.810us 22.463us 32.19% 26.559us 4.426us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.032us 17.24% 12.032us 4.011us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 14.95% 10.431us 3.477us 3
+ Activity Buffer Request 30.21% 248.576us 30.21% 248.576us 248.576us 4.096us 5.87% 4.096us 4.096us 1
+ aten::empty_strided 3.64% 29.974us 3.64% 29.974us 4.996us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.32% 208.345us 25.32% 208.345us 23.149us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.09% 17.201us 2.72% 22.401us 2.489us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.11% 9.120us 1.11% 9.120us 0.608us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.32% 10.899us 1.32% 10.899us 3.633us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.15% 9.422us 1.15% 9.422us 3.141us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.80% 6.580us 0.98% 8.070us 2.690us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 879.215us
-Self CUDA time total: 69.663us
+Self CPU time total: 822.752us
+Self CUDA time total: 69.791us
@@ -4857,29 +4857,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 365.250us 197.10% 365.250us 365.250us 1
- torch_eager 14.70% 119.032us 99.37% 804.604us 804.604us 0.000us 0.00% 195.299us 195.299us 1
- aten::conv1d 0.95% 7.700us 17.22% 139.393us 46.464us 0.000us 0.00% 133.056us 44.352us 3
- aten::convolution 1.24% 10.040us 16.26% 131.693us 43.898us 0.000us 0.00% 133.056us 44.352us 3
- aten::_convolution 2.91% 23.550us 15.02% 121.653us 40.551us 0.000us 0.00% 133.056us 44.352us 3
- aten::_conv_depthwise2d 2.69% 21.763us 10.08% 81.613us 27.204us 133.056us 71.80% 133.056us 44.352us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.056us 71.80% 133.056us 44.352us 3
- aten::to 0.75% 6.042us 64.10% 518.999us 86.500us 0.000us 0.00% 62.243us 10.374us 6
- aten::_to_copy 2.90% 23.470us 63.35% 512.957us 85.493us 0.000us 0.00% 62.243us 10.374us 6
- aten::copy_ 6.35% 51.412us 56.59% 458.237us 76.373us 52.258us 28.20% 62.243us 10.374us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.250us 15.78% 29.250us 9.750us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.008us 12.42% 23.008us 7.669us 3
- Activity Buffer Request 28.43% 230.213us 28.43% 230.213us 230.213us 9.985us 5.39% 9.985us 9.985us 1
- aten::empty_strided 3.86% 31.250us 3.86% 31.250us 5.208us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 26.81% 217.052us 26.81% 217.052us 24.117us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.10% 17.030us 2.74% 22.170us 2.463us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.13% 9.170us 1.13% 9.170us 0.611us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.22% 9.870us 1.22% 9.870us 3.290us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.18% 9.540us 1.18% 9.540us 3.180us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.78% 6.320us 1.00% 8.100us 2.700us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 357.276us 192.10% 357.276us 357.276us 1
+ torch_eager 7.25% 148.445us 99.75% 2.043ms 2.043ms 0.000us 0.00% 196.063us 196.063us 1
+ aten::conv1d 0.28% 5.714us 6.04% 123.725us 41.242us 0.000us 0.00% 133.535us 44.512us 3
+ aten::convolution 0.50% 10.209us 5.76% 118.011us 39.337us 0.000us 0.00% 133.535us 44.512us 3
+ aten::_convolution 1.22% 24.922us 5.26% 107.802us 35.934us 0.000us 0.00% 133.535us 44.512us 3
+ aten::_conv_depthwise2d 1.06% 21.740us 3.25% 66.540us 22.180us 133.535us 71.80% 133.535us 44.512us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.535us 71.80% 133.535us 44.512us 3
+ aten::to 0.32% 6.558us 85.01% 1.741ms 290.215us 0.000us 0.00% 62.528us 10.421us 6
+ aten::_to_copy 1.28% 26.242us 84.69% 1.735ms 289.122us 0.000us 0.00% 62.528us 10.421us 6
+ aten::copy_ 2.37% 48.539us 81.91% 1.678ms 279.634us 52.448us 28.20% 62.528us 10.421us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.536us 15.88% 29.536us 9.845us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 12.32% 22.912us 7.637us 3
+ Activity Buffer Request 70.45% 1.443ms 70.45% 1.443ms 1.443ms 10.080us 5.42% 10.080us 10.080us 1
+ aten::empty_strided 1.50% 30.691us 1.50% 30.691us 5.115us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.22% 209.265us 10.22% 209.265us 23.252us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.93% 19.072us 1.20% 24.640us 2.738us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 9.247us 0.45% 9.247us 0.616us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.55% 11.270us 0.55% 11.270us 3.757us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.51% 10.520us 0.51% 10.520us 3.507us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.29% 5.931us 0.35% 7.230us 2.410us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 809.694us
-Self CUDA time total: 185.314us
+Self CPU time total: 2.048ms
+Self CUDA time total: 185.983us
@@ -4889,29 +4889,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 352.824us 168.80% 352.824us 352.824us 1
- torch_eager 14.40% 121.160us 99.40% 836.424us 836.424us 0.000us 0.00% 222.266us 222.266us 1
- aten::conv1d 0.71% 5.981us 14.17% 119.243us 39.748us 0.000us 0.00% 153.724us 51.241us 3
- aten::convolution 1.17% 9.810us 13.46% 113.262us 37.754us 0.000us 0.00% 153.724us 51.241us 3
- aten::_convolution 2.76% 23.250us 12.29% 103.452us 34.484us 0.000us 0.00% 153.724us 51.241us 3
- aten::_conv_depthwise2d 2.65% 22.340us 7.64% 64.321us 21.440us 153.724us 73.55% 153.724us 51.241us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 153.724us 73.55% 153.724us 51.241us 3
- aten::to 0.70% 5.880us 67.58% 568.691us 94.782us 0.000us 0.00% 68.542us 11.424us 6
- aten::_to_copy 2.81% 23.631us 66.88% 562.811us 93.802us 0.000us 0.00% 68.542us 11.424us 6
- aten::copy_ 7.48% 62.921us 60.21% 506.640us 84.440us 55.294us 26.45% 68.542us 11.424us 6
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.352us 15.48% 32.352us 10.784us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.942us 10.98% 22.942us 7.647us 3
- Activity Buffer Request 31.88% 268.245us 31.88% 268.245us 268.245us 13.248us 6.34% 13.248us 13.248us 1
- aten::empty_strided 3.87% 32.540us 3.87% 32.540us 5.423us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 23.51% 197.824us 23.51% 197.824us 21.980us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 2.07% 17.378us 2.68% 22.521us 2.502us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 1.06% 8.883us 1.06% 8.883us 0.592us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.19% 9.991us 1.19% 9.991us 3.330us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 1.15% 9.640us 1.15% 9.640us 3.213us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.71% 5.990us 0.89% 7.470us 2.490us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 358.235us 170.21% 358.235us 358.235us 1
+ torch_eager 15.50% 124.275us 99.34% 796.461us 796.461us 0.000us 0.00% 224.253us 224.253us 1
+ aten::conv1d 0.70% 5.590us 14.78% 118.483us 39.494us 0.000us 0.00% 154.174us 51.391us 3
+ aten::convolution 1.24% 9.921us 14.08% 112.893us 37.631us 0.000us 0.00% 154.174us 51.391us 3
+ aten::_convolution 2.81% 22.549us 12.84% 102.972us 34.324us 0.000us 0.00% 154.174us 51.391us 3
+ aten::_conv_depthwise2d 2.82% 22.632us 8.11% 65.062us 21.687us 154.174us 73.26% 154.174us 51.391us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 154.174us 73.26% 154.174us 51.391us 3
+ aten::to 0.74% 5.971us 65.46% 524.833us 87.472us 0.000us 0.00% 70.079us 11.680us 6
+ aten::_to_copy 3.23% 25.880us 64.72% 518.862us 86.477us 0.000us 0.00% 70.079us 11.680us 6
+ aten::copy_ 6.33% 50.713us 57.67% 462.401us 77.067us 56.287us 26.74% 70.079us 11.680us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 33.248us 15.80% 33.248us 11.083us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.039us 10.95% 23.039us 7.680us 3
+ Activity Buffer Request 28.19% 225.995us 28.19% 225.995us 225.995us 13.792us 6.55% 13.792us 13.792us 1
+ aten::empty_strided 3.81% 30.581us 3.81% 30.581us 5.097us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 25.98% 208.263us 25.98% 208.263us 23.140us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.24% 17.992us 2.91% 23.301us 2.589us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.16% 9.309us 1.16% 9.309us 0.621us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.31% 10.480us 1.31% 10.480us 3.493us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.17% 9.380us 1.17% 9.380us 3.127us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.74% 5.910us 0.92% 7.370us 2.457us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 841.495us
-Self CUDA time total: 209.018us
+Self CPU time total: 801.751us
+Self CUDA time total: 210.461us
@@ -4921,29 +4921,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 6.78% 125.712us 53.74% 996.387us 996.387us 0.000us 0.00% 1.527ms 1.527ms 1
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.423ms 100.39% 1.423ms 1.423ms 1
- aten::to 0.35% 6.438us 38.84% 720.182us 120.030us 0.000us 0.00% 832.992us 138.832us 6
- aten::_to_copy 1.55% 28.691us 38.49% 713.744us 118.957us 0.000us 0.00% 832.992us 138.832us 6
- aten::copy_ 2.90% 53.742us 26.33% 488.279us 81.380us 724.000us 51.06% 832.992us 138.832us 6
- aten::conv1d 0.38% 6.960us 6.55% 121.533us 40.511us 0.000us 0.00% 693.950us 231.317us 3
- aten::convolution 0.56% 10.430us 6.18% 114.573us 38.191us 0.000us 0.00% 693.950us 231.317us 3
- aten::_convolution 1.25% 23.268us 5.62% 104.143us 34.714us 0.000us 0.00% 693.950us 231.317us 3
- aten::_conv_depthwise2d 1.23% 22.830us 3.48% 64.552us 21.517us 693.950us 48.94% 693.950us 231.317us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 693.950us 48.94% 693.950us 231.317us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 410.655us 28.96% 410.655us 136.885us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 313.345us 22.10% 313.345us 104.448us 3
- Activity Buffer Request 13.73% 254.654us 13.73% 254.654us 254.654us 108.992us 7.69% 108.992us 108.992us 1
- aten::empty_strided 2.01% 37.271us 10.61% 196.774us 32.796us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 10.89% 201.884us 10.89% 201.884us 22.432us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.98% 18.223us 1.29% 23.933us 2.659us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.51% 9.490us 0.51% 9.490us 0.633us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.54% 10.101us 0.54% 10.101us 3.367us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.52% 9.620us 0.52% 9.620us 3.207us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.34% 6.270us 0.41% 7.680us 2.560us 0.000us 0.00% 0.000us 0.000us 3
+ torch_eager 7.15% 131.473us 52.77% 970.085us 970.085us 0.000us 0.00% 1.521ms 1.521ms 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.421ms 100.40% 1.421ms 1.421ms 1
+ aten::to 0.36% 6.571us 37.17% 683.219us 113.870us 0.000us 0.00% 824.180us 137.363us 6
+ aten::_to_copy 1.61% 29.612us 36.81% 676.648us 112.775us 0.000us 0.00% 824.180us 137.363us 6
+ aten::copy_ 2.81% 51.569us 25.14% 462.051us 77.009us 718.613us 50.76% 824.180us 137.363us 6
+ aten::conv1d 0.36% 6.680us 6.82% 125.423us 41.808us 0.000us 0.00% 696.981us 232.327us 3
+ aten::convolution 0.57% 10.460us 6.46% 118.743us 39.581us 0.000us 0.00% 696.981us 232.327us 3
+ aten::_convolution 1.31% 24.040us 5.89% 108.283us 36.094us 0.000us 0.00% 696.981us 232.327us 3
+ aten::_conv_depthwise2d 1.25% 22.981us 3.69% 67.913us 22.638us 696.981us 49.24% 696.981us 232.327us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 696.981us 49.24% 696.981us 232.327us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 410.458us 29.00% 410.458us 136.819us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 308.155us 21.77% 308.155us 102.718us 3
+ Activity Buffer Request 11.91% 218.936us 11.91% 218.936us 218.936us 105.567us 7.46% 105.567us 105.567us 1
+ aten::empty_strided 2.01% 37.011us 10.06% 184.985us 30.831us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 11.74% 215.777us 11.74% 215.777us 23.975us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.99% 18.200us 1.31% 24.000us 2.667us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.53% 9.740us 0.53% 9.740us 0.649us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.59% 10.839us 0.59% 10.839us 3.613us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.54% 9.862us 0.54% 9.862us 3.287us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.34% 6.240us 0.42% 7.700us 2.567us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.854ms
-Self CUDA time total: 1.418ms
+Self CPU time total: 1.838ms
+Self CUDA time total: 1.416ms
@@ -4953,109 +4953,57 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_eager 4.03% 122.972us 65.43% 1.999ms 1.999ms 0.000us 0.00% 1.502ms 1.502ms 1
+ torch_eager 6.74% 124.615us 43.66% 806.720us 806.720us 0.000us 0.00% 1.502ms 1.502ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.433ms 100.41% 1.433ms 1.433ms 1
- aten::to 0.19% 5.740us 56.63% 1.730ms 288.331us 0.000us 0.00% 766.432us 127.739us 6
- aten::_to_copy 0.79% 24.119us 56.45% 1.724ms 287.375us 0.000us 0.00% 766.432us 127.739us 6
- aten::copy_ 1.70% 52.020us 54.70% 1.671ms 278.493us 691.168us 48.43% 766.432us 127.739us 6
- aten::conv1d 0.23% 6.891us 3.86% 118.002us 39.334us 0.000us 0.00% 736.031us 245.344us 3
- aten::convolution 0.33% 9.930us 3.64% 111.111us 37.037us 0.000us 0.00% 736.031us 245.344us 3
- aten::_convolution 0.74% 22.558us 3.31% 101.181us 33.727us 0.000us 0.00% 736.031us 245.344us 3
- aten::_conv_depthwise2d 0.70% 21.291us 2.07% 63.232us 21.077us 736.031us 51.57% 736.031us 245.344us 3
-void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 736.031us 51.57% 736.031us 245.344us 3
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 401.120us 28.11% 401.120us 133.707us 3
-void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 290.048us 20.32% 290.048us 96.683us 3
- Activity Buffer Request 47.17% 1.441ms 47.17% 1.441ms 1.441ms 75.264us 5.27% 75.264us 75.264us 1
- aten::empty_strided 0.95% 29.171us 0.95% 29.171us 4.862us 0.000us 0.00% 0.000us 0.000us 6
- cudaLaunchKernel 6.58% 201.084us 6.58% 201.084us 22.343us 0.000us 0.00% 0.000us 0.000us 9
- aten::unsqueeze 0.57% 17.550us 0.75% 22.971us 2.552us 0.000us 0.00% 0.000us 0.000us 9
- aten::as_strided 0.30% 9.131us 0.30% 9.131us 0.609us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 0.33% 9.960us 0.33% 9.960us 3.320us 0.000us 0.00% 0.000us 0.000us 3
- aten::resize_ 0.30% 9.060us 0.30% 9.060us 3.020us 0.000us 0.00% 0.000us 0.000us 3
- aten::squeeze 0.18% 5.561us 0.23% 7.041us 2.347us 0.000us 0.00% 0.000us 0.000us 3
+ aten::to 0.34% 6.269us 28.35% 523.751us 87.292us 0.000us 0.00% 764.786us 127.464us 6
+ aten::_to_copy 1.27% 23.480us 28.01% 517.482us 86.247us 0.000us 0.00% 764.786us 127.464us 6
+ aten::copy_ 2.74% 50.661us 25.15% 464.712us 77.452us 690.099us 48.36% 764.786us 127.464us 6
+ aten::conv1d 0.32% 5.870us 7.00% 129.374us 43.125us 0.000us 0.00% 737.040us 245.680us 3
+ aten::convolution 0.54% 9.999us 6.68% 123.504us 41.168us 0.000us 0.00% 737.040us 245.680us 3
+ aten::_convolution 1.31% 24.293us 6.14% 113.505us 37.835us 0.000us 0.00% 737.040us 245.680us 3
+ aten::_conv_depthwise2d 1.62% 30.010us 3.95% 73.060us 24.353us 737.040us 51.64% 737.040us 245.680us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 737.040us 51.64% 737.040us 245.680us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 399.673us 28.01% 399.673us 133.224us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 290.426us 20.35% 290.426us 96.809us 3
+ Activity Buffer Request 12.15% 224.466us 12.15% 224.466us 224.466us 74.687us 5.23% 74.687us 74.687us 1
+ aten::empty_strided 1.59% 29.290us 1.59% 29.290us 4.882us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 11.52% 212.785us 11.52% 212.785us 23.643us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.94% 17.281us 1.23% 22.771us 2.530us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.55% 10.081us 0.55% 10.081us 0.672us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.57% 10.440us 0.57% 10.440us 3.480us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.51% 9.410us 0.51% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.33% 6.150us 0.41% 7.641us 2.547us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 3.055ms
+Self CPU time total: 1.848ms
Self CUDA time total: 1.427ms
impl wl p50(ms) ok
torch_eager cuda_B2_D2048_S128_W2 0.09 True
torch_eager cuda_B2_D2048_S128_W4 0.08 True
-torch_eager cuda_B2_D2048_S2048_W2 0.14 True
+torch_eager cuda_B2_D2048_S2048_W2 0.15 True
torch_eager cuda_B2_D2048_S2048_W4 0.16 True
torch_eager cuda_B2_D2048_S512_W2 0.09 True
-torch_eager cuda_B2_D2048_S512_W4 0.08 True
+torch_eager cuda_B2_D2048_S512_W4 0.09 True
torch_eager cuda_B2_D64_S128_W2 0.07 True
torch_eager cuda_B2_D64_S128_W4 0.09 True
torch_eager cuda_B2_D64_S2048_W2 0.09 True
-torch_eager cuda_B2_D64_S2048_W4 0.08 True
+torch_eager cuda_B2_D64_S2048_W4 0.09 True
torch_eager cuda_B2_D64_S512_W2 0.09 True
torch_eager cuda_B2_D64_S512_W4 0.09 True
-torch_eager cuda_B4_D2048_S128_W2 0.08 True
-torch_eager cuda_B4_D2048_S128_W4 0.08 True
+torch_eager cuda_B4_D2048_S128_W2 0.09 True
+torch_eager cuda_B4_D2048_S128_W4 0.09 True
torch_eager cuda_B4_D2048_S2048_W2 0.49 True
torch_eager cuda_B4_D2048_S2048_W4 0.50 True
-torch_eager cuda_B4_D2048_S512_W2 0.09 True
+torch_eager cuda_B4_D2048_S512_W2 0.10 True
torch_eager cuda_B4_D2048_S512_W4 0.10 True
-torch_eager cuda_B4_D64_S128_W2 0.08 True
+torch_eager cuda_B4_D64_S128_W2 0.09 True
torch_eager cuda_B4_D64_S128_W4 0.08 True
-torch_eager cuda_B4_D64_S2048_W2 0.08 True
-torch_eager cuda_B4_D64_S2048_W4 0.08 True
-torch_eager cuda_B4_D64_S512_W2 0.08 True
-torch_eager cuda_B4_D64_S512_W4 0.08 True
+torch_eager cuda_B4_D64_S2048_W2 0.09 True
+torch_eager cuda_B4_D64_S2048_W4 0.09 True
+torch_eager cuda_B4_D64_S512_W2 0.09 True
+torch_eager cuda_B4_D64_S512_W4 0.09 True
-
-
-
- Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
-Downloading pillow (6.7MiB)
-Downloading fonttools (4.7MiB)
-Downloading setuptools (1.1MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading sympy (6.0MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading networkx (1.9MiB)
-Downloading numpy (16.2MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading torch (846.9MiB)
-Downloading triton (148.3MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
- Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
- Downloading nvidia-cuda-cupti-cu12
- Downloading matplotlib
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
-Installed 37 packages in 218ms
-
-
Artifacts:
causal_conv1d.jsonl
diff --git a/causal_conv1d/results/artifacts/combine/latency.svg b/causal_conv1d/results/artifacts/combine/latency.svg
index 9b058d2666ce3f17f1e0271794e89c52b55a50d5..1051764b171c27ddd8f8651b286d107eb666bd69 100644
--- a/causal_conv1d/results/artifacts/combine/latency.svg
+++ b/causal_conv1d/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:cf8858bb054bd7e8f82af77fd05a6475b7ee3a9a335ba4a6506cd1c694804777
+oid sha256:6fdf61512b0add92f3d8e4a284ecb814f7a3b11b2db0fe3af610896a05d7072f
size 35426
diff --git a/causal_conv1d/results/combined_results.html b/causal_conv1d/results/combined_results.html
index 478077209c7e2fef5044dc68f9a6ef240e0167c9..6a99b42f98995858e618176be6ad4beb1b59c2c4 100644
--- a/causal_conv1d/results/combined_results.html
+++ b/causal_conv1d/results/combined_results.html
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
- 2025-10-30T15:53:58.349427
+ 2025-10-31T20:14:05.716143
image/svg+xml
@@ -4451,70 +4451,70 @@ body[data-tool="eraser"] .main-content {
-
+
-
+
- 0.1
+ 0.1
-
+
-
+
- 0.2
+ 0.2
-
+
-
+
- 0.3
+ 0.3
-
+
-
+
- 0.4
+ 0.4
-
+
-
+
- 0.5
+ 0.5
@@ -4522,66 +4522,66 @@ body[data-tool="eraser"] .main-content {
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -4640,7 +4640,7 @@ body[data-tool="eraser"] .main-content {
▼ output
▶ uv-logs
|
-Cell: combine | 4.38s
+Cell: combine | 4.43s
| ▶ run
Copy
Raw
@@ -4753,28 +4753,28 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
torch_eager cuda_B2_D2048_S128_W2 0.09 True
torch_eager cuda_B2_D2048_S128_W4 0.08 True
-torch_eager cuda_B2_D2048_S2048_W2 0.14 True
+torch_eager cuda_B2_D2048_S2048_W2 0.15 True
torch_eager cuda_B2_D2048_S2048_W4 0.16 True
torch_eager cuda_B2_D2048_S512_W2 0.09 True
-torch_eager cuda_B2_D2048_S512_W4 0.08 True
+torch_eager cuda_B2_D2048_S512_W4 0.09 True
torch_eager cuda_B2_D64_S128_W2 0.07 True
torch_eager cuda_B2_D64_S128_W4 0.09 True
torch_eager cuda_B2_D64_S2048_W2 0.09 True
-torch_eager cuda_B2_D64_S2048_W4 0.08 True
+torch_eager cuda_B2_D64_S2048_W4 0.09 True
torch_eager cuda_B2_D64_S512_W2 0.09 True
torch_eager cuda_B2_D64_S512_W4 0.09 True
-torch_eager cuda_B4_D2048_S128_W2 0.08 True
-torch_eager cuda_B4_D2048_S128_W4 0.08 True
+torch_eager cuda_B4_D2048_S128_W2 0.09 True
+torch_eager cuda_B4_D2048_S128_W4 0.09 True
torch_eager cuda_B4_D2048_S2048_W2 0.49 True
torch_eager cuda_B4_D2048_S2048_W4 0.50 True
-torch_eager cuda_B4_D2048_S512_W2 0.09 True
+torch_eager cuda_B4_D2048_S512_W2 0.10 True
torch_eager cuda_B4_D2048_S512_W4 0.10 True
-torch_eager cuda_B4_D64_S128_W2 0.08 True
+torch_eager cuda_B4_D64_S128_W2 0.09 True
torch_eager cuda_B4_D64_S128_W4 0.08 True
-torch_eager cuda_B4_D64_S2048_W2 0.08 True
-torch_eager cuda_B4_D64_S2048_W4 0.08 True
-torch_eager cuda_B4_D64_S512_W2 0.08 True
-torch_eager cuda_B4_D64_S512_W4 0.08 True
+torch_eager cuda_B4_D64_S2048_W2 0.09 True
+torch_eager cuda_B4_D64_S2048_W4 0.09 True
+torch_eager cuda_B4_D64_S512_W2 0.09 True
+torch_eager cuda_B4_D64_S512_W4 0.09 True
GENERATING COMBINED VISUALIZATION
@@ -4794,7 +4794,7 @@ Implementations included:
-Installed 37 packages in 211ms
+Installed 37 packages in 238ms
@@ -4807,7 +4807,7 @@ Installed 37 packages in 211ms
- 2025-10-30T15:53:58.349427
+ 2025-10-31T20:14:05.716143
image/svg+xml
@@ -5151,70 +5151,70 @@ Installed 37 packages in 211ms
-
+
-
+
- 0.1
+ 0.1
-
+
-
+
- 0.2
+ 0.2
-
+
-
+
- 0.3
+ 0.3
-
+
-
+
- 0.4
+ 0.4
-
+
-
+
- 0.5
+ 0.5
@@ -5222,66 +5222,66 @@ Installed 37 packages in 211ms
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl b/deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..52c7930d88f40dd4da2a4cc2aa3b8068bb350deb
--- /dev/null
+++ b/deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl
@@ -0,0 +1,4 @@
+{"ts": "2025-10-31T20:13:50Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.3733269999629556, "p50": 3.3932979999917734, "p90": 3.4002180000243243, "mean": 3.393551400040451, "iqr": 0.010580999969533877, "raw_times": [3.3896370000547904, 3.4002180000243243, 3.3932979999917734, 3.3733269999629556, 3.411277000168411], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.4049870000671945, "peak_bytes": 5929472, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
+{"ts": "2025-10-31T20:13:51Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.99112300010529, "p50": 4.007804000139004, "p90": 4.020502999992459, "mean": 4.014501400024528, "iqr": 0.017490000118414173, "raw_times": [4.050064000011844, 4.020502999992459, 4.007804000139004, 4.003012999874045, 3.99112300010529], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.017783999870517, "peak_bytes": 15161856, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
+{"ts": "2025-10-31T20:13:51Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.004662999932407, "p50": 4.020202999981848, "p90": 4.030714000009539, "mean": 4.022331200030749, "iqr": 0.011850999953821884, "raw_times": [4.018863000055717, 4.004662999932407, 4.0372130001742335, 4.020202999981848, 4.030714000009539], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.032904000041526, "peak_bytes": 11958784, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
+{"ts": "2025-10-31T20:13:52Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.005022999990615, "p50": 4.020072999992408, "p90": 4.0240040000298904, "mean": 4.01746140000796, "iqr": 0.009850999958871398, "raw_times": [4.014153000071019, 4.005022999990615, 4.024053999955868, 4.0240040000298904, 4.020072999992408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.024974000003567, "peak_bytes": 30977024, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
diff --git a/deformable_detr/impls/cells/benchmark.py b/deformable_detr/impls/cells/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..66ccdf2085524240060089c8658a5256c484037b
--- /dev/null
+++ b/deformable_detr/impls/cells/benchmark.py
@@ -0,0 +1,118 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_deformable_detr(
+ value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
+):
+ """
+ PyTorch native reference implementation of multi-scale deformable attention.
+ Uses vectorized bilinear interpolation for reasonable performance.
+ """
+ bs, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+ _, _, _, channels = value.shape
+
+ output = torch.zeros(bs, num_queries, num_heads, channels, device=value.device, dtype=value.dtype)
+
+ # Split value tensor by levels
+ value_list = value.split([int(h * w) for h, w in spatial_shapes.tolist()], dim=1)
+
+ # Iterate through each level (can't avoid this loop easily)
+ for level_idx in range(num_levels):
+ h, w = spatial_shapes[level_idx].tolist()
+ value_level = value_list[level_idx] # (bs, h*w, num_heads, channels)
+
+ # Reshape to spatial grid: (bs, num_heads, channels, h, w)
+ value_spatial = value_level.reshape(bs, h, w, num_heads, channels).permute(0, 3, 4, 1, 2)
+
+ # Get sampling locations and weights for this level
+ # loc: (bs, num_queries, num_heads, num_points, 2)
+ loc = sampling_locations[:, :, :, level_idx, :, :]
+ # weight: (bs, num_queries, num_heads, num_points)
+ weight = attention_weights[:, :, :, level_idx, :]
+
+ # Convert normalized coordinates to pixel coordinates
+ # loc[..., 0] is x (width), loc[..., 1] is y (height)
+ x = loc[..., 0] * w - 0.5 # (bs, num_queries, num_heads, num_points)
+ y = loc[..., 1] * h - 0.5
+
+ # Get integer coordinates for bilinear interpolation
+ x0 = torch.floor(x).long()
+ y0 = torch.floor(y).long()
+ x1 = x0 + 1
+ y1 = y0 + 1
+
+ # Compute interpolation weights BEFORE clamping (important!)
+ lw = x - x0.float() # weight for x direction
+ lh = y - y0.float() # weight for y direction
+ hw = 1 - lw
+ hh = 1 - lh
+
+ # Create mask for valid sample locations
+ valid = (y > -1) & (x > -1) & (y < h) & (x < w)
+
+ # Create masks for each corner being in bounds
+ mask_tl = ((y0 >= 0) & (x0 >= 0)).unsqueeze(-1).float()
+ mask_tr = ((y0 >= 0) & (x1 <= w - 1)).unsqueeze(-1).float()
+ mask_bl = ((y1 <= h - 1) & (x0 >= 0)).unsqueeze(-1).float()
+ mask_br = ((y1 <= h - 1) & (x1 <= w - 1)).unsqueeze(-1).float()
+
+ # Clamp coordinates for safe indexing
+ x0_clamped = torch.clamp(x0, 0, w - 1)
+ x1_clamped = torch.clamp(x1, 0, w - 1)
+ y0_clamped = torch.clamp(y0, 0, h - 1)
+ y1_clamped = torch.clamp(y1, 0, h - 1)
+
+ # Bilinear interpolation weights for all 4 corners
+ w_tl = (hh * hw).unsqueeze(-1) # top-left: (bs, num_queries, num_heads, num_points, 1)
+ w_tr = (hh * lw).unsqueeze(-1) # top-right
+ w_bl = (lh * hw).unsqueeze(-1) # bottom-left
+ w_br = (lh * lw).unsqueeze(-1) # bottom-right
+
+ # Gather values from the 4 corners using advanced indexing
+ batch_idx = torch.arange(bs, device=value.device).view(bs, 1, 1, 1).expand(bs, num_queries, num_heads, num_points)
+ head_idx = torch.arange(num_heads, device=value.device).view(1, 1, num_heads, 1).expand(bs, num_queries, num_heads, num_points)
+
+ # Gather corner values with clamped indices, then apply corner masks
+ v_tl = value_spatial[batch_idx, head_idx, :, y0_clamped, x0_clamped] * mask_tl
+ v_tr = value_spatial[batch_idx, head_idx, :, y0_clamped, x1_clamped] * mask_tr
+ v_bl = value_spatial[batch_idx, head_idx, :, y1_clamped, x0_clamped] * mask_bl
+ v_br = value_spatial[batch_idx, head_idx, :, y1_clamped, x1_clamped] * mask_br
+
+ # Bilinear interpolation
+ sampled = w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br
+
+ # Apply valid mask (only accumulate if entire sample location is valid)
+ sampled = sampled * valid.unsqueeze(-1).float()
+
+ # Apply attention weights and sum over points
+ # weight: (bs, num_queries, num_heads, num_points)
+ # Expand weight: (bs, num_queries, num_heads, num_points, 1)
+ weighted_sampled = sampled * weight.unsqueeze(-1)
+
+ # Sum over points: (bs, num_queries, num_heads, channels)
+ output += weighted_sampled.sum(dim=3)
+
+ # Flatten last two dimensions to match kernel output
+ return output.reshape(bs, num_queries, num_heads * channels)
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
+ impl_name="torch_eager",
+ impl_tags={"family": "pytorch", "backend": "eager"},
+ impl_func=torch_deformable_detr,
+ dtype="float32",
+)
\ No newline at end of file
diff --git a/deformable_detr/impls/cells/nv.py b/deformable_detr/impls/cells/nv.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5
--- /dev/null
+++ b/deformable_detr/impls/cells/nv.py
@@ -0,0 +1,2 @@
+import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
\ No newline at end of file
diff --git a/deformable_detr/impls/hf_kernels_deformable_detr.html b/deformable_detr/impls/hf_kernels_deformable_detr.html
new file mode 100644
index 0000000000000000000000000000000000000000..8203846442acfc0a17b0a7372d2971964aac9caf
--- /dev/null
+++ b/deformable_detr/impls/hf_kernels_deformable_detr.html
@@ -0,0 +1,4350 @@
+
+
+
+
+
+ hf_kernels_deformable_detr
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
HF Kernels - Deformable DETR
+
GPU Info
+
+
+
+
+
import subprocess
+print ( subprocess . run ([ "nvidia-smi" ], capture_output = True , text = True ) . stdout )
+
+
+
+
+
+
+
Fri Oct 31 20:13:34 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
+|-----------------------------------------+------------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+========================+======================|
+| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+| N/A 43C P0 83W / 350W | 0MiB / 46068MiB | 60% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=========================================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+
+
Deformable DETR Multi-Scale Deformable Attention Benchmark
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum , run_benchmark
+from kernels import get_kernel
+
+# Load the deformable DETR kernel
+deformable_detr = get_kernel ( "kernels-community/deformable-detr" )
+
+
+def hf_kernels_deformable_detr (
+ value , spatial_shapes , level_start_index , sampling_locations , attention_weights , im2col_step = 64
+):
+ """HuggingFace Kernels Deformable DETR Multi-Scale Deformable Attention"""
+ return deformable_detr . ms_deform_attn_forward (
+ value = value ,
+ spatial_shapes = spatial_shapes ,
+ level_start_index = level_start_index ,
+ sampling_loc = sampling_locations ,
+ attn_weight = attention_weights ,
+ im2col_step = im2col_step
+ )
+
+
+run_benchmark (
+ kernel_type = KernelTypeEnum . DEFORMABLE_DETR ,
+ impl_name = "hf_kernels_deformable_detr" ,
+ impl_tags = { "family" : "hf-kernels" , "backend" : "cuda" },
+ impl_func = hf_kernels_deformable_detr ,
+ dtype = "float32" ,
+)
+
+
+
+
+
+
+
Running deformable_detr benchmark on cuda with 4 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q100_H8_E256_L4_P4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 195.201us 770.15% 195.201us 195.201us 1
+ hf_kernels_deformable_detr 7.43% 141.524us 99.61% 1.898ms 1.898ms 0.000us 0.00% 26.403us 26.403us 1
+ _deformable_detr_57c3d32::ms_deform_attn_forward 3.93% 74.960us 92.19% 1.756ms 585.455us 22.464us 88.63% 26.403us 8.801us 3
+void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.464us 88.63% 22.464us 7.488us 3
+ aten::zeros 1.20% 22.800us 85.08% 1.621ms 540.337us 0.000us 0.00% 3.939us 1.313us 3
+ aten::zero_ 0.89% 16.910us 82.13% 1.565ms 521.590us 0.000us 0.00% 3.939us 1.313us 3
+ aten::fill_ 1.72% 32.820us 81.24% 1.548ms 515.953us 2.882us 11.37% 3.939us 1.313us 3
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.882us 11.37% 2.882us 0.961us 3
+ Activity Buffer Request 77.24% 1.472ms 77.24% 1.472ms 1.472ms 1.057us 4.17% 1.057us 1.057us 1
+ aten::empty 1.76% 33.441us 1.76% 33.441us 11.147us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.19% 60.842us 3.19% 60.842us 10.140us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.89% 16.922us 0.89% 16.922us 2.820us 0.000us 0.00% 0.000us 0.000us 6
+ aten::select 1.13% 21.591us 1.37% 26.081us 8.694us 0.000us 0.00% 0.000us 0.000us 3
+ aten::as_strided 0.24% 4.490us 0.24% 4.490us 1.497us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.39% 7.340us 0.39% 7.340us 7.340us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.905ms
+Self CUDA time total: 25.346us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q300_H8_E256_L4_P4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 144.191us 546.22% 144.191us 144.191us 1
+ hf_kernels_deformable_detr 4.39% 75.912us 99.67% 1.722ms 1.722ms 0.000us 0.00% 27.358us 27.358us 1
+ _deformable_detr_57c3d32::ms_deform_attn_forward 2.01% 34.700us 95.28% 1.646ms 548.647us 23.550us 89.21% 27.358us 9.119us 3
+void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 23.550us 89.21% 23.550us 7.850us 3
+ aten::zeros 0.49% 8.451us 91.07% 1.573ms 524.424us 0.000us 0.00% 3.808us 1.269us 3
+ aten::zero_ 0.50% 8.669us 89.54% 1.547ms 515.616us 0.000us 0.00% 3.808us 1.269us 3
+ aten::fill_ 1.60% 27.701us 89.04% 1.538ms 512.727us 2.848us 10.79% 3.808us 1.269us 3
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.848us 10.79% 2.848us 0.949us 3
+ Activity Buffer Request 85.90% 1.484ms 85.90% 1.484ms 1.484ms 0.960us 3.64% 0.960us 0.960us 1
+ aten::empty 1.04% 17.971us 1.04% 17.971us 5.990us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 2.40% 41.442us 2.40% 41.442us 6.907us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.54% 9.400us 0.54% 9.400us 1.567us 0.000us 0.00% 0.000us 0.000us 6
+ aten::select 0.66% 11.329us 0.79% 13.720us 4.573us 0.000us 0.00% 0.000us 0.000us 3
+ aten::as_strided 0.14% 2.391us 0.14% 2.391us 0.797us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.33% 5.680us 0.33% 5.680us 5.680us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.728ms
+Self CUDA time total: 26.398us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q100_H8_E256_L4_P4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 140.288us 549.37% 140.288us 140.288us 1
+ hf_kernels_deformable_detr 4.34% 74.492us 99.67% 1.709ms 1.709ms 0.000us 0.00% 26.464us 26.464us 1
+ _deformable_detr_57c3d32::ms_deform_attn_forward 1.96% 33.680us 95.32% 1.635ms 544.984us 22.752us 89.10% 26.464us 8.821us 3
+void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.752us 89.10% 22.752us 7.584us 3
+ aten::zeros 0.50% 8.650us 91.19% 1.564ms 521.367us 0.000us 0.00% 3.712us 1.237us 3
+ aten::zero_ 0.47% 8.130us 89.69% 1.538ms 512.773us 0.000us 0.00% 3.712us 1.237us 3
+ aten::fill_ 1.63% 27.881us 89.21% 1.530ms 510.063us 2.784us 10.90% 3.712us 1.237us 3
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.784us 10.90% 2.784us 0.928us 3
+ Activity Buffer Request 86.04% 1.476ms 86.04% 1.476ms 1.476ms 0.928us 3.63% 0.928us 0.928us 1
+ aten::empty 1.00% 17.131us 1.00% 17.131us 5.710us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 2.42% 41.510us 2.42% 41.510us 6.918us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.52% 8.991us 0.52% 8.991us 1.498us 0.000us 0.00% 0.000us 0.000us 6
+ aten::select 0.62% 10.681us 0.77% 13.291us 4.430us 0.000us 0.00% 0.000us 0.000us 3
+ aten::as_strided 0.15% 2.610us 0.15% 2.610us 0.870us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.33% 5.730us 0.33% 5.730us 5.730us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.715ms
+Self CUDA time total: 25.536us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q300_H8_E256_L4_P4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 151.934us 322.76% 151.934us 151.934us 1
+ hf_kernels_deformable_detr 3.86% 74.313us 99.75% 1.919ms 1.919ms 0.000us 0.00% 48.129us 48.129us 1
+ _deformable_detr_57c3d32::ms_deform_attn_forward 1.79% 34.420us 95.88% 1.844ms 614.769us 43.968us 93.40% 48.129us 16.043us 3
+void ms_deformable_im2col_gpu_kernel<float>(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 43.968us 93.40% 43.968us 14.656us 3
+ aten::zeros 0.45% 8.600us 92.03% 1.770ms 590.092us 0.000us 0.00% 4.161us 1.387us 3
+ aten::zero_ 0.45% 8.690us 90.72% 1.745ms 581.642us 0.000us 0.00% 4.161us 1.387us 3
+ aten::fill_ 1.44% 27.641us 90.26% 1.736ms 578.745us 3.105us 6.60% 4.161us 1.387us 3
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.105us 6.60% 3.105us 1.035us 3
+ Activity Buffer Request 76.84% 1.478ms 76.84% 1.478ms 1.478ms 1.056us 2.24% 1.056us 1.056us 1
+ aten::empty 0.87% 16.750us 0.87% 16.750us 5.583us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 12.74% 245.037us 12.74% 245.037us 40.839us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.49% 9.420us 0.49% 9.420us 1.570us 0.000us 0.00% 0.000us 0.000us 6
+ aten::select 0.66% 12.781us 0.82% 15.781us 5.260us 0.000us 0.00% 0.000us 0.000us 3
+ aten::as_strided 0.16% 3.000us 0.16% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.25% 4.890us 0.25% 4.890us 4.890us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.924ms
+Self CUDA time total: 47.073us
+
+
+impl wl p50(ms) ok
+hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
+hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.05 True
+hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.05 True
+hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
+
+
+
+
+Installed 52 packages in 237ms
+
+
+
Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
+Fetching 7 files: 14%|█▍ | 1/7 [00:00<00:00, 6.20it/s]
+Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 9.26it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 12.59it/s]
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/deformable_detr/impls/index.html b/deformable_detr/impls/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..507f4753d9c1efbdcbe259d5a8105e4524b0527f
--- /dev/null
+++ b/deformable_detr/impls/index.html
@@ -0,0 +1,89 @@
+
+
+
+
+
+ Index of /deformable_detr/impls
+
+
+
+
+ Index of /deformable_detr/impls
+
+
+
\ No newline at end of file
diff --git a/deformable_detr/impls/torch_deformable_detr.html b/deformable_detr/impls/torch_deformable_detr.html
new file mode 100644
index 0000000000000000000000000000000000000000..1d330b066f83130623802310ab8c5a5ceec69b71
--- /dev/null
+++ b/deformable_detr/impls/torch_deformable_detr.html
@@ -0,0 +1,4434 @@
+
+
+
+
+
+ torch_deformable_detr
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
PyTorch Native - Deformable DETR
+
GPU Info
+
+
+
+
+
import subprocess
+print ( subprocess . run ([ "nvidia-smi" ], capture_output = True , text = True ) . stdout )
+
+
+
+
+
+
+
Fri Oct 31 20:13:34 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
+|-----------------------------------------+------------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+========================+======================|
+| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+| N/A 43C P0 83W / 350W | 0MiB / 46068MiB | 60% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=========================================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+
+
Deformable DETR Multi-Scale Deformable Attention Benchmark (PyTorch Native)
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum , run_benchmark
+
+
+def torch_deformable_detr (
+ value , spatial_shapes , level_start_index , sampling_locations , attention_weights , im2col_step = 64
+):
+ """
+ PyTorch native reference implementation of multi-scale deformable attention.
+ Uses vectorized bilinear interpolation for reasonable performance.
+ """
+ bs , num_queries , num_heads , num_levels , num_points , _ = sampling_locations . shape
+ _ , _ , _ , channels = value . shape
+
+ output = torch . zeros ( bs , num_queries , num_heads , channels , device = value . device , dtype = value . dtype )
+
+ # Split value tensor by levels
+ value_list = value . split ([ int ( h * w ) for h , w in spatial_shapes . tolist ()], dim = 1 )
+
+ # Iterate through each level (can't avoid this loop easily)
+ for level_idx in range ( num_levels ):
+ h , w = spatial_shapes [ level_idx ] . tolist ()
+ value_level = value_list [ level_idx ] # (bs, h*w, num_heads, channels)
+
+ # Reshape to spatial grid: (bs, num_heads, channels, h, w)
+ value_spatial = value_level . reshape ( bs , h , w , num_heads , channels ) . permute ( 0 , 3 , 4 , 1 , 2 )
+
+ # Get sampling locations and weights for this level
+ # loc: (bs, num_queries, num_heads, num_points, 2)
+ loc = sampling_locations [:, :, :, level_idx , :, :]
+ # weight: (bs, num_queries, num_heads, num_points)
+ weight = attention_weights [:, :, :, level_idx , :]
+
+ # Convert normalized coordinates to pixel coordinates
+ # loc[..., 0] is x (width), loc[..., 1] is y (height)
+ x = loc [ ... , 0 ] * w - 0.5 # (bs, num_queries, num_heads, num_points)
+ y = loc [ ... , 1 ] * h - 0.5
+
+ # Get integer coordinates for bilinear interpolation
+ x0 = torch . floor ( x ) . long ()
+ y0 = torch . floor ( y ) . long ()
+ x1 = x0 + 1
+ y1 = y0 + 1
+
+ # Compute interpolation weights BEFORE clamping (important!)
+ lw = x - x0 . float () # weight for x direction
+ lh = y - y0 . float () # weight for y direction
+ hw = 1 - lw
+ hh = 1 - lh
+
+ # Create mask for valid sample locations
+ valid = ( y > - 1 ) & ( x > - 1 ) & ( y < h ) & ( x < w )
+
+ # Create masks for each corner being in bounds
+ mask_tl = (( y0 >= 0 ) & ( x0 >= 0 )) . unsqueeze ( - 1 ) . float ()
+ mask_tr = (( y0 >= 0 ) & ( x1 <= w - 1 )) . unsqueeze ( - 1 ) . float ()
+ mask_bl = (( y1 <= h - 1 ) & ( x0 >= 0 )) . unsqueeze ( - 1 ) . float ()
+ mask_br = (( y1 <= h - 1 ) & ( x1 <= w - 1 )) . unsqueeze ( - 1 ) . float ()
+
+ # Clamp coordinates for safe indexing
+ x0_clamped = torch . clamp ( x0 , 0 , w - 1 )
+ x1_clamped = torch . clamp ( x1 , 0 , w - 1 )
+ y0_clamped = torch . clamp ( y0 , 0 , h - 1 )
+ y1_clamped = torch . clamp ( y1 , 0 , h - 1 )
+
+ # Bilinear interpolation weights for all 4 corners
+ w_tl = ( hh * hw ) . unsqueeze ( - 1 ) # top-left: (bs, num_queries, num_heads, num_points, 1)
+ w_tr = ( hh * lw ) . unsqueeze ( - 1 ) # top-right
+ w_bl = ( lh * hw ) . unsqueeze ( - 1 ) # bottom-left
+ w_br = ( lh * lw ) . unsqueeze ( - 1 ) # bottom-right
+
+ # Gather values from the 4 corners using advanced indexing
+ batch_idx = torch . arange ( bs , device = value . device ) . view ( bs , 1 , 1 , 1 ) . expand ( bs , num_queries , num_heads , num_points )
+ head_idx = torch . arange ( num_heads , device = value . device ) . view ( 1 , 1 , num_heads , 1 ) . expand ( bs , num_queries , num_heads , num_points )
+
+ # Gather corner values with clamped indices, then apply corner masks
+ v_tl = value_spatial [ batch_idx , head_idx , :, y0_clamped , x0_clamped ] * mask_tl
+ v_tr = value_spatial [ batch_idx , head_idx , :, y0_clamped , x1_clamped ] * mask_tr
+ v_bl = value_spatial [ batch_idx , head_idx , :, y1_clamped , x0_clamped ] * mask_bl
+ v_br = value_spatial [ batch_idx , head_idx , :, y1_clamped , x1_clamped ] * mask_br
+
+ # Bilinear interpolation
+ sampled = w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br
+
+ # Apply valid mask (only accumulate if entire sample location is valid)
+ sampled = sampled * valid . unsqueeze ( - 1 ) . float ()
+
+ # Apply attention weights and sum over points
+ # weight: (bs, num_queries, num_heads, num_points)
+ # Expand weight: (bs, num_queries, num_heads, num_points, 1)
+ weighted_sampled = sampled * weight . unsqueeze ( - 1 )
+
+ # Sum over points: (bs, num_queries, num_heads, channels)
+ output += weighted_sampled . sum ( dim = 3 )
+
+ # Flatten last two dimensions to match kernel output
+ return output . reshape ( bs , num_queries , num_heads * channels )
+
+
+run_benchmark (
+ kernel_type = KernelTypeEnum . DEFORMABLE_DETR ,
+ impl_name = "torch_eager" ,
+ impl_tags = { "family" : "pytorch" , "backend" : "eager" },
+ impl_func = torch_deformable_detr ,
+ dtype = "float32" ,
+)
+
+
+
+
+
+
+
Running deformable_detr benchmark on cuda with 4 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_Q100_H8_E256_L4_P4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 20.095ms 1353.99% 20.095ms 20.095ms 1
+ torch_eager 21.57% 4.703ms 99.97% 21.796ms 21.796ms 0.000us 0.00% 1.485ms 1.485ms 1
+ aten::index 4.62% 1.006ms 16.78% 3.660ms 76.241us 237.342us 15.99% 371.712us 7.744us 48
+ aten::copy_ 4.87% 1.061ms 11.32% 2.469ms 11.275us 365.385us 24.62% 365.385us 1.668us 219
+ aten::mul 5.80% 1.265ms 9.92% 2.163ms 11.267us 294.264us 19.83% 294.264us 1.533us 192
+void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 237.342us 15.99% 237.342us 4.945us 48
+ aten::to 0.67% 145.268us 11.20% 2.441ms 14.275us 0.000us 0.00% 231.015us 1.351us 171
+ aten::_to_copy 2.25% 489.538us 10.53% 2.296ms 18.665us 0.000us 0.00% 231.015us 1.878us 123
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 202.558us 13.65% 202.558us 1.688us 120
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 167.074us 11.26% 167.074us 1.989us 84
+ aten::contiguous 0.40% 86.639us 8.70% 1.898ms 19.769us 0.000us 0.00% 134.370us 1.400us 96
+ aten::clone 0.85% 185.683us 8.31% 1.811ms 18.866us 0.000us 0.00% 134.370us 1.400us 96
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.370us 9.05% 134.370us 1.400us 96
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.390us 7.77% 115.390us 1.202us 96
+ aten::__and__ 0.63% 137.184us 4.49% 979.904us 11.666us 0.000us 0.00% 100.670us 1.198us 84
+ aten::bitwise_and 2.39% 521.552us 3.87% 842.720us 10.032us 100.670us 6.78% 100.670us 1.198us 84
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 100.670us 6.78% 100.670us 1.198us 84
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 85.858us 5.78% 85.858us 1.192us 72
+ aten::sub 2.24% 488.685us 3.68% 801.476us 11.132us 78.884us 5.32% 78.884us 1.096us 72
+ aten::add 1.55% 338.597us 2.59% 564.753us 9.413us 74.082us 4.99% 74.082us 1.235us 60
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 21.803ms
+Self CUDA time total: 1.484ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_Q300_H8_E256_L4_P4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 18.852ms 1182.31% 18.852ms 18.852ms 1
+ torch_eager 20.99% 4.304ms 99.97% 20.495ms 20.495ms 0.000us 0.00% 1.595ms 1.595ms 1
+ aten::index 4.61% 945.020us 16.80% 3.444ms 71.750us 251.167us 15.75% 382.850us 7.976us 48
+ aten::copy_ 5.04% 1.033ms 11.78% 2.414ms 11.023us 364.991us 22.89% 364.991us 1.667us 219
+ aten::mul 5.94% 1.218ms 10.22% 2.095ms 10.911us 359.138us 22.52% 359.138us 1.871us 192
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 267.618us 16.78% 267.618us 2.230us 120
+void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 251.167us 15.75% 251.167us 5.233us 48
+ aten::to 0.59% 120.975us 11.17% 2.290ms 13.390us 0.000us 0.00% 233.308us 1.364us 171
+ aten::_to_copy 2.01% 411.895us 10.58% 2.169ms 17.632us 0.000us 0.00% 233.308us 1.897us 123
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.797us 10.59% 168.797us 2.009us 84
+ aten::contiguous 0.41% 84.261us 8.87% 1.818ms 18.936us 0.000us 0.00% 131.683us 1.372us 96
+ aten::clone 0.84% 172.318us 8.46% 1.734ms 18.058us 0.000us 0.00% 131.683us 1.372us 96
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 131.683us 8.26% 131.683us 1.372us 96
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 118.123us 7.41% 118.123us 1.230us 96
+ aten::__and__ 0.40% 81.276us 4.41% 903.196us 10.752us 0.000us 0.00% 104.833us 1.248us 84
+ aten::bitwise_and 2.46% 504.088us 4.01% 821.920us 9.785us 104.833us 6.57% 104.833us 1.248us 84
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.833us 6.57% 104.833us 1.248us 84
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.190us 6.53% 104.190us 1.447us 72
+ aten::add 1.62% 331.582us 2.72% 557.857us 9.298us 91.491us 5.74% 91.491us 1.525us 60
+ aten::sub 2.17% 445.533us 3.70% 758.959us 10.541us 80.509us 5.05% 80.509us 1.118us 72
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 20.501ms
+Self CUDA time total: 1.595ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_Q100_H8_E256_L4_P4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 18.792ms 1222.95% 18.792ms 18.792ms 1
+ torch_eager 21.02% 4.299ms 99.97% 20.449ms 20.449ms 0.000us 0.00% 1.538ms 1.538ms 1
+ aten::index 4.62% 944.347us 16.78% 3.432ms 71.497us 243.904us 15.87% 378.785us 7.891us 48
+ aten::copy_ 5.14% 1.051ms 11.72% 2.396ms 10.942us 368.961us 24.01% 368.961us 1.685us 219
+ aten::mul 5.96% 1.219ms 10.23% 2.092ms 10.898us 325.334us 21.17% 325.334us 1.694us 192
+void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 243.904us 15.87% 243.904us 5.081us 48
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 234.457us 15.26% 234.457us 1.954us 120
+ aten::to 0.61% 125.558us 11.02% 2.255ms 13.184us 0.000us 0.00% 234.080us 1.369us 171
+ aten::_to_copy 1.92% 392.900us 10.41% 2.129ms 17.309us 0.000us 0.00% 234.080us 1.903us 123
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 169.246us 11.01% 169.246us 2.015us 84
+ aten::contiguous 0.42% 85.559us 8.81% 1.802ms 18.772us 0.000us 0.00% 134.881us 1.405us 96
+ aten::clone 0.80% 164.449us 8.39% 1.717ms 17.880us 0.000us 0.00% 134.881us 1.405us 96
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.881us 8.78% 134.881us 1.405us 96
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.650us 7.53% 115.650us 1.205us 96
+ aten::__and__ 0.39% 78.814us 4.36% 891.116us 10.609us 0.000us 0.00% 101.539us 1.209us 84
+ aten::bitwise_and 2.44% 499.687us 3.97% 812.302us 9.670us 101.539us 6.61% 101.539us 1.209us 84
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 101.539us 6.61% 101.539us 1.209us 84
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 96.065us 6.25% 96.065us 1.334us 72
+ aten::add 1.62% 331.717us 2.71% 554.333us 9.239us 83.900us 5.46% 83.900us 1.398us 60
+ aten::sub 2.21% 451.413us 3.69% 755.537us 10.494us 79.361us 5.16% 79.361us 1.102us 72
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 20.454ms
+Self CUDA time total: 1.537ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_Q300_H8_E256_L4_P4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.115ms 1086.36% 19.115ms 19.115ms 1
+ torch_eager 21.90% 4.346ms 99.98% 19.842ms 19.842ms 0.000us 0.00% 1.761ms 1.761ms 1
+ aten::mul 6.18% 1.226ms 10.60% 2.104ms 10.960us 450.887us 25.63% 450.887us 2.348us 192
+ aten::index 4.92% 977.403us 17.78% 3.530ms 73.537us 282.433us 16.05% 420.451us 8.759us 48
+ aten::copy_ 5.20% 1.031ms 12.05% 2.392ms 10.922us 372.637us 21.18% 372.637us 1.702us 219
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 357.955us 20.34% 357.955us 2.983us 120
+void at::native::index_elementwise_kernel<128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 282.433us 16.05% 282.433us 5.884us 48
+ aten::to 0.65% 128.684us 11.66% 2.315ms 13.536us 0.000us 0.00% 234.619us 1.372us 171
+ aten::_to_copy 2.23% 442.466us 11.01% 2.186ms 17.772us 0.000us 0.00% 234.619us 1.907us 123
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 170.397us 9.68% 170.397us 2.029us 84
+ aten::contiguous 0.44% 87.582us 9.26% 1.837ms 19.140us 0.000us 0.00% 138.018us 1.438us 96
+ aten::clone 0.85% 168.452us 8.82% 1.750ms 18.228us 0.000us 0.00% 138.018us 1.438us 96
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 138.018us 7.84% 138.018us 1.438us 96
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 129.055us 7.33% 129.055us 1.792us 72
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.244us 6.66% 117.244us 1.221us 96
+ aten::add 1.68% 334.180us 2.81% 557.305us 9.288us 113.660us 6.46% 113.660us 1.894us 60
+ aten::__and__ 0.41% 80.800us 4.55% 902.601us 10.745us 0.000us 0.00% 105.726us 1.259us 84
+ aten::bitwise_and 2.56% 508.561us 4.14% 821.801us 9.783us 105.726us 6.01% 105.726us 1.259us 84
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 105.726us 6.01% 105.726us 1.259us 84
+ aten::sub 2.25% 446.108us 3.80% 754.277us 10.476us 82.273us 4.68% 82.273us 1.143us 72
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 19.847ms
+Self CUDA time total: 1.760ms
+
+
+impl wl p50(ms) ok
+torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.39 True
+torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.01 True
+torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.02 True
+torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.02 True
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/deformable_detr/index.html b/deformable_detr/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..8e1ad7fe09342a610e525c8bba679a7f74857855
--- /dev/null
+++ b/deformable_detr/index.html
@@ -0,0 +1,89 @@
+
+
+
+
+
+ Index of /deformable_detr
+
+
+
+
+ Index of /deformable_detr
+
+
+
\ No newline at end of file
diff --git a/deformable_detr/results/artifacts/combine/latency.svg b/deformable_detr/results/artifacts/combine/latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..cfe61b52935bc93cabc302ceb7b7fc02981aa5f7
--- /dev/null
+++ b/deformable_detr/results/artifacts/combine/latency.svg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b38828b5c85834f31812d3f314ebdc3cc2e8481610a6d31b84a4f9b0ad78c0f2
+size 17800
diff --git a/deformable_detr/results/cells/combine.py b/deformable_detr/results/cells/combine.py
new file mode 100644
index 0000000000000000000000000000000000000000..973c7b81cc8cea6af69ab5e32268c4e63e71c8bb
--- /dev/null
+++ b/deformable_detr/results/cells/combine.py
@@ -0,0 +1,26 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+ "HF Kernels Deformable DETR": "UVNOTE_FILE_HF_KERNELS_DEFORMABLE_DETR_BENCHMARK",
+ "PyTorch Deformable DETR": "UVNOTE_FILE_TORCH_DEFORMABLE_DETR_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+ cache_env_map=cache_env_map,
+ output_filename="deformable_detr.jsonl",
+ svg_filename="latency.svg"
+)
\ No newline at end of file
diff --git a/deformable_detr/results/combined_results.html b/deformable_detr/results/combined_results.html
new file mode 100644
index 0000000000000000000000000000000000000000..a985624a2d9079877fe0cd1dcdefc5494402713c
--- /dev/null
+++ b/deformable_detr/results/combined_results.html
@@ -0,0 +1,4805 @@
+
+
+
+
+
+ Deformable DETR Benchmark - Combined Results
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
Deformable DETR Multi-Scale Deformable Attention Benchmarks - Aggregated Results
+
This document combines benchmark results from multiple Deformable DETR implementations.
+
Combined Summary and Visualization
+
+
+
+
+
+
+
+ 2025-10-31T20:14:23.345627
+ image/svg+xml
+
+
+ Matplotlib v3.10.7, https://matplotlib.org/
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ cuda_B1_Q100_H8_E256_L4_P4
+
+
+
+
+
+
+
+
+
+
+
+
+ cuda_B1_Q300_H8_E256_L4_P4
+
+
+
+
+
+
+
+
+
+
+
+
+ cuda_B2_Q100_H8_E256_L4_P4
+
+
+
+
+
+
+
+
+
+
+
+
+ cuda_B2_Q300_H8_E256_L4_P4
+
+
+
+ Workload
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.5
+
+
+
+
+
+
+
+
+
+
+
+
+ 1.0
+
+
+
+
+
+
+
+
+
+
+
+
+ 1.5
+
+
+
+
+
+
+
+
+
+
+
+
+ 2.0
+
+
+
+
+
+
+
+
+
+
+
+
+ 2.5
+
+
+
+
+
+
+
+
+
+
+
+
+ 3.0
+
+
+
+
+
+
+
+
+
+
+
+
+ 3.5
+
+
+
+
+
+
+
+
+
+
+
+
+ 4.0
+
+
+
+ Latency P50 (ms)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Attention Implementation Latency
+
+
+
+
+
+
+
+
+
+
+
+
+ hf_kernels_deformable_detr
+
+
+
+
+
+
+
+
+ torch_eager
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+ "HF Kernels Deformable DETR" : "UVNOTE_FILE_HF_KERNELS_DEFORMABLE_DETR_BENCHMARK" ,
+ "PyTorch Deformable DETR" : "UVNOTE_FILE_TORCH_DEFORMABLE_DETR_BENCHMARK" ,
+}
+
+# Generate combined results with visualization
+generate_combined_results (
+ cache_env_map = cache_env_map ,
+ output_filename = "deformable_detr.jsonl" ,
+ svg_filename = "latency.svg"
+)
+
+
+
+
+
+
+
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ HF Kernels Deformable DETR : /__w/kernels-benchmarks/kernels-benchmarks/benches/deformable_detr/impls/.uvnote/cache/8ab95d7f8f4c6a375b95806e646e4e6f12f0749960d319cf7587215b378ccfa9
+✓ PyTorch Deformable DETR : /__w/kernels-benchmarks/kernels-benchmarks/benches/deformable_detr/impls/.uvnote/cache/9c0a40cf66719a0b460ebb0ca3b41bcaf6c5486905bbf2045a65be2710694dfa
+
+ ✓ Found HF Kernels Deformable DETR
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/deformable_detr/impls/.uvnote/cache/8ab95d7f8f4c6a375b95806e646e4e6f12f0749960d319cf7587215b378ccfa9/deformable_detr.jsonl
+ ✓ Found PyTorch Deformable DETR
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/deformable_detr/impls/.uvnote/cache/9c0a40cf66719a0b460ebb0ca3b41bcaf6c5486905bbf2045a65be2710694dfa/deformable_detr.jsonl
+
+======================================================================
+Summary: 2 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl wl p50(ms) ok
+hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
+hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.05 True
+hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.05 True
+hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
+torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.39 True
+torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.01 True
+torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.02 True
+torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.02 True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 8 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 2
+
+Implementations included:
+ ✓ HF Kernels Deformable DETR
+ ✓ PyTorch Deformable DETR
+
+
+
+
+Installed 37 packages in 216ms
+
+
+
+
Artifacts:
+
latency.svg
+
+
+
+
+
+
+
+ 2025-10-31T20:14:23.345627
+ image/svg+xml
+
+
+ Matplotlib v3.10.7, https://matplotlib.org/
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ cuda_B1_Q100_H8_E256_L4_P4
+
+
+
+
+
+
+
+
+
+
+
+
+ cuda_B1_Q300_H8_E256_L4_P4
+
+
+
+
+
+
+
+
+
+
+
+
+ cuda_B2_Q100_H8_E256_L4_P4
+
+
+
+
+
+
+
+
+
+
+
+
+ cuda_B2_Q300_H8_E256_L4_P4
+
+
+
+ Workload
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.5
+
+
+
+
+
+
+
+
+
+
+
+
+ 1.0
+
+
+
+
+
+
+
+
+
+
+
+
+ 1.5
+
+
+
+
+
+
+
+
+
+
+
+
+ 2.0
+
+
+
+
+
+
+
+
+
+
+
+
+ 2.5
+
+
+
+
+
+
+
+
+
+
+
+
+ 3.0
+
+
+
+
+
+
+
+
+
+
+
+
+ 3.5
+
+
+
+
+
+
+
+
+
+
+
+
+ 4.0
+
+
+
+ Latency P50 (ms)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Attention Implementation Latency
+
+
+
+
+
+
+
+
+
+
+
+
+ hf_kernels_deformable_detr
+
+
+
+
+
+
+
+
+ torch_eager
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/deformable_detr/results/index.html b/deformable_detr/results/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..259f497868f81b516b1f0c893e4974cda430c731
--- /dev/null
+++ b/deformable_detr/results/index.html
@@ -0,0 +1,88 @@
+
+
+
+
+
+ Index of /deformable_detr/results
+
+
+
+
+ Index of /deformable_detr/results
+
+
+
\ No newline at end of file
diff --git a/flash_attn/impls/artifacts/benchmark/attention.jsonl b/flash_attn/impls/artifacts/benchmark/attention.jsonl
index de592afa82ec05256019431f6592e8e321594c40..d381f496ddfa4abddae090de1e302f3856ab3fc4 100644
--- a/flash_attn/impls/artifacts/benchmark/attention.jsonl
+++ b/flash_attn/impls/artifacts/benchmark/attention.jsonl
@@ -1,6 +1,6 @@
-{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9094910000158052, "p50": 0.9113720000186731, "p90": 0.9181919999718957, "mean": 0.9141214000010223, "iqr": 0.007780999965234514, "raw_times": [0.9104110000066612, 0.9094910000158052, 0.9113720000186731, 0.9181919999718957, 0.9211409999920761], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9259819999556385, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9480720000283327, "p50": 0.9496129999888581, "p90": 0.9558429999856344, "mean": 0.952826599996115, "iqr": 0.00735100002202671, "raw_times": [0.9480720000283327, 0.9484919999636077, 0.9496129999888581, 0.9558429999856344, 0.962113000014142], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9554529999604711, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0061439999731192, "p50": 1.0189639999680367, "p90": 1.0215840000000753, "mean": 1.017895999996199, "iqr": 0.0038299999687296804, "raw_times": [1.0189639999680367, 1.025034000008418, 1.0177540000313456, 1.0061439999731192, 1.0215840000000753], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0171540000101231, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0146539999595916, "p50": 1.019383999960155, "p90": 1.0202839999919888, "mean": 1.018159799980367, "iqr": 0.004200999967451935, "raw_times": [1.0202839999919888, 1.0146539999595916, 1.0160830000245369, 1.0203939999655631, 1.019383999960155], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0248149999938505, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1737179999613545, "p50": 1.184327000032681, "p90": 1.1859380000487363, "mean": 1.186479800003326, "iqr": 0.010300000042207103, "raw_times": [1.1756380000065292, 1.1737179999613545, 1.1859380000487363, 1.184327000032681, 1.2127779999673294], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1959679999904438, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-30T15:53:30Z", "run": "30999c763e9c4aa995d0df5078964128", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1702179999701912, "p50": 1.1838479999823903, "p90": 1.1906280000175684, "mean": 1.1843698000006952, "iqr": 0.016700999992735888, "raw_times": [1.1739270000248325, 1.1702179999701912, 1.1838479999823903, 1.1906280000175684, 1.2032280000084938], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1880579999683505, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.208432000112225, "p50": 1.215130999980829, "p90": 1.2198710001030122, "mean": 1.215487200033749, "iqr": 0.006680000069536618, "raw_times": [1.2208109999392036, 1.208432000112225, 1.2198710001030122, 1.2131910000334756, 1.215130999980829], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2240119999660237, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.26713200006634, "p50": 1.2766830000146001, "p90": 1.277253000125711, "mean": 1.2749268000789016, "iqr": 0.004750000016429112, "raw_times": [1.277253000125711, 1.26713200006634, 1.2766830000146001, 1.281063000078575, 1.2725030001092819], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2717629999769997, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2928539999847999, "p50": 1.3003640001443273, "p90": 1.3163240000721999, "mean": 1.3067478000721167, "iqr": 0.01689100008661626, "raw_times": [1.3003640001443273, 1.2928539999847999, 1.2994329999855836, 1.3163240000721999, 1.3247640001736727], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.3026630001604644, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.3232850001259067, "p50": 1.3295650001055037, "p90": 1.3361950000216893, "mean": 1.332684600038192, "iqr": 0.007890999995652237, "raw_times": [1.328304000026037, 1.3361950000216893, 1.3295650001055037, 1.3232850001259067, 1.3460739999118232], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.3245140000890387, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.4790479999646777, "p50": 1.4950690001569455, "p90": 1.4989779999723396, "mean": 1.4914904000306706, "iqr": 0.017840000055002747, "raw_times": [1.5032190001420531, 1.4950690001569455, 1.4790479999646777, 1.4811379999173369, 1.4989779999723396], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.5107090000583412, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.511368999899787, "p50": 1.5117090001695033, "p90": 1.512698999931672, "mean": 1.516499199988175, "iqr": 0.00113999999484804, "raw_times": [1.511368999899787, 1.512698999931672, 1.5117090001695033, 1.511558999936824, 1.5351600000030885], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.5183190000698232, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.0003566741943359375, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py
index 15f02e2ed444e10eba9708f3f69247414b6c962b..8f163bdd918898ced9e858cd4197a85572d7ec8e 100644
--- a/flash_attn/impls/cells/benchmark.py
+++ b/flash_attn/impls/cells/benchmark.py
@@ -4,7 +4,6 @@
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
-# "kernels",
# ]
#
# [tool.uv.sources]
@@ -13,19 +12,18 @@
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-from kernels import get_kernel
-# Load the flash attention 3 kernel
-hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
-
-def hf_flash_attention3(query, key, value):
- return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
+def torch_flash(q, k, v):
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+ with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+ return o.transpose(1, 2).contiguous()
run_benchmark(
kernel_type=KernelTypeEnum.ATTENTION,
- impl_name="hf_kernels_flash_attn3",
- impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
- impl_func=hf_flash_attention3,
+ impl_name="torch_flash_ma",
+ impl_tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
+ impl_func=torch_flash,
)
\ No newline at end of file
diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html
index 501ea20e924b7038a53903e7992899b1953d98eb..1852a8c0fb83365b1e619b7e38354ebd1d45d747 100644
--- a/flash_attn/impls/flash_attention.html
+++ b/flash_attn/impls/flash_attention.html
@@ -4110,7 +4110,7 @@ Cell: nv | 0.21s
| ▶ run
Copy
Raw
-GitHub
+GitHub
@@ -4123,7 +4123,7 @@ Cell: nv | 0.21s
-
Thu Oct 30 15:52:36 2025
+Fri Oct 31 20:13:43 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
@@ -4132,7 +4132,7 @@ Cell: nv | 0.21s
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A 30C P0 75W / 350W | 0MiB / 46068MiB | 11% Default |
+| N/A 43C P0 83W / 350W | 0MiB / 46068MiB | 11% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
@@ -4154,13 +4154,13 @@ Cell: nv | 0.21s
▼ code
▼ output
- ▶ uv-logs
+ ▶ uv-logs
|
-Cell: benchmark | 7.50s
+Cell: benchmark | 3.87s
| ▶ run
Copy
Raw
-GitHub
+GitHub
@@ -4207,29 +4207,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.587ms 102.23% 3.587ms 3.587ms 1
- torch_flash_ma 7.11% 370.236us 47.42% 2.468ms 2.468ms 0.000us 0.00% 3.549ms 3.549ms 1
- aten::scaled_dot_product_attention 0.85% 44.391us 4.44% 231.334us 77.111us 0.000us 0.00% 2.791ms 930.498us 3
- aten::_scaled_dot_product_flash_attention 0.51% 26.381us 3.59% 186.943us 62.314us 0.000us 0.00% 2.791ms 930.498us 3
- aten::_flash_attention_forward 0.76% 39.658us 2.57% 134.002us 44.667us 2.791ms 79.55% 2.791ms 930.498us 3
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.791ms 79.55% 2.791ms 930.498us 3
- aten::contiguous 0.30% 15.641us 34.37% 1.789ms 149.098us 0.000us 0.00% 757.697us 63.141us 12
- aten::clone 0.74% 38.596us 34.07% 1.774ms 147.794us 0.000us 0.00% 757.697us 63.141us 12
- aten::copy_ 1.78% 92.553us 31.63% 1.647ms 137.218us 717.505us 20.45% 757.697us 63.141us 12
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 717.505us 20.45% 717.505us 59.792us 12
- Activity Buffer Request 27.90% 1.452ms 27.90% 1.452ms 1.452ms 40.192us 1.15% 40.192us 40.192us 1
- aten::transpose 1.49% 77.390us 2.00% 104.302us 4.346us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.52% 26.912us 0.52% 26.912us 1.121us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.55% 28.453us 2.13% 110.953us 7.397us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.93% 100.211us 1.93% 100.211us 4.175us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 2.45% 127.363us 2.45% 127.363us 8.491us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_strided 0.32% 16.580us 0.32% 16.580us 5.527us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceGetAttribute 0.05% 2.441us 0.05% 2.441us 0.407us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.18% 9.241us 0.18% 9.241us 3.080us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 52.58% 2.737ms 52.58% 2.737ms 2.737ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.600ms 101.99% 3.600ms 3.600ms 1
+ torch_flash_ma 6.70% 350.157us 46.68% 2.439ms 2.439ms 0.000us 0.00% 3.570ms 3.570ms 1
+ aten::scaled_dot_product_attention 0.81% 42.281us 4.26% 222.626us 74.209us 0.000us 0.00% 2.816ms 938.781us 3
+ aten::_scaled_dot_product_flash_attention 0.52% 27.002us 3.45% 180.345us 60.115us 0.000us 0.00% 2.816ms 938.781us 3
+ aten::_flash_attention_forward 0.79% 41.210us 2.54% 132.453us 44.151us 2.816ms 79.78% 2.816ms 938.781us 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.816ms 79.78% 2.816ms 938.781us 3
+ aten::contiguous 0.29% 15.041us 34.44% 1.800ms 149.962us 0.000us 0.00% 753.884us 62.824us 12
+ aten::clone 0.75% 38.969us 34.15% 1.785ms 148.709us 0.000us 0.00% 753.884us 62.824us 12
+ aten::copy_ 1.73% 90.324us 31.78% 1.661ms 138.388us 713.788us 20.22% 753.884us 62.824us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 713.788us 20.22% 713.788us 59.482us 12
+ Activity Buffer Request 28.08% 1.467ms 28.08% 1.467ms 1.467ms 40.096us 1.14% 40.096us 40.096us 1
+ aten::transpose 1.25% 65.371us 1.68% 87.543us 3.648us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.42% 22.172us 0.42% 22.172us 0.924us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.53% 27.463us 2.06% 107.524us 7.168us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.78% 93.220us 1.78% 93.220us 3.884us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 2.49% 130.035us 2.49% 130.035us 8.669us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.32% 16.730us 0.32% 16.730us 5.577us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.05% 2.690us 0.05% 2.690us 0.448us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.17% 9.000us 0.17% 9.000us 3.000us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 53.32% 2.786ms 53.32% 2.786ms 2.786ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.205ms
-Self CUDA time total: 3.509ms
+Self CPU time total: 5.225ms
+Self CUDA time total: 3.530ms
@@ -4239,29 +4239,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_flash_ma 4.72% 248.136us 41.78% 2.196ms 2.196ms 0.000us 0.00% 3.803ms 3.803ms 1
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.759ms 100.28% 3.759ms 3.759ms 1
- aten::scaled_dot_product_attention 0.51% 26.852us 3.40% 178.734us 59.578us 0.000us 0.00% 2.990ms 996.607us 3
- aten::_scaled_dot_product_flash_attention 0.35% 18.418us 2.89% 151.882us 50.627us 0.000us 0.00% 2.990ms 996.607us 3
- aten::_flash_attention_forward 0.65% 34.063us 2.10% 110.562us 36.854us 2.990ms 79.76% 2.990ms 996.607us 3
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.990ms 79.76% 2.990ms 996.607us 3
- aten::contiguous 0.19% 10.079us 32.75% 1.721ms 143.446us 0.000us 0.00% 813.629us 67.802us 12
- aten::clone 0.54% 28.151us 32.56% 1.711ms 142.606us 0.000us 0.00% 813.629us 67.802us 12
- aten::copy_ 1.97% 103.281us 30.84% 1.621ms 135.084us 758.782us 20.24% 813.629us 67.802us 12
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 758.782us 20.24% 758.782us 63.232us 12
- Activity Buffer Request 27.29% 1.434ms 27.29% 1.434ms 1.434ms 54.847us 1.46% 54.847us 54.847us 1
- aten::transpose 0.98% 51.741us 1.34% 70.423us 2.934us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.36% 18.682us 0.36% 18.682us 0.778us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.38% 19.848us 1.54% 80.939us 5.396us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.45% 76.001us 1.45% 76.001us 3.167us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 2.04% 106.952us 2.04% 106.952us 7.130us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_strided 0.26% 13.850us 0.26% 13.850us 4.617us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceGetAttribute 0.04% 1.860us 0.04% 1.860us 0.310us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.07% 3.760us 0.07% 3.760us 1.253us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 58.22% 3.060ms 58.22% 3.060ms 3.060ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_flash_ma 4.88% 260.255us 42.26% 2.252ms 2.252ms 0.000us 0.00% 3.798ms 3.798ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.753ms 100.28% 3.753ms 3.753ms 1
+ aten::scaled_dot_product_attention 0.49% 25.890us 3.50% 186.735us 62.245us 0.000us 0.00% 2.976ms 991.858us 3
+ aten::_scaled_dot_product_flash_attention 0.33% 17.842us 3.02% 160.845us 53.615us 0.000us 0.00% 2.976ms 991.858us 3
+ aten::_flash_attention_forward 0.74% 39.289us 2.26% 120.363us 40.121us 2.976ms 79.51% 2.976ms 991.858us 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.976ms 79.51% 2.976ms 991.858us 3
+ aten::contiguous 0.20% 10.403us 33.03% 1.760ms 146.680us 0.000us 0.00% 822.042us 68.504us 12
+ aten::clone 0.53% 28.238us 32.84% 1.750ms 145.813us 0.000us 0.00% 822.042us 68.504us 12
+ aten::copy_ 1.51% 80.312us 31.12% 1.659ms 138.210us 766.874us 20.49% 822.042us 68.504us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 766.874us 20.49% 766.874us 63.906us 12
+ Activity Buffer Request 28.02% 1.493ms 28.02% 1.493ms 1.493ms 55.168us 1.47% 55.168us 55.168us 1
+ aten::transpose 0.94% 50.313us 1.27% 67.673us 2.820us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.33% 17.360us 0.33% 17.360us 0.723us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.40% 21.528us 1.56% 83.370us 5.558us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.43% 76.263us 1.43% 76.263us 3.178us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 2.08% 110.943us 2.08% 110.943us 7.396us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.27% 14.621us 0.27% 14.621us 4.874us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.03% 1.781us 0.03% 1.781us 0.297us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.08% 4.011us 0.08% 4.011us 1.337us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 57.74% 3.077ms 57.74% 3.077ms 3.077ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.255ms
-Self CUDA time total: 3.749ms
+Self CPU time total: 5.329ms
+Self CUDA time total: 3.742ms
@@ -4271,29 +4271,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_flash_ma 4.59% 242.054us 41.69% 2.201ms 2.201ms 0.000us 0.00% 3.795ms 3.795ms 1
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.746ms 100.27% 3.746ms 3.746ms 1
- aten::scaled_dot_product_attention 0.50% 26.150us 3.40% 179.413us 59.804us 0.000us 0.00% 2.957ms 985.581us 3
- aten::_scaled_dot_product_flash_attention 0.35% 18.371us 2.90% 153.263us 51.088us 0.000us 0.00% 2.957ms 985.581us 3
- aten::_flash_attention_forward 0.64% 34.041us 2.11% 111.213us 37.071us 2.957ms 79.14% 2.957ms 985.581us 3
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.957ms 79.14% 2.957ms 985.581us 3
- aten::contiguous 0.19% 9.991us 32.85% 1.734ms 144.489us 0.000us 0.00% 838.147us 69.846us 12
- aten::clone 0.52% 27.541us 32.66% 1.724ms 143.657us 0.000us 0.00% 838.147us 69.846us 12
- aten::copy_ 1.47% 77.641us 30.91% 1.632ms 135.987us 779.363us 20.86% 838.147us 69.846us 12
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 779.363us 20.86% 779.363us 64.947us 12
- Activity Buffer Request 27.89% 1.472ms 27.89% 1.472ms 1.472ms 58.784us 1.57% 58.784us 58.784us 1
- aten::transpose 0.96% 50.819us 1.31% 69.110us 2.880us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.35% 18.291us 0.35% 18.291us 0.762us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.38% 20.141us 1.58% 83.392us 5.559us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.49% 78.782us 1.49% 78.782us 3.283us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 1.99% 104.800us 1.99% 104.800us 6.987us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_strided 0.27% 14.320us 0.27% 14.320us 4.773us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceGetAttribute 0.04% 1.870us 0.04% 1.870us 0.312us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.07% 3.720us 0.07% 3.720us 1.240us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 58.31% 3.078ms 58.31% 3.078ms 3.078ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_flash_ma 4.87% 262.676us 41.62% 2.245ms 2.245ms 0.000us 0.00% 3.882ms 3.882ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.834ms 100.29% 3.834ms 3.834ms 1
+ aten::scaled_dot_product_attention 0.50% 26.770us 3.49% 188.015us 62.672us 0.000us 0.00% 3.044ms 1.015ms 3
+ aten::_scaled_dot_product_flash_attention 0.35% 18.803us 2.99% 161.245us 53.748us 0.000us 0.00% 3.044ms 1.015ms 3
+ aten::_flash_attention_forward 0.74% 39.829us 2.21% 119.102us 39.701us 3.044ms 79.61% 3.044ms 1.015ms 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.044ms 79.61% 3.044ms 1.015ms 3
+ aten::contiguous 0.18% 9.451us 32.36% 1.746ms 145.465us 0.000us 0.00% 838.367us 69.864us 12
+ aten::clone 0.54% 28.881us 32.18% 1.736ms 144.678us 0.000us 0.00% 838.367us 69.864us 12
+ aten::copy_ 1.51% 81.201us 30.48% 1.644ms 137.016us 779.615us 20.39% 838.367us 69.864us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 779.615us 20.39% 779.615us 64.968us 12
+ Activity Buffer Request 27.31% 1.473ms 27.31% 1.473ms 1.473ms 58.752us 1.54% 58.752us 58.752us 1
+ aten::transpose 1.01% 54.592us 1.34% 72.471us 3.020us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.33% 17.879us 0.33% 17.879us 0.745us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.37% 20.117us 1.53% 82.751us 5.517us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.41% 76.295us 1.41% 76.295us 3.179us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 2.13% 114.795us 2.13% 114.795us 7.653us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.27% 14.801us 0.27% 14.801us 4.934us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.04% 2.110us 0.04% 2.110us 0.352us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.07% 3.990us 0.07% 3.990us 1.330us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 58.38% 3.149ms 58.38% 3.149ms 3.149ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.279ms
-Self CUDA time total: 3.736ms
+Self CPU time total: 5.395ms
+Self CUDA time total: 3.823ms
@@ -4303,29 +4303,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_flash_ma 4.47% 246.252us 42.66% 2.352ms 2.352ms 0.000us 0.00% 3.878ms 3.878ms 1
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.831ms 100.28% 3.831ms 3.831ms 1
- aten::scaled_dot_product_attention 0.47% 26.180us 3.22% 177.714us 59.238us 0.000us 0.00% 3.035ms 1.012ms 3
- aten::_scaled_dot_product_flash_attention 0.34% 18.934us 2.75% 151.534us 50.511us 0.000us 0.00% 3.035ms 1.012ms 3
- aten::_flash_attention_forward 0.60% 33.169us 1.99% 109.931us 36.644us 3.035ms 79.45% 3.035ms 1.012ms 3
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.035ms 79.45% 3.035ms 1.012ms 3
- aten::contiguous 0.19% 10.269us 34.14% 1.882ms 156.829us 0.000us 0.00% 843.264us 70.272us 12
- aten::clone 0.51% 27.861us 33.95% 1.872ms 155.974us 0.000us 0.00% 843.264us 70.272us 12
- aten::copy_ 1.39% 76.612us 32.27% 1.779ms 148.225us 785.216us 20.55% 843.264us 70.272us 12
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 785.216us 20.55% 785.216us 65.435us 12
- Activity Buffer Request 26.00% 1.433ms 26.00% 1.433ms 1.433ms 58.048us 1.52% 58.048us 58.048us 1
- aten::transpose 0.90% 49.620us 1.24% 68.282us 2.845us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.34% 18.662us 0.34% 18.662us 0.778us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.37% 20.139us 1.52% 83.911us 5.594us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.44% 79.524us 1.44% 79.524us 3.313us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 5.29% 291.664us 5.29% 291.664us 19.444us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_strided 0.25% 13.850us 0.25% 13.850us 4.617us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceGetAttribute 0.03% 1.810us 0.03% 1.810us 0.302us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.07% 3.620us 0.07% 3.620us 1.207us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 57.34% 3.161ms 57.34% 3.161ms 3.161ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_flash_ma 4.61% 261.106us 43.54% 2.469ms 2.469ms 0.000us 0.00% 3.945ms 3.945ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.898ms 100.28% 3.898ms 3.898ms 1
+ aten::scaled_dot_product_attention 0.46% 26.241us 3.40% 192.654us 64.218us 0.000us 0.00% 3.100ms 1.033ms 3
+ aten::_scaled_dot_product_flash_attention 0.34% 19.509us 2.94% 166.413us 55.471us 0.000us 0.00% 3.100ms 1.033ms 3
+ aten::_flash_attention_forward 0.74% 42.081us 2.16% 122.633us 40.878us 3.100ms 79.76% 3.100ms 1.033ms 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.100ms 79.76% 3.100ms 1.033ms 3
+ aten::contiguous 0.20% 11.161us 34.71% 1.968ms 163.994us 0.000us 0.00% 844.704us 70.392us 12
+ aten::clone 0.52% 29.682us 34.51% 1.957ms 163.064us 0.000us 0.00% 844.704us 70.392us 12
+ aten::copy_ 1.45% 82.261us 32.81% 1.860ms 155.026us 786.784us 20.24% 844.704us 70.392us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 786.784us 20.24% 786.784us 65.565us 12
+ Activity Buffer Request 26.26% 1.489ms 26.26% 1.489ms 1.489ms 57.920us 1.49% 57.920us 57.920us 1
+ aten::transpose 0.95% 53.820us 1.26% 71.322us 2.972us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.31% 17.502us 0.31% 17.502us 0.729us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.39% 21.943us 1.53% 86.983us 5.799us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.40% 79.202us 1.40% 79.202us 3.300us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 5.55% 314.487us 5.55% 314.487us 20.966us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.26% 14.830us 0.26% 14.830us 4.943us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.04% 2.010us 0.04% 2.010us 0.335us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 56.46% 3.201ms 56.46% 3.201ms 3.201ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.512ms
-Self CUDA time total: 3.820ms
+Self CPU time total: 5.670ms
+Self CUDA time total: 3.887ms
@@ -4335,29 +4335,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_flash_ma 4.69% 283.303us 42.14% 2.547ms 2.547ms 0.000us 0.00% 4.304ms 4.304ms 1
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.254ms 100.24% 4.254ms 4.254ms 1
- aten::scaled_dot_product_attention 0.82% 49.722us 3.53% 213.285us 71.095us 0.000us 0.00% 3.439ms 1.146ms 3
- aten::_scaled_dot_product_flash_attention 0.34% 20.582us 2.71% 163.563us 54.521us 0.000us 0.00% 3.439ms 1.146ms 3
- aten::_flash_attention_forward 0.62% 37.231us 1.93% 116.771us 38.924us 3.439ms 81.02% 3.439ms 1.146ms 3
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.439ms 81.02% 3.439ms 1.146ms 3
- aten::contiguous 0.18% 10.912us 32.97% 1.993ms 166.068us 0.000us 0.00% 865.695us 72.141us 12
- aten::clone 0.50% 30.059us 32.79% 1.982ms 165.158us 0.000us 0.00% 865.695us 72.141us 12
- aten::copy_ 1.39% 83.902us 31.17% 1.884ms 157.000us 805.439us 18.98% 865.695us 72.141us 12
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 805.439us 18.98% 805.439us 67.120us 12
- Activity Buffer Request 24.08% 1.456ms 24.08% 1.456ms 1.456ms 60.256us 1.42% 60.256us 60.256us 1
- aten::transpose 1.06% 63.793us 1.39% 84.162us 3.507us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.34% 20.369us 0.34% 20.369us 0.849us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.36% 21.791us 1.46% 88.331us 5.889us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.33% 80.570us 1.33% 80.570us 3.357us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 6.09% 368.355us 6.09% 368.355us 24.557us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_strided 0.25% 15.000us 0.25% 15.000us 5.000us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceGetAttribute 0.03% 1.990us 0.03% 1.990us 0.332us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.07% 4.160us 0.07% 4.160us 1.387us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 57.86% 3.497ms 57.86% 3.497ms 3.497ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_flash_ma 5.12% 312.519us 40.82% 2.493ms 2.493ms 0.000us 0.00% 4.416ms 4.416ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.365ms 100.24% 4.365ms 4.365ms 1
+ aten::scaled_dot_product_attention 0.42% 25.922us 3.20% 195.246us 65.082us 0.000us 0.00% 3.547ms 1.182ms 3
+ aten::_scaled_dot_product_flash_attention 0.34% 20.847us 2.77% 169.324us 56.441us 0.000us 0.00% 3.547ms 1.182ms 3
+ aten::_flash_attention_forward 0.72% 44.243us 2.07% 126.303us 42.101us 3.547ms 81.45% 3.547ms 1.182ms 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.547ms 81.45% 3.547ms 1.182ms 3
+ aten::contiguous 0.17% 10.559us 31.73% 1.938ms 161.473us 0.000us 0.00% 869.122us 72.427us 12
+ aten::clone 0.47% 28.763us 31.56% 1.927ms 160.593us 0.000us 0.00% 869.122us 72.427us 12
+ aten::copy_ 1.36% 83.033us 30.01% 1.832ms 152.707us 807.906us 18.55% 869.122us 72.427us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 807.906us 18.55% 807.906us 67.326us 12
+ Activity Buffer Request 24.51% 1.497ms 24.51% 1.497ms 1.497ms 61.216us 1.41% 61.216us 61.216us 1
+ aten::transpose 0.85% 52.195us 1.14% 69.864us 2.911us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.29% 17.669us 0.29% 17.669us 0.736us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.34% 20.921us 1.44% 87.791us 5.853us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.30% 79.270us 1.30% 79.270us 3.303us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 4.55% 277.575us 4.55% 277.575us 18.505us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.27% 16.520us 0.27% 16.520us 5.507us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.03% 1.960us 0.03% 1.960us 0.327us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 59.18% 3.614ms 59.18% 3.614ms 3.614ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 6.045ms
-Self CUDA time total: 4.244ms
+Self CPU time total: 6.107ms
+Self CUDA time total: 4.355ms
@@ -4367,45 +4367,39 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_flash_ma 4.04% 248.485us 39.71% 2.440ms 2.440ms 0.000us 0.00% 4.431ms 4.431ms 1
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.380ms 100.24% 4.380ms 4.380ms 1
- aten::scaled_dot_product_attention 0.42% 25.679us 2.90% 178.082us 59.361us 0.000us 0.00% 3.552ms 1.184ms 3
- aten::_scaled_dot_product_flash_attention 0.29% 17.912us 2.48% 152.403us 50.801us 0.000us 0.00% 3.552ms 1.184ms 3
- aten::_flash_attention_forward 0.56% 34.360us 1.81% 111.452us 37.151us 3.552ms 81.28% 3.552ms 1.184ms 3
-void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.552ms 81.28% 3.552ms 1.184ms 3
- aten::contiguous 0.17% 10.359us 32.01% 1.967ms 163.915us 0.000us 0.00% 879.392us 73.283us 12
- aten::clone 0.45% 27.371us 31.84% 1.957ms 163.052us 0.000us 0.00% 879.392us 73.283us 12
- aten::copy_ 1.33% 81.681us 30.34% 1.864ms 155.367us 818.048us 18.72% 879.392us 73.283us 12
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 818.048us 18.72% 818.048us 68.171us 12
- Activity Buffer Request 23.48% 1.443ms 23.48% 1.443ms 1.443ms 61.344us 1.40% 61.344us 61.344us 1
- aten::transpose 0.84% 51.433us 1.14% 69.901us 2.913us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.30% 18.468us 0.30% 18.468us 0.769us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.32% 19.754us 1.37% 83.993us 5.600us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty 1.26% 77.740us 1.26% 77.740us 3.239us 0.000us 0.00% 0.000us 0.000us 24
- cudaLaunchKernel 5.92% 364.005us 5.92% 364.005us 24.267us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_strided 0.23% 14.381us 0.23% 14.381us 4.794us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceGetAttribute 0.03% 1.840us 0.03% 1.840us 0.307us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.07% 4.180us 0.07% 4.180us 1.393us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 60.29% 3.705ms 60.29% 3.705ms 3.705ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_flash_ma 3.85% 236.256us 38.02% 2.335ms 2.335ms 0.000us 0.00% 4.535ms 4.535ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.485ms 100.25% 4.485ms 4.485ms 1
+ aten::scaled_dot_product_attention 0.43% 26.452us 2.98% 183.275us 61.092us 0.000us 0.00% 3.655ms 1.218ms 3
+ aten::_scaled_dot_product_flash_attention 0.30% 18.620us 2.55% 156.823us 52.274us 0.000us 0.00% 3.655ms 1.218ms 3
+ aten::_flash_attention_forward 0.59% 36.060us 1.88% 115.323us 38.441us 3.655ms 81.69% 3.655ms 1.218ms 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 81.69% 3.655ms 1.218ms 3
+ aten::contiguous 0.16% 9.770us 30.40% 1.867ms 155.567us 0.000us 0.00% 880.065us 73.339us 12
+ aten::clone 0.46% 28.179us 30.24% 1.857ms 154.753us 0.000us 0.00% 880.065us 73.339us 12
+ aten::copy_ 1.36% 83.563us 28.74% 1.765ms 147.054us 819.137us 18.31% 880.065us 73.339us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 819.137us 18.31% 819.137us 68.261us 12
+ Activity Buffer Request 23.24% 1.427ms 23.24% 1.427ms 1.427ms 60.928us 1.36% 60.928us 60.928us 1
+ aten::transpose 0.86% 52.980us 1.16% 71.060us 2.961us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.29% 18.080us 0.29% 18.080us 0.753us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.34% 20.930us 1.37% 83.913us 5.594us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.25% 77.043us 1.25% 77.043us 3.210us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 4.54% 278.990us 4.54% 278.990us 18.599us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.24% 14.661us 0.24% 14.661us 4.887us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.03% 1.978us 0.03% 1.978us 0.330us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.06% 3.901us 0.06% 3.901us 1.300us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 61.98% 3.806ms 61.98% 3.806ms 3.806ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 6.146ms
-Self CUDA time total: 4.370ms
+Self CPU time total: 6.141ms
+Self CUDA time total: 4.474ms
impl wl p50(ms) ok
torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
-torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
-torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
-torch_flash_ma cuda_attn_L384_bfloat16 1.30 True
-torch_flash_ma cuda_attn_L448_bfloat16 1.45 True
-torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
+torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
+torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
+torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
+torch_flash_ma cuda_attn_L448_bfloat16 1.50 True
+torch_flash_ma cuda_attn_L512_bfloat16 1.51 True
-
-
-
-Installed 37 packages in 225ms
-
-
Artifacts:
attention.jsonl
diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html
index 0c6eeb07699e5badcea2a599fa3141678ce81b07..b43f3b2c4b9504821051f29d094124c270a7e0ee 100644
--- a/flash_attn/impls/hf_kernels_flash_attn.html
+++ b/flash_attn/impls/hf_kernels_flash_attn.html
@@ -4104,14 +4104,14 @@ body[data-tool="eraser"] .main-content {
▼ code
▼ output
- ▶ uv-logs
+ ▶ uv-logs
|
-Cell: benchmark | 10.91s
+Cell: benchmark | 5.83s
|
▶ run
Copy
Raw
-
GitHub
-
🤗 HF
+
GitHub
+
🤗 HF
@@ -4161,21 +4161,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 3.74% 162.312us 41.68% 1.808ms 1.808ms 0.000us 0.00% 3.686ms 3.686ms 1
- _flash_attn_9e27194::fwd 1.67% 72.360us 37.94% 1.646ms 548.560us 2.753ms 100.00% 3.686ms 1.229ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.754ms 100.05% 2.754ms 2.754ms 1
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.753ms 100.00% 2.753ms 917.639us 3
- Activity Buffer Request 33.08% 1.435ms 33.08% 1.435ms 1.435ms 933.501us 33.91% 933.501us 933.501us 1
- cudaDeviceGetAttribute 0.12% 5.209us 0.12% 5.209us 0.347us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.42% 18.210us 1.24% 53.790us 17.930us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.82% 35.580us 0.82% 35.580us 11.860us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.58% 25.153us 0.58% 25.153us 2.795us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.26% 11.441us 0.26% 11.441us 3.814us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.99% 42.781us 0.99% 42.781us 14.260us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 58.32% 2.530ms 58.32% 2.530ms 2.530ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn 3.51% 153.413us 41.11% 1.797ms 1.797ms 0.000us 0.00% 3.733ms 3.733ms 1
+ _flash_attn_9e27194::fwd 1.62% 70.702us 37.60% 1.644ms 547.894us 2.785ms 100.00% 3.733ms 1.244ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.786ms 100.05% 2.786ms 2.786ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.785ms 100.00% 2.785ms 928.303us 3
+ Activity Buffer Request 32.92% 1.439ms 32.92% 1.439ms 1.439ms 947.706us 34.03% 947.706us 947.706us 1
+ cudaDeviceGetAttribute 0.11% 4.891us 0.11% 4.891us 0.326us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.37% 16.181us 1.17% 51.061us 17.020us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.80% 34.880us 0.80% 34.880us 11.627us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.59% 25.681us 0.59% 25.681us 2.853us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.26% 11.340us 0.26% 11.340us 3.780us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.93% 40.731us 0.93% 40.731us 13.577us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 58.89% 2.575ms 58.89% 2.575ms 2.575ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.338ms
-Self CUDA time total: 2.753ms
+Self CPU time total: 4.372ms
+Self CUDA time total: 2.785ms
@@ -4185,21 +4185,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.52% 113.464us 37.14% 1.670ms 1.670ms 0.000us 0.00% 3.984ms 3.984ms 1
- _flash_attn_9e27194::fwd 1.10% 49.632us 34.61% 1.557ms 518.855us 2.977ms 100.00% 3.984ms 1.328ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.979ms 100.05% 2.979ms 2.979ms 1
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.977ms 100.00% 2.977ms 992.348us 3
- Activity Buffer Request 31.69% 1.425ms 31.69% 1.425ms 1.425ms 1.007ms 33.82% 1.007ms 1.007ms 1
- cudaDeviceGetAttribute 0.08% 3.769us 0.08% 3.769us 0.251us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.17% 7.560us 0.54% 24.080us 8.027us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.37% 16.520us 0.37% 16.520us 5.507us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.47% 21.170us 0.47% 21.170us 2.352us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.08% 3.820us 0.08% 3.820us 1.273us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.64% 28.910us 0.64% 28.910us 9.637us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 62.86% 2.827ms 62.86% 2.827ms 2.827ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn 1.94% 86.682us 37.50% 1.676ms 1.676ms 0.000us 0.00% 3.929ms 3.929ms 1
+ _flash_attn_9e27194::fwd 1.06% 47.570us 35.56% 1.589ms 529.734us 2.938ms 100.00% 3.929ms 1.310ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.939ms 100.05% 2.939ms 2.939ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.938ms 100.00% 2.938ms 979.209us 3
+ Activity Buffer Request 32.66% 1.460ms 32.66% 1.460ms 1.460ms 991.166us 33.74% 991.166us 991.166us 1
+ cudaDeviceGetAttribute 0.10% 4.450us 0.10% 4.450us 0.297us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.19% 8.440us 0.55% 24.690us 8.230us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.36% 16.250us 0.36% 16.250us 5.417us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.51% 22.872us 0.51% 22.872us 2.541us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.07% 3.350us 0.07% 3.350us 1.117us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.60% 26.611us 0.60% 26.611us 8.870us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 62.50% 2.794ms 62.50% 2.794ms 2.794ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.497ms
-Self CUDA time total: 2.977ms
+Self CPU time total: 4.469ms
+Self CUDA time total: 2.938ms
@@ -4209,21 +4209,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.39% 108.133us 36.58% 1.655ms 1.655ms 0.000us 0.00% 4.040ms 4.040ms 1
- _flash_attn_9e27194::fwd 1.06% 48.029us 34.19% 1.547ms 515.608us 3.016ms 100.00% 4.040ms 1.347ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.017ms 100.05% 3.017ms 3.017ms 1
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.016ms 100.00% 3.016ms 1.005ms 3
- Activity Buffer Request 31.28% 1.415ms 31.28% 1.415ms 1.415ms 1.024ms 33.96% 1.024ms 1.024ms 1
- cudaDeviceGetAttribute 0.09% 4.281us 0.09% 4.281us 0.285us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.16% 7.121us 0.52% 23.411us 7.804us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.36% 16.290us 0.36% 16.290us 5.430us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.49% 22.080us 0.49% 22.080us 2.453us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.08% 3.840us 0.08% 3.840us 1.280us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.66% 29.710us 0.66% 29.710us 9.903us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 63.42% 2.870ms 63.42% 2.870ms 2.870ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn 2.38% 109.313us 36.70% 1.683ms 1.683ms 0.000us 0.00% 4.081ms 4.081ms 1
+ _flash_attn_9e27194::fwd 1.05% 48.167us 34.31% 1.574ms 524.567us 3.048ms 100.00% 4.081ms 1.360ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.049ms 100.05% 3.049ms 3.049ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.048ms 100.00% 3.048ms 1.016ms 3
+ Activity Buffer Request 31.46% 1.443ms 31.46% 1.443ms 1.443ms 1.033ms 33.90% 1.033ms 1.033ms 1
+ cudaDeviceGetAttribute 0.09% 4.231us 0.09% 4.231us 0.282us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.16% 7.250us 0.52% 23.960us 7.987us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.36% 16.710us 0.36% 16.710us 5.570us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.46% 21.300us 0.46% 21.300us 2.367us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.08% 3.561us 0.08% 3.561us 1.187us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.64% 29.473us 0.64% 29.473us 9.824us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 63.30% 2.903ms 63.30% 2.903ms 2.903ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.525ms
-Self CUDA time total: 3.016ms
+Self CPU time total: 4.586ms
+Self CUDA time total: 3.048ms
@@ -4233,21 +4233,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.32% 109.992us 39.04% 1.848ms 1.848ms 0.000us 0.00% 4.060ms 4.060ms 1
- _flash_attn_9e27194::fwd 1.05% 49.564us 36.71% 1.738ms 579.317us 3.035ms 100.00% 4.060ms 1.353ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.037ms 100.05% 3.037ms 3.037ms 1
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.035ms 100.00% 3.035ms 1.012ms 3
- Activity Buffer Request 29.72% 1.407ms 29.72% 1.407ms 1.407ms 1.025ms 33.76% 1.025ms 1.025ms 1
- cudaDeviceGetAttribute 0.08% 3.690us 0.08% 3.690us 0.246us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.16% 7.770us 0.54% 25.380us 8.460us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.37% 17.610us 0.37% 17.610us 5.870us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.47% 22.139us 0.47% 22.139us 2.460us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.08% 3.790us 0.08% 3.790us 1.263us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 4.78% 226.343us 4.78% 226.343us 75.448us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 60.96% 2.886ms 60.96% 2.886ms 2.886ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn 2.13% 103.094us 38.83% 1.884ms 1.884ms 0.000us 0.00% 4.165ms 4.165ms 1
+ _flash_attn_9e27194::fwd 0.99% 47.838us 36.71% 1.781ms 593.521us 3.114ms 100.00% 4.165ms 1.388ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.116ms 100.05% 3.116ms 3.116ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.114ms 100.00% 3.114ms 1.038ms 3
+ Activity Buffer Request 29.59% 1.435ms 29.59% 1.435ms 1.435ms 1.051ms 33.75% 1.051ms 1.051ms 1
+ cudaDeviceGetAttribute 0.08% 3.800us 0.08% 3.800us 0.253us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.16% 7.891us 0.53% 25.811us 8.604us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.37% 17.920us 0.37% 17.920us 5.973us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.45% 21.731us 0.45% 21.731us 2.415us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.08% 3.740us 0.08% 3.740us 1.247us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 4.99% 242.187us 4.99% 242.187us 80.729us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 61.17% 2.967ms 61.17% 2.967ms 2.967ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.734ms
-Self CUDA time total: 3.035ms
+Self CPU time total: 4.851ms
+Self CUDA time total: 3.114ms
@@ -4257,21 +4257,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.11% 110.542us 35.45% 1.860ms 1.860ms 0.000us 0.00% 4.719ms 4.719ms 1
- _flash_attn_9e27194::fwd 0.97% 51.080us 33.34% 1.750ms 583.220us 3.535ms 100.00% 4.719ms 1.573ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.537ms 100.04% 3.537ms 3.537ms 1
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.535ms 100.00% 3.535ms 1.178ms 3
- Activity Buffer Request 27.95% 1.467ms 27.95% 1.467ms 1.467ms 1.184ms 33.49% 1.184ms 1.184ms 1
- cudaDeviceGetAttribute 0.07% 3.640us 0.07% 3.640us 0.243us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.14% 7.520us 0.47% 24.731us 8.244us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.33% 17.211us 0.33% 17.211us 5.737us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.43% 22.670us 0.43% 22.670us 2.519us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.07% 3.800us 0.07% 3.800us 1.267us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.37% 176.824us 3.37% 176.824us 58.941us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 64.55% 3.388ms 64.55% 3.388ms 3.388ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn 2.00% 105.522us 34.61% 1.828ms 1.828ms 0.000us 0.00% 4.806ms 4.806ms 1
+ _flash_attn_9e27194::fwd 0.94% 49.622us 32.62% 1.723ms 574.192us 3.597ms 100.00% 4.806ms 1.602ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.599ms 100.05% 3.599ms 3.599ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.597ms 100.00% 3.597ms 1.199ms 3
+ Activity Buffer Request 27.37% 1.446ms 27.37% 1.446ms 1.446ms 1.209ms 33.59% 1.209ms 1.209ms 1
+ cudaDeviceGetAttribute 0.08% 3.991us 0.08% 3.991us 0.266us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.14% 7.250us 0.47% 24.620us 8.207us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.33% 17.370us 0.33% 17.370us 5.790us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.41% 21.681us 0.41% 21.681us 2.409us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.07% 3.770us 0.07% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.28% 173.384us 3.28% 173.384us 57.795us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 65.39% 3.453ms 65.39% 3.453ms 3.453ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.248ms
-Self CUDA time total: 3.535ms
+Self CPU time total: 5.281ms
+Self CUDA time total: 3.597ms
@@ -4281,41 +4281,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.24% 118.861us 34.58% 1.832ms 1.832ms 0.000us 0.00% 4.834ms 4.834ms 1
- _flash_attn_9e27194::fwd 0.90% 47.900us 32.34% 1.713ms 571.163us 3.618ms 100.00% 4.834ms 1.611ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.619ms 100.04% 3.619ms 3.619ms 1
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.618ms 100.00% 3.618ms 1.206ms 3
- Activity Buffer Request 27.32% 1.448ms 27.32% 1.448ms 1.448ms 1.217ms 33.63% 1.217ms 1.217ms 1
- cudaDeviceGetAttribute 0.07% 3.661us 0.07% 3.661us 0.244us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.14% 7.320us 0.50% 26.231us 8.744us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.36% 18.911us 0.36% 18.911us 6.304us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.40% 21.351us 0.40% 21.351us 2.372us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.08% 4.160us 0.08% 4.160us 1.387us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.07% 162.463us 3.07% 162.463us 54.154us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 65.42% 3.466ms 65.42% 3.466ms 3.466ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn 2.02% 107.892us 33.82% 1.810ms 1.810ms 0.000us 0.00% 4.930ms 4.930ms 1
+ _flash_attn_9e27194::fwd 0.91% 48.918us 31.80% 1.702ms 567.268us 3.687ms 100.00% 4.930ms 1.643ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.689ms 100.04% 3.689ms 3.689ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.687ms 100.00% 3.687ms 1.229ms 3
+ Activity Buffer Request 26.86% 1.437ms 26.86% 1.437ms 1.437ms 1.242ms 33.69% 1.242ms 1.242ms 1
+ cudaDeviceGetAttribute 0.07% 3.881us 0.07% 3.881us 0.259us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.14% 7.591us 0.49% 26.111us 8.704us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.35% 18.520us 0.35% 18.520us 6.173us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.39% 20.640us 0.39% 20.640us 2.293us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.07% 3.561us 0.07% 3.561us 1.187us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.01% 161.306us 3.01% 161.306us 53.769us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 66.18% 3.542ms 66.18% 3.542ms 3.542ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.299ms
-Self CUDA time total: 3.618ms
+Self CPU time total: 5.351ms
+Self CUDA time total: 3.687ms
impl wl p50(ms) ok
-hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.94 True
-hf_kernels_flash_attn cuda_attn_L256_bfloat16 0.99 True
-hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.03 True
-hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True
-hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
+hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
+hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
+hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
+hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
+hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.23 True
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
-
-
-
-Installed 15 packages in 15ms
+
+Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
+Fetching 20 files: 10%|█ | 2/20 [00:01<00:15, 1.19it/s]
+Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 11.87it/s]
-
-
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
-Fetching 20 files: 5%|▌ | 1/20 [00:00<00:02, 8.29it/s]
-Fetching 20 files: 10%|█ | 2/20 [00:06<01:08, 3.82s/it]
-Fetching 20 files: 100%|██████████| 20/20 [00:06<00:00, 3.06it/s]
Artifacts:
attention.jsonl
diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html
index 16d419ea57e2fe2c3ccff8a3a3f19df88ec10363..a1db1794336426cb37d9956eacf119e09a093fa1 100644
--- a/flash_attn/impls/hf_kernels_flash_attn3.html
+++ b/flash_attn/impls/hf_kernels_flash_attn3.html
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
▼ output
▶ uv-logs
|
-Cell: benchmark | 5.55s
+Cell: benchmark | 5.53s
|
▶ run
Copy
Raw
-
GitHub
+
GitHub
🤗 HF
@@ -4160,19 +4160,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 4.02% 170.054us 45.66% 1.931ms 1.931ms 0.000us 0.00% 3.489ms 3.489ms 1
- FlashAttnFunc 2.98% 126.112us 41.64% 1.761ms 586.890us 0.000us 0.00% 3.489ms 1.163ms 3
- _flash_attn3_48fe103_dirty::fwd 1.85% 78.440us 38.65% 1.635ms 544.853us 2.605ms 100.00% 3.489ms 1.163ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.606ms 100.06% 2.606ms 2.606ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.605ms 100.00% 2.605ms 868.221us 3
- Activity Buffer Request 34.45% 1.457ms 34.45% 1.457ms 1.457ms 884.680us 33.97% 884.680us 884.680us 1
- aten::empty 1.07% 45.402us 1.07% 45.402us 7.567us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.29% 12.202us 0.29% 12.202us 4.067us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.99% 41.761us 0.99% 41.761us 13.920us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 54.34% 2.298ms 54.34% 2.298ms 2.298ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn3 3.85% 171.193us 46.01% 2.045ms 2.045ms 0.000us 0.00% 3.614ms 3.614ms 1
+ FlashAttnFunc 3.07% 136.295us 42.15% 1.874ms 624.570us 0.000us 0.00% 3.614ms 1.205ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.94% 86.341us 39.09% 1.737ms 579.138us 2.720ms 100.00% 3.614ms 1.205ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.722ms 100.05% 2.722ms 2.722ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.720ms 100.00% 2.720ms 906.698us 3
+ Activity Buffer Request 34.72% 1.543ms 34.72% 1.543ms 1.543ms 893.600us 32.85% 893.600us 893.600us 1
+ aten::empty 1.07% 47.441us 1.07% 47.441us 7.907us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.31% 13.761us 0.31% 13.761us 4.587us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.05% 46.772us 1.05% 46.772us 15.591us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 53.99% 2.400ms 53.99% 2.400ms 2.400ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.229ms
-Self CUDA time total: 2.605ms
+Self CPU time total: 4.445ms
+Self CUDA time total: 2.720ms
@@ -4182,19 +4182,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.90% 125.133us 41.34% 1.782ms 1.782ms 0.000us 0.00% 3.684ms 3.684ms 1
- FlashAttnFunc 2.10% 90.312us 38.43% 1.657ms 552.206us 0.000us 0.00% 3.684ms 1.228ms 3
- _flash_attn3_48fe103_dirty::fwd 1.24% 53.461us 36.34% 1.566ms 522.102us 2.755ms 100.00% 3.684ms 1.228ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.756ms 100.06% 2.756ms 2.756ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.755ms 100.00% 2.755ms 918.309us 3
- Activity Buffer Request 33.60% 1.448ms 33.60% 1.448ms 1.448ms 929.157us 33.73% 929.157us 929.157us 1
- aten::empty 0.64% 27.380us 0.64% 27.380us 4.563us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.13% 5.449us 0.13% 5.449us 1.816us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.74% 31.802us 0.74% 31.802us 10.601us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 58.66% 2.529ms 58.66% 2.529ms 2.529ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn3 2.41% 104.370us 41.13% 1.784ms 1.784ms 0.000us 0.00% 3.700ms 3.700ms 1
+ FlashAttnFunc 2.00% 86.685us 38.73% 1.679ms 559.738us 0.000us 0.00% 3.700ms 1.233ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.21% 52.631us 36.73% 1.593ms 530.843us 2.768ms 100.00% 3.700ms 1.233ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.769ms 100.06% 2.769ms 2.769ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.768ms 100.00% 2.768ms 922.559us 3
+ Activity Buffer Request 34.10% 1.479ms 34.10% 1.479ms 1.479ms 932.127us 33.68% 932.127us 932.127us 1
+ aten::empty 0.60% 25.981us 0.60% 25.981us 4.330us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.050us 0.12% 5.050us 1.683us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.70% 30.140us 0.70% 30.140us 10.047us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 58.87% 2.553ms 58.87% 2.553ms 2.553ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.310ms
-Self CUDA time total: 2.755ms
+Self CPU time total: 4.336ms
+Self CUDA time total: 2.768ms
@@ -4204,19 +4204,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.81% 125.615us 39.44% 1.762ms 1.762ms 0.000us 0.00% 3.917ms 3.917ms 1
- FlashAttnFunc 2.03% 90.880us 36.63% 1.637ms 545.546us 0.000us 0.00% 3.917ms 1.306ms 3
- _flash_attn3_48fe103_dirty::fwd 1.20% 53.572us 34.59% 1.546ms 515.252us 2.927ms 100.00% 3.917ms 1.306ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.928ms 100.05% 2.928ms 2.928ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.927ms 100.00% 2.927ms 975.593us 3
- Activity Buffer Request 31.96% 1.428ms 31.96% 1.428ms 1.428ms 990.441us 33.84% 990.441us 990.441us 1
- aten::empty 0.63% 27.950us 0.63% 27.950us 4.658us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 5.340us 0.12% 5.340us 1.780us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.68% 30.562us 0.68% 30.562us 10.187us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 60.56% 2.706ms 60.56% 2.706ms 2.706ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn3 2.29% 102.411us 40.10% 1.791ms 1.791ms 0.000us 0.00% 3.875ms 3.875ms 1
+ FlashAttnFunc 2.01% 89.903us 37.81% 1.688ms 562.801us 0.000us 0.00% 3.875ms 1.292ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.18% 52.613us 35.79% 1.599ms 532.834us 2.892ms 100.00% 3.875ms 1.292ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.893ms 100.05% 2.893ms 2.893ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.00% 2.892ms 963.972us 3
+ Activity Buffer Request 33.24% 1.485ms 33.24% 1.485ms 1.485ms 983.097us 33.99% 983.097us 983.097us 1
+ aten::empty 0.58% 25.770us 0.58% 25.770us 4.295us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 4.820us 0.11% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.69% 30.740us 0.69% 30.740us 10.247us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 59.90% 2.675ms 59.90% 2.675ms 2.675ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.469ms
-Self CUDA time total: 2.927ms
+Self CPU time total: 4.466ms
+Self CUDA time total: 2.892ms
@@ -4226,19 +4226,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.73% 126.513us 42.04% 1.948ms 1.948ms 0.000us 0.00% 3.892ms 3.892ms 1
- FlashAttnFunc 2.03% 94.184us 39.31% 1.821ms 607.134us 0.000us 0.00% 3.892ms 1.297ms 3
- _flash_attn3_48fe103_dirty::fwd 1.14% 52.959us 37.28% 1.727ms 575.740us 2.906ms 100.00% 3.892ms 1.297ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.908ms 100.05% 2.908ms 2.908ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.906ms 100.00% 2.906ms 968.728us 3
- Activity Buffer Request 30.69% 1.422ms 30.69% 1.422ms 1.422ms 985.540us 33.91% 985.540us 985.540us 1
- aten::empty 0.63% 29.361us 0.63% 29.361us 4.893us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.11% 5.241us 0.11% 5.241us 1.747us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 4.70% 217.965us 4.70% 217.965us 72.655us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 57.96% 2.685ms 57.96% 2.685ms 2.685ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn3 2.68% 125.944us 42.11% 1.982ms 1.982ms 0.000us 0.00% 3.932ms 3.932ms 1
+ FlashAttnFunc 1.98% 92.983us 39.44% 1.856ms 618.639us 0.000us 0.00% 3.932ms 1.311ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.14% 53.661us 37.46% 1.763ms 587.645us 2.953ms 100.00% 3.932ms 1.311ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.954ms 100.06% 2.954ms 2.954ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.953ms 100.00% 2.953ms 984.176us 3
+ Activity Buffer Request 30.48% 1.434ms 30.48% 1.434ms 1.434ms 979.803us 33.19% 979.803us 979.803us 1
+ aten::empty 0.58% 27.450us 0.58% 27.450us 4.575us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 5.150us 0.11% 5.150us 1.717us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 5.15% 242.396us 5.15% 242.396us 80.799us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 57.89% 2.724ms 57.89% 2.724ms 2.724ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.633ms
-Self CUDA time total: 2.906ms
+Self CPU time total: 4.706ms
+Self CUDA time total: 2.953ms
@@ -4248,19 +4248,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.33% 120.764us 37.09% 1.922ms 1.922ms 0.000us 0.00% 4.645ms 4.645ms 1
- FlashAttnFunc 1.78% 92.240us 34.76% 1.801ms 600.384us 0.000us 0.00% 4.645ms 1.548ms 3
- _flash_attn3_48fe103_dirty::fwd 1.04% 53.829us 32.98% 1.709ms 569.637us 3.482ms 100.00% 4.645ms 1.548ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.483ms 100.04% 3.483ms 3.483ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.482ms 100.00% 3.482ms 1.161ms 3
- Activity Buffer Request 27.80% 1.441ms 27.80% 1.441ms 1.441ms 1.163ms 33.40% 1.163ms 1.163ms 1
- aten::empty 0.54% 28.012us 0.54% 28.012us 4.669us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.10% 5.211us 0.10% 5.211us 1.737us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.50% 181.305us 3.50% 181.305us 60.435us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 62.91% 3.260ms 62.91% 3.260ms 3.260ms 0.000us 0.00% 0.000us 0.000us 1
+ hf_kernels_flash_attn3 2.36% 122.892us 37.59% 1.960ms 1.960ms 0.000us 0.00% 4.622ms 4.622ms 1
+ FlashAttnFunc 1.74% 90.533us 35.23% 1.837ms 612.429us 0.000us 0.00% 4.622ms 1.541ms 3
+ _flash_attn3_48fe103_dirty::fwd 0.97% 50.750us 33.49% 1.747ms 582.252us 3.470ms 100.00% 4.622ms 1.541ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.472ms 100.05% 3.472ms 3.472ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.470ms 100.00% 3.470ms 1.157ms 3
+ Activity Buffer Request 27.49% 1.433ms 27.49% 1.433ms 1.433ms 1.152ms 33.20% 1.152ms 1.152ms 1
+ aten::empty 0.51% 26.592us 0.51% 26.592us 4.432us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.10% 5.060us 0.10% 5.060us 1.687us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 4.43% 230.856us 4.43% 230.856us 76.952us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 62.41% 3.255ms 62.41% 3.255ms 3.255ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.182ms
-Self CUDA time total: 3.482ms
+Self CPU time total: 5.215ms
+Self CUDA time total: 3.470ms
@@ -4270,33 +4270,33 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.54% 130.883us 37.28% 1.924ms 1.924ms 0.000us 0.00% 4.633ms 4.633ms 1
- FlashAttnFunc 1.80% 93.033us 34.74% 1.793ms 597.564us 0.000us 0.00% 4.633ms 1.544ms 3
- _flash_attn3_48fe103_dirty::fwd 1.02% 52.583us 32.94% 1.700ms 566.553us 3.468ms 100.00% 4.633ms 1.544ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.04% 3.469ms 3.469ms 1
+ hf_kernels_flash_attn3 2.32% 120.892us 37.51% 1.951ms 1.951ms 0.000us 0.00% 4.639ms 4.639ms 1
+ FlashAttnFunc 1.74% 90.773us 35.18% 1.830ms 610.133us 0.000us 0.00% 4.639ms 1.546ms 3
+ _flash_attn3_48fe103_dirty::fwd 0.99% 51.351us 33.44% 1.740ms 579.875us 3.468ms 100.00% 4.639ms 1.546ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.05% 3.469ms 3.469ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.468ms 100.00% 3.468ms 1.156ms 3
- Activity Buffer Request 27.99% 1.444ms 27.99% 1.444ms 1.444ms 1.165ms 33.61% 1.165ms 1.165ms 1
- aten::empty 0.56% 29.150us 0.56% 29.150us 4.858us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.10% 5.050us 0.10% 5.050us 1.683us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.27% 168.763us 3.27% 168.763us 56.254us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 62.72% 3.236ms 62.72% 3.236ms 3.236ms 0.000us 0.00% 0.000us 0.000us 1
+ Activity Buffer Request 27.26% 1.418ms 27.26% 1.418ms 1.418ms 1.172ms 33.79% 1.172ms 1.172ms 1
+ aten::empty 0.51% 26.560us 0.51% 26.560us 4.427us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.10% 5.101us 0.10% 5.101us 1.700us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 4.58% 238.367us 4.58% 238.367us 79.456us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 62.49% 3.251ms 62.49% 3.251ms 3.251ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.160ms
+Self CPU time total: 5.202ms
Self CUDA time total: 3.468ms
impl wl p50(ms) ok
-hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True
-hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.95 True
-hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True
-hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
-hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
+hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
+hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
+hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
+hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.03 True
+hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
-Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.35it/s]
-Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.71it/s]
+Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.42it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.84it/s]
Artifacts:
diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html
index a146d1ecfde534d0841c299486870e29ea70f3bb..e6d938b9f4ce572baa96778a2f0d11d329ead530 100644
--- a/flash_attn/impls/mem_efficient_attention.html
+++ b/flash_attn/impls/mem_efficient_attention.html
@@ -4110,7 +4110,7 @@ Cell: benchmark | 3.94s
|
▶ run
Copy
Raw
-
GitHub
+
GitHub
@@ -4159,28 +4159,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff 5.20% 361.468us 33.36% 2.319ms 2.319ms 0.000us 0.00% 5.387ms 5.387ms 1
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.370ms 100.63% 5.370ms 5.370ms 1
- aten::scaled_dot_product_attention 0.48% 33.240us 2.68% 186.333us 62.111us 0.000us 0.00% 4.719ms 1.573ms 3
- aten::_scaled_dot_product_efficient_attention 0.35% 24.389us 2.20% 153.093us 51.031us 0.000us 0.00% 4.719ms 1.573ms 3
- aten::_efficient_attention_forward 0.53% 37.120us 1.50% 104.111us 34.704us 4.719ms 88.44% 4.719ms 1.573ms 3
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.719ms 88.44% 4.719ms 1.573ms 3
- aten::contiguous 0.18% 12.841us 24.53% 1.706ms 189.522us 0.000us 0.00% 667.809us 74.201us 9
- aten::clone 0.46% 31.899us 24.35% 1.693ms 188.095us 0.000us 0.00% 667.809us 74.201us 9
- aten::copy_ 1.13% 78.352us 22.86% 1.589ms 176.604us 617.121us 11.56% 667.809us 74.201us 9
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 617.121us 11.56% 617.121us 68.569us 9
- Activity Buffer Request 20.52% 1.427ms 20.52% 1.427ms 1.427ms 50.688us 0.95% 50.688us 50.688us 1
- aten::transpose 0.98% 68.237us 1.30% 90.074us 3.753us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.31% 21.837us 0.31% 21.837us 0.910us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.25% 17.541us 1.03% 71.521us 7.947us 0.000us 0.00% 0.000us 0.000us 9
- aten::empty 1.19% 82.429us 1.19% 82.429us 3.925us 0.000us 0.00% 0.000us 0.000us 21
- cudaLaunchKernel 1.61% 111.770us 1.61% 111.770us 9.314us 0.000us 0.00% 0.000us 0.000us 12
- cudaStreamIsCapturing 0.05% 3.512us 0.05% 3.512us 1.171us 0.000us 0.00% 0.000us 0.000us 3
- cudaFuncSetAttribute 0.11% 7.660us 0.11% 7.660us 2.553us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 66.64% 4.633ms 66.64% 4.633ms 4.633ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_mem_eff 5.14% 365.276us 32.53% 2.313ms 2.313ms 0.000us 0.00% 5.511ms 5.511ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.492ms 100.58% 5.492ms 5.492ms 1
+ aten::scaled_dot_product_attention 0.43% 30.401us 2.47% 175.534us 58.511us 0.000us 0.00% 4.841ms 1.614ms 3
+ aten::_scaled_dot_product_efficient_attention 0.33% 23.489us 2.04% 145.133us 48.378us 0.000us 0.00% 4.841ms 1.614ms 3
+ aten::_efficient_attention_forward 0.51% 36.572us 1.40% 99.733us 33.244us 4.841ms 88.65% 4.841ms 1.614ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.841ms 88.65% 4.841ms 1.614ms 3
+ aten::contiguous 0.18% 12.851us 23.99% 1.706ms 189.523us 0.000us 0.00% 670.241us 74.471us 9
+ aten::clone 0.46% 32.742us 23.80% 1.693ms 188.095us 0.000us 0.00% 670.241us 74.471us 9
+ aten::copy_ 1.05% 74.801us 22.33% 1.588ms 176.415us 619.776us 11.35% 670.241us 74.471us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 619.776us 11.35% 619.776us 68.864us 9
+ Activity Buffer Request 20.17% 1.434ms 20.17% 1.434ms 1.434ms 50.465us 0.92% 50.465us 50.465us 1
+ aten::transpose 0.93% 66.224us 1.25% 88.644us 3.693us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.32% 22.420us 0.32% 22.420us 0.934us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.25% 17.919us 1.02% 72.382us 8.042us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 1.14% 81.114us 1.14% 81.114us 3.863us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 1.46% 103.973us 1.46% 103.973us 8.664us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.04% 2.960us 0.04% 2.960us 0.987us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.12% 8.310us 0.12% 8.310us 2.770us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 67.47% 4.798ms 67.47% 4.798ms 4.798ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 6.952ms
-Self CUDA time total: 5.336ms
+Self CPU time total: 7.111ms
+Self CUDA time total: 5.460ms
@@ -4190,28 +4190,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff 3.61% 259.378us 29.44% 2.116ms 2.116ms 0.000us 0.00% 5.734ms 5.734ms 1
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.688ms 100.15% 5.688ms 5.688ms 1
- aten::scaled_dot_product_attention 0.27% 19.560us 2.06% 147.832us 49.277us 0.000us 0.00% 5.042ms 1.681ms 3
- aten::_scaled_dot_product_efficient_attention 0.27% 19.340us 1.78% 128.272us 42.757us 0.000us 0.00% 5.042ms 1.681ms 3
- aten::_efficient_attention_forward 0.39% 28.380us 1.18% 84.990us 28.330us 5.042ms 88.79% 5.042ms 1.681ms 3
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.042ms 88.79% 5.042ms 1.681ms 3
- aten::contiguous 0.11% 8.118us 23.11% 1.661ms 184.525us 0.000us 0.00% 691.453us 76.828us 9
- aten::clone 0.32% 22.761us 23.00% 1.653ms 183.623us 0.000us 0.00% 691.453us 76.828us 9
- aten::copy_ 0.95% 68.519us 21.65% 1.556ms 172.887us 636.925us 11.21% 691.453us 76.828us 9
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.925us 11.21% 636.925us 70.769us 9
- Activity Buffer Request 19.69% 1.415ms 19.69% 1.415ms 1.415ms 54.528us 0.96% 54.528us 54.528us 1
- aten::transpose 0.75% 54.034us 1.00% 71.792us 2.991us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.25% 17.758us 0.25% 17.758us 0.740us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.18% 12.992us 1.03% 73.863us 8.207us 0.000us 0.00% 0.000us 0.000us 9
- aten::empty 1.22% 87.512us 1.22% 87.512us 4.167us 0.000us 0.00% 0.000us 0.000us 21
- cudaLaunchKernel 1.35% 96.951us 1.35% 96.951us 8.079us 0.000us 0.00% 0.000us 0.000us 12
- cudaStreamIsCapturing 0.03% 2.489us 0.03% 2.489us 0.830us 0.000us 0.00% 0.000us 0.000us 3
- cudaFuncSetAttribute 0.04% 3.130us 0.04% 3.130us 1.043us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 70.56% 5.071ms 70.56% 5.071ms 5.071ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_mem_eff 3.28% 242.746us 28.00% 2.075ms 2.075ms 0.000us 0.00% 5.933ms 5.933ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.886ms 100.14% 5.886ms 5.886ms 1
+ aten::scaled_dot_product_attention 0.25% 18.240us 1.89% 140.073us 46.691us 0.000us 0.00% 5.241ms 1.747ms 3
+ aten::_scaled_dot_product_efficient_attention 0.25% 18.689us 1.64% 121.833us 40.611us 0.000us 0.00% 5.241ms 1.747ms 3
+ aten::_efficient_attention_forward 0.38% 28.462us 1.09% 81.063us 27.021us 5.241ms 89.17% 5.241ms 1.747ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.241ms 89.17% 5.241ms 1.747ms 3
+ aten::contiguous 0.10% 7.041us 22.26% 1.650ms 183.285us 0.000us 0.00% 691.103us 76.789us 9
+ aten::clone 0.29% 21.342us 22.17% 1.643ms 182.503us 0.000us 0.00% 691.103us 76.789us 9
+ aten::copy_ 0.86% 63.451us 21.24% 1.574ms 174.872us 636.671us 10.83% 691.103us 76.789us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.671us 10.83% 636.671us 70.741us 9
+ Activity Buffer Request 19.50% 1.445ms 19.50% 1.445ms 1.445ms 54.432us 0.93% 54.432us 54.432us 1
+ aten::transpose 0.64% 47.650us 0.87% 64.701us 2.696us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.23% 17.051us 0.23% 17.051us 0.710us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.16% 11.589us 0.64% 47.330us 5.259us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.82% 60.521us 0.82% 60.521us 2.882us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 1.19% 88.044us 1.19% 88.044us 7.337us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.420us 0.03% 2.420us 0.807us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.04% 3.030us 0.04% 3.030us 1.010us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 72.00% 5.335ms 72.00% 5.335ms 5.335ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 7.187ms
-Self CUDA time total: 5.679ms
+Self CPU time total: 7.410ms
+Self CUDA time total: 5.878ms
@@ -4221,28 +4221,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff 3.31% 247.873us 28.16% 2.111ms 2.111ms 0.000us 0.00% 6.014ms 6.014ms 1
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.964ms 100.13% 5.964ms 5.964ms 1
- aten::scaled_dot_product_attention 0.26% 19.681us 1.94% 145.404us 48.468us 0.000us 0.00% 5.300ms 1.767ms 3
- aten::_scaled_dot_product_efficient_attention 0.25% 18.780us 1.68% 125.723us 41.908us 0.000us 0.00% 5.300ms 1.767ms 3
- aten::_efficient_attention_forward 0.40% 29.910us 1.12% 83.752us 27.917us 5.300ms 89.00% 5.300ms 1.767ms 3
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.300ms 89.00% 5.300ms 1.767ms 3
- aten::contiguous 0.10% 7.548us 22.32% 1.673ms 185.921us 0.000us 0.00% 713.444us 79.272us 9
- aten::clone 0.29% 21.851us 22.22% 1.666ms 185.082us 0.000us 0.00% 713.444us 79.272us 9
- aten::copy_ 0.89% 66.441us 21.22% 1.591ms 176.813us 655.331us 11.00% 713.444us 79.272us 9
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 655.331us 11.00% 655.331us 72.815us 9
- Activity Buffer Request 19.37% 1.452ms 19.37% 1.452ms 1.452ms 58.113us 0.98% 58.113us 58.113us 1
- aten::transpose 0.68% 50.773us 0.90% 67.843us 2.827us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.23% 17.070us 0.23% 17.070us 0.711us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.16% 12.290us 0.70% 52.570us 5.841us 0.000us 0.00% 0.000us 0.000us 9
- aten::empty 0.87% 64.980us 0.87% 64.980us 3.094us 0.000us 0.00% 0.000us 0.000us 21
- cudaLaunchKernel 1.28% 96.085us 1.28% 96.085us 8.007us 0.000us 0.00% 0.000us 0.000us 12
- cudaStreamIsCapturing 0.03% 2.520us 0.03% 2.520us 0.840us 0.000us 0.00% 0.000us 0.000us 3
- cudaFuncSetAttribute 0.04% 3.050us 0.04% 3.050us 1.017us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 71.84% 5.386ms 71.84% 5.386ms 5.386ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_mem_eff 3.21% 244.055us 27.47% 2.092ms 2.092ms 0.000us 0.00% 6.130ms 6.130ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.080ms 100.14% 6.080ms 6.080ms 1
+ aten::scaled_dot_product_attention 0.23% 17.641us 1.86% 141.944us 47.315us 0.000us 0.00% 5.414ms 1.805ms 3
+ aten::_scaled_dot_product_efficient_attention 0.25% 19.359us 1.63% 124.303us 41.434us 0.000us 0.00% 5.414ms 1.805ms 3
+ aten::_efficient_attention_forward 0.37% 28.219us 1.06% 80.592us 26.864us 5.414ms 89.17% 5.414ms 1.805ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.414ms 89.17% 5.414ms 1.805ms 3
+ aten::contiguous 0.11% 8.060us 21.81% 1.661ms 184.510us 0.000us 0.00% 716.192us 79.577us 9
+ aten::clone 0.29% 22.431us 21.70% 1.653ms 183.615us 0.000us 0.00% 716.192us 79.577us 9
+ aten::copy_ 0.81% 61.641us 20.75% 1.580ms 175.564us 657.728us 10.83% 716.192us 79.577us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 657.728us 10.83% 657.728us 73.081us 9
+ Activity Buffer Request 19.08% 1.453ms 19.08% 1.453ms 1.453ms 58.464us 0.96% 58.464us 58.464us 1
+ aten::transpose 0.69% 52.203us 0.92% 69.763us 2.907us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.23% 17.560us 0.23% 17.560us 0.732us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.15% 11.581us 0.66% 50.023us 5.558us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.84% 63.785us 0.84% 63.785us 3.037us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 1.14% 86.832us 1.14% 86.832us 7.236us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.250us 0.03% 2.250us 0.750us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.04% 3.260us 0.04% 3.260us 1.087us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 72.53% 5.522ms 72.53% 5.522ms 5.522ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 7.498ms
-Self CUDA time total: 5.956ms
+Self CPU time total: 7.614ms
+Self CUDA time total: 6.072ms
@@ -4252,28 +4252,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff 3.20% 247.803us 30.17% 2.338ms 2.338ms 0.000us 0.00% 6.050ms 6.050ms 1
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.000ms 100.13% 6.000ms 6.000ms 1
- aten::scaled_dot_product_attention 0.37% 28.670us 2.04% 158.093us 52.698us 0.000us 0.00% 5.339ms 1.780ms 3
- aten::_scaled_dot_product_efficient_attention 0.26% 20.220us 1.67% 129.423us 43.141us 0.000us 0.00% 5.339ms 1.780ms 3
- aten::_efficient_attention_forward 0.38% 29.560us 1.08% 83.863us 27.954us 5.339ms 89.10% 5.339ms 1.780ms 3
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.339ms 89.10% 5.339ms 1.780ms 3
- aten::contiguous 0.10% 7.610us 24.36% 1.887ms 209.722us 0.000us 0.00% 711.328us 79.036us 9
- aten::clone 0.28% 21.914us 24.26% 1.880ms 208.876us 0.000us 0.00% 711.328us 79.036us 9
- aten::copy_ 0.87% 67.261us 23.30% 1.806ms 200.640us 653.248us 10.90% 711.328us 79.036us 9
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 653.248us 10.90% 653.248us 72.583us 9
- Activity Buffer Request 18.39% 1.425ms 18.39% 1.425ms 1.425ms 58.080us 0.97% 58.080us 58.080us 1
- aten::transpose 0.68% 52.310us 0.90% 69.650us 2.902us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.22% 17.340us 0.22% 17.340us 0.723us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.16% 12.088us 0.67% 52.209us 5.801us 0.000us 0.00% 0.000us 0.000us 9
- aten::empty 0.84% 64.993us 0.84% 64.993us 3.095us 0.000us 0.00% 0.000us 0.000us 21
- cudaLaunchKernel 4.36% 337.546us 4.36% 337.546us 28.129us 0.000us 0.00% 0.000us 0.000us 12
- cudaStreamIsCapturing 0.03% 2.491us 0.03% 2.491us 0.830us 0.000us 0.00% 0.000us 0.000us 3
- cudaFuncSetAttribute 0.04% 3.020us 0.04% 3.020us 1.007us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 69.83% 5.411ms 69.83% 5.411ms 5.411ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_mem_eff 3.16% 248.365us 29.29% 2.300ms 2.300ms 0.000us 0.00% 6.163ms 6.163ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.114ms 100.14% 6.114ms 6.114ms 1
+ aten::scaled_dot_product_attention 0.24% 19.232us 1.82% 142.774us 47.591us 0.000us 0.00% 5.452ms 1.817ms 3
+ aten::_scaled_dot_product_efficient_attention 0.25% 19.461us 1.57% 123.542us 41.181us 0.000us 0.00% 5.452ms 1.817ms 3
+ aten::_efficient_attention_forward 0.37% 29.029us 1.03% 80.672us 26.891us 5.452ms 89.29% 5.452ms 1.817ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.452ms 89.29% 5.452ms 1.817ms 3
+ aten::contiguous 0.10% 7.931us 23.78% 1.867ms 207.435us 0.000us 0.00% 711.072us 79.008us 9
+ aten::clone 0.30% 23.532us 23.68% 1.859ms 206.554us 0.000us 0.00% 711.072us 79.008us 9
+ aten::copy_ 0.81% 63.779us 22.73% 1.785ms 198.306us 653.792us 10.71% 711.072us 79.008us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 653.792us 10.71% 653.792us 72.644us 9
+ Activity Buffer Request 18.59% 1.459ms 18.59% 1.459ms 1.459ms 57.280us 0.94% 57.280us 57.280us 1
+ aten::transpose 0.62% 48.610us 0.83% 65.130us 2.714us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.21% 16.520us 0.21% 16.520us 0.688us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.16% 12.281us 0.65% 50.702us 5.634us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.80% 62.502us 0.80% 62.502us 2.976us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 3.60% 282.729us 3.60% 282.729us 23.561us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.471us 0.03% 2.471us 0.824us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.05% 4.120us 0.05% 4.120us 1.373us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 70.71% 5.551ms 70.71% 5.551ms 5.551ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 7.749ms
-Self CUDA time total: 5.992ms
+Self CPU time total: 7.851ms
+Self CUDA time total: 6.106ms
@@ -4283,28 +4283,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff 3.22% 253.272us 29.03% 2.283ms 2.283ms 0.000us 0.00% 6.248ms 6.248ms 1
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.196ms 100.13% 6.196ms 6.196ms 1
- aten::scaled_dot_product_attention 0.25% 19.441us 2.25% 176.884us 58.961us 0.000us 0.00% 5.524ms 1.841ms 3
- aten::_scaled_dot_product_efficient_attention 0.26% 20.811us 2.00% 157.443us 52.481us 0.000us 0.00% 5.524ms 1.841ms 3
- aten::_efficient_attention_forward 0.41% 31.883us 1.42% 111.902us 37.301us 5.524ms 89.27% 5.524ms 1.841ms 3
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.524ms 89.27% 5.524ms 1.841ms 3
- aten::contiguous 0.10% 7.580us 22.97% 1.807ms 200.732us 0.000us 0.00% 724.035us 80.448us 9
- aten::clone 0.28% 22.150us 22.88% 1.799ms 199.890us 0.000us 0.00% 724.035us 80.448us 9
- aten::copy_ 0.85% 67.019us 21.94% 1.725ms 191.709us 664.226us 10.73% 724.035us 80.448us 9
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 664.226us 10.73% 664.226us 73.803us 9
- Activity Buffer Request 18.12% 1.425ms 18.12% 1.425ms 1.425ms 59.809us 0.97% 59.809us 59.809us 1
- aten::transpose 0.68% 53.201us 0.91% 71.182us 2.966us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.23% 17.981us 0.23% 17.981us 0.749us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.15% 12.001us 0.65% 51.482us 5.720us 0.000us 0.00% 0.000us 0.000us 9
- aten::empty 0.81% 63.729us 0.81% 63.729us 3.035us 0.000us 0.00% 0.000us 0.000us 21
- cudaLaunchKernel 3.60% 283.426us 3.60% 283.426us 23.619us 0.000us 0.00% 0.000us 0.000us 12
- cudaStreamIsCapturing 0.03% 2.490us 0.03% 2.490us 0.830us 0.000us 0.00% 0.000us 0.000us 3
- cudaFuncSetAttribute 0.04% 2.980us 0.04% 2.980us 0.993us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 70.97% 5.581ms 70.97% 5.581ms 5.581ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_mem_eff 3.01% 243.675us 28.03% 2.272ms 2.272ms 0.000us 0.00% 6.451ms 6.451ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.399ms 100.13% 6.399ms 6.399ms 1
+ aten::scaled_dot_product_attention 0.23% 18.671us 1.77% 143.224us 47.741us 0.000us 0.00% 5.726ms 1.909ms 3
+ aten::_scaled_dot_product_efficient_attention 0.24% 19.652us 1.54% 124.553us 41.518us 0.000us 0.00% 5.726ms 1.909ms 3
+ aten::_efficient_attention_forward 0.35% 28.317us 0.99% 80.642us 26.881us 5.726ms 89.60% 5.726ms 1.909ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.726ms 89.60% 5.726ms 1.909ms 3
+ aten::contiguous 0.10% 7.791us 22.70% 1.840ms 204.460us 0.000us 0.00% 725.025us 80.558us 9
+ aten::clone 0.29% 23.489us 22.61% 1.832ms 203.594us 0.000us 0.00% 725.025us 80.558us 9
+ aten::copy_ 0.81% 65.293us 21.68% 1.757ms 195.223us 664.641us 10.40% 725.025us 80.558us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 664.641us 10.40% 664.641us 73.849us 9
+ Activity Buffer Request 17.77% 1.440ms 17.77% 1.440ms 1.440ms 60.384us 0.94% 60.384us 60.384us 1
+ aten::transpose 0.63% 51.151us 0.85% 69.251us 2.885us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.22% 18.100us 0.22% 18.100us 0.754us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.15% 11.960us 0.64% 51.852us 5.761us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.79% 64.314us 0.79% 64.314us 3.063us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 3.36% 272.117us 3.36% 272.117us 22.676us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.500us 0.03% 2.500us 0.833us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.06% 4.532us 0.06% 4.532us 1.511us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 71.97% 5.833ms 71.97% 5.833ms 5.833ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 7.864ms
-Self CUDA time total: 6.188ms
+Self CPU time total: 8.105ms
+Self CUDA time total: 6.391ms
@@ -4314,37 +4314,37 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff 3.10% 256.636us 27.41% 2.272ms 2.272ms 0.000us 0.00% 6.685ms 6.685ms 1
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.632ms 100.12% 6.632ms 6.632ms 1
- aten::scaled_dot_product_attention 0.23% 18.791us 1.80% 149.483us 49.828us 0.000us 0.00% 5.954ms 1.985ms 3
- aten::_scaled_dot_product_efficient_attention 0.24% 19.642us 1.58% 130.692us 43.564us 0.000us 0.00% 5.954ms 1.985ms 3
- aten::_efficient_attention_forward 0.40% 33.027us 1.05% 86.901us 28.967us 5.954ms 89.88% 5.954ms 1.985ms 3
-fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.954ms 89.88% 5.954ms 1.985ms 3
- aten::contiguous 0.09% 7.531us 21.68% 1.797ms 199.660us 0.000us 0.00% 731.136us 81.237us 9
- aten::clone 0.27% 22.649us 21.59% 1.789ms 198.823us 0.000us 0.00% 731.136us 81.237us 9
- aten::copy_ 0.82% 67.700us 20.66% 1.712ms 190.261us 670.176us 10.12% 731.136us 81.237us 9
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 670.176us 10.12% 670.176us 74.464us 9
- Activity Buffer Request 17.30% 1.434ms 17.30% 1.434ms 1.434ms 60.960us 0.92% 60.960us 60.960us 1
- aten::transpose 0.90% 75.001us 1.12% 92.890us 3.870us 0.000us 0.00% 0.000us 0.000us 24
- aten::as_strided 0.22% 17.889us 0.22% 17.889us 0.745us 0.000us 0.00% 0.000us 0.000us 24
- aten::empty_like 0.15% 12.259us 0.66% 54.410us 6.046us 0.000us 0.00% 0.000us 0.000us 9
- aten::empty 0.81% 67.133us 0.81% 67.133us 3.197us 0.000us 0.00% 0.000us 0.000us 21
- cudaLaunchKernel 2.82% 234.057us 2.82% 234.057us 19.505us 0.000us 0.00% 0.000us 0.000us 12
- cudaStreamIsCapturing 0.03% 2.420us 0.03% 2.420us 0.807us 0.000us 0.00% 0.000us 0.000us 3
- cudaFuncSetAttribute 0.04% 3.430us 0.04% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 72.59% 6.017ms 72.59% 6.017ms 6.017ms 0.000us 0.00% 0.000us 0.000us 1
+ torch_mem_eff 2.88% 242.135us 27.00% 2.269ms 2.269ms 0.000us 0.00% 6.759ms 6.759ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.705ms 100.12% 6.705ms 6.705ms 1
+ aten::scaled_dot_product_attention 0.21% 17.851us 1.72% 144.884us 48.295us 0.000us 0.00% 6.024ms 2.008ms 3
+ aten::_scaled_dot_product_efficient_attention 0.23% 19.591us 1.51% 127.033us 42.344us 0.000us 0.00% 6.024ms 2.008ms 3
+ aten::_efficient_attention_forward 0.34% 28.520us 0.97% 81.532us 27.177us 6.024ms 89.96% 6.024ms 2.008ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.024ms 89.96% 6.024ms 2.008ms 3
+ aten::contiguous 0.10% 8.099us 21.87% 1.838ms 204.242us 0.000us 0.00% 734.178us 81.575us 9
+ aten::clone 0.28% 23.122us 21.78% 1.830ms 203.342us 0.000us 0.00% 734.178us 81.575us 9
+ aten::copy_ 0.74% 62.180us 20.86% 1.753ms 194.799us 672.322us 10.04% 734.178us 81.575us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 672.322us 10.04% 672.322us 74.702us 9
+ Activity Buffer Request 17.19% 1.445ms 17.19% 1.445ms 1.445ms 61.856us 0.92% 61.856us 61.856us 1
+ aten::transpose 0.62% 52.351us 0.83% 70.022us 2.918us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.21% 17.671us 0.21% 17.671us 0.736us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.15% 12.653us 0.64% 53.763us 5.974us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.79% 66.761us 0.79% 66.761us 3.179us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 3.19% 267.907us 3.19% 267.907us 22.326us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.430us 0.03% 2.430us 0.810us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.04% 3.350us 0.04% 3.350us 1.117us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 73.00% 6.134ms 73.00% 6.134ms 6.134ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 8.289ms
-Self CUDA time total: 6.624ms
+Self CPU time total: 8.404ms
+Self CUDA time total: 6.697ms
impl wl p50(ms) ok
-torch_mem_eff cuda_attn_L128_bfloat16 1.81 True
-torch_mem_eff cuda_attn_L256_bfloat16 1.88 True
-torch_mem_eff cuda_attn_L320_bfloat16 1.97 True
-torch_mem_eff cuda_attn_L384_bfloat16 1.97 True
-torch_mem_eff cuda_attn_L448_bfloat16 2.09 True
-torch_mem_eff cuda_attn_L512_bfloat16 2.22 True
+torch_mem_eff cuda_attn_L128_bfloat16 1.85 True
+torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
+torch_mem_eff cuda_attn_L320_bfloat16 1.99 True
+torch_mem_eff cuda_attn_L384_bfloat16 2.07 True
+torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
+torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
Artifacts:
diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html
index fce8d8891e35f4da7c7b93129ab9c68bf413d0a6..c964f0f922939bcdffdf70f7e986e24de2938dac 100644
--- a/flash_attn/impls/sage_attention.html
+++ b/flash_attn/impls/sage_attention.html
@@ -4104,13 +4104,14 @@ body[data-tool="eraser"] .main-content {
▼ code
▼ output
- ▶ uv-logs
+ ▶ uv-logs
|
-Cell: benchmark | 4.12s
+Cell: benchmark | 4.69s
|
▶ run
Copy
Raw
-
GitHub
+
GitHub
+
🤗 HF
@@ -4155,24 +4156,27 @@ Cell: benchmark | 4.12s
Running attention benchmark on cuda with 6 workloads.
impl wl p50(ms) ok
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
-
-Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
-Fetching 11 files: 18%|█▊ | 2/11 [00:00<00:00, 17.35it/s]
-Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 15.18it/s]
-Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 21.06it/s]
+
+
+
+Installed 15 packages in 14ms
+
+
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
+Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 11.73it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 16.12it/s]
Artifacts:
attention.jsonl
diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html
index e1ecdb582681c3ec96cb0b0c54cc3f176cd9f9eb..3e1c781413a91f403396426a1c99ea9ec7673187 100644
--- a/flash_attn/impls/xformers.html
+++ b/flash_attn/impls/xformers.html
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
▼ output
▶ uv-logs
|
-Cell: benchmark | 5.04s
+Cell: benchmark | 33.71s
|
▶ run
Copy
Raw
-
GitHub
+
GitHub
@@ -4158,21 +4158,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 11.46% 506.438us 53.66% 2.372ms 2.372ms 0.000us 0.00% 3.500ms 3.500ms 1
- xformers_flash3::flash_fwd 4.48% 198.083us 41.44% 1.831ms 610.487us 0.000us 0.00% 3.500ms 1.167ms 3
- flash_attn_3::fwd 1.73% 76.649us 36.96% 1.633ms 544.459us 2.610ms 100.00% 3.500ms 1.167ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.612ms 100.06% 2.612ms 2.612ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.610ms 100.00% 2.610ms 870.154us 3
- Activity Buffer Request 33.26% 1.470ms 33.26% 1.470ms 1.470ms 889.248us 34.06% 889.248us 889.248us 1
- aten::empty 0.80% 35.182us 0.80% 35.182us 5.864us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.25% 10.920us 0.25% 10.920us 3.640us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.92% 40.501us 0.92% 40.501us 13.500us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.27% 12.132us 0.77% 33.872us 5.645us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.49% 21.740us 0.49% 21.740us 3.623us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 46.34% 2.048ms 46.34% 2.048ms 2.048ms 0.000us 0.00% 0.000us 0.000us 1
+ xformers_meff 10.98% 488.134us 52.82% 2.349ms 2.349ms 0.000us 0.00% 3.539ms 3.539ms 1
+ xformers_flash3::flash_fwd 4.45% 198.034us 41.02% 1.824ms 608.009us 0.000us 0.00% 3.539ms 1.180ms 3
+ flash_attn_3::fwd 1.81% 80.354us 36.57% 1.626ms 541.997us 2.647ms 100.00% 3.539ms 1.180ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.648ms 100.06% 2.648ms 2.648ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.647ms 100.00% 2.647ms 882.203us 3
+ Activity Buffer Request 32.65% 1.452ms 32.65% 1.452ms 1.452ms 892.891us 33.74% 892.891us 892.891us 1
+ aten::empty 0.78% 34.470us 0.78% 34.470us 5.745us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.26% 11.370us 0.26% 11.370us 3.790us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.08% 47.851us 1.08% 47.851us 15.950us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.28% 12.261us 0.82% 36.420us 6.070us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.54% 24.159us 0.54% 24.159us 4.026us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 47.18% 2.098ms 47.18% 2.098ms 2.098ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.420ms
-Self CUDA time total: 2.610ms
+Self CPU time total: 4.447ms
+Self CUDA time total: 2.647ms
@@ -4182,21 +4182,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 7.25% 318.297us 46.47% 2.042ms 2.042ms 0.000us 0.00% 3.722ms 3.722ms 1
- xformers_flash3::flash_fwd 3.37% 148.131us 38.68% 1.699ms 566.453us 0.000us 0.00% 3.722ms 1.241ms 3
- flash_attn_3::fwd 1.17% 51.450us 35.31% 1.551ms 517.076us 2.780ms 100.00% 3.722ms 1.241ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.782ms 100.05% 2.782ms 2.782ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.780ms 100.00% 2.780ms 926.692us 3
- Activity Buffer Request 32.58% 1.431ms 32.58% 1.431ms 1.431ms 942.244us 33.89% 942.244us 942.244us 1
- aten::empty 0.66% 29.210us 0.66% 29.210us 4.868us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.13% 5.512us 0.13% 5.512us 1.837us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.77% 34.031us 0.77% 34.031us 11.344us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.21% 9.369us 0.54% 23.900us 3.983us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.33% 14.531us 0.33% 14.531us 2.422us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 53.53% 2.351ms 53.53% 2.351ms 2.351ms 0.000us 0.00% 0.000us 0.000us 1
+ xformers_meff 7.22% 318.208us 46.97% 2.070ms 2.070ms 0.000us 0.00% 3.700ms 3.700ms 1
+ xformers_flash3::flash_fwd 3.33% 146.973us 39.20% 1.728ms 575.898us 0.000us 0.00% 3.700ms 1.233ms 3
+ flash_attn_3::fwd 1.20% 53.004us 35.87% 1.581ms 526.907us 2.767ms 100.00% 3.700ms 1.233ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.769ms 100.05% 2.769ms 2.769ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.767ms 100.00% 2.767ms 922.499us 3
+ Activity Buffer Request 33.12% 1.459ms 33.12% 1.459ms 1.459ms 932.857us 33.71% 932.857us 932.857us 1
+ aten::empty 0.65% 28.790us 0.65% 28.790us 4.798us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.13% 5.860us 0.13% 5.860us 1.953us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.76% 33.580us 0.76% 33.580us 11.193us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.21% 9.291us 0.54% 23.901us 3.983us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.33% 14.610us 0.33% 14.610us 2.435us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 53.03% 2.337ms 53.03% 2.337ms 2.337ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.393ms
-Self CUDA time total: 2.780ms
+Self CPU time total: 4.407ms
+Self CUDA time total: 2.767ms
@@ -4206,21 +4206,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 6.91% 309.504us 45.24% 2.025ms 2.025ms 0.000us 0.00% 3.854ms 3.854ms 1
- xformers_flash3::flash_fwd 3.30% 147.756us 37.80% 1.692ms 563.990us 0.000us 0.00% 3.854ms 1.285ms 3
- flash_attn_3::fwd 1.19% 53.048us 34.50% 1.544ms 514.738us 2.875ms 100.00% 3.854ms 1.285ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.877ms 100.05% 2.877ms 2.877ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.875ms 100.00% 2.875ms 958.381us 3
- Activity Buffer Request 31.77% 1.422ms 31.77% 1.422ms 1.422ms 979.266us 34.06% 979.266us 979.266us 1
- aten::empty 0.67% 29.790us 0.67% 29.790us 4.965us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 5.570us 0.12% 5.570us 1.857us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.76% 33.852us 0.76% 33.852us 11.284us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.22% 9.920us 0.53% 23.660us 3.943us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.31% 13.740us 0.31% 13.740us 2.290us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 54.76% 2.451ms 54.76% 2.451ms 2.451ms 0.000us 0.00% 0.000us 0.000us 1
+ xformers_meff 6.87% 306.279us 45.67% 2.036ms 2.036ms 0.000us 0.00% 3.803ms 3.803ms 1
+ xformers_flash3::flash_fwd 3.28% 146.193us 38.29% 1.707ms 568.871us 0.000us 0.00% 3.803ms 1.268ms 3
+ flash_attn_3::fwd 1.22% 54.360us 35.01% 1.560ms 520.140us 2.841ms 100.00% 3.803ms 1.268ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.843ms 100.05% 2.843ms 2.843ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.841ms 100.00% 2.841ms 947.064us 3
+ Activity Buffer Request 32.21% 1.435ms 32.21% 1.435ms 1.435ms 961.848us 33.85% 961.848us 961.848us 1
+ aten::empty 0.68% 30.200us 0.68% 30.200us 5.033us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.560us 0.12% 5.560us 1.853us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.78% 34.863us 0.78% 34.863us 11.621us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.20% 8.808us 0.51% 22.610us 3.768us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.31% 13.802us 0.31% 13.802us 2.300us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 54.33% 2.422ms 54.33% 2.422ms 2.422ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.476ms
-Self CUDA time total: 2.875ms
+Self CPU time total: 4.457ms
+Self CUDA time total: 2.841ms
@@ -4230,21 +4230,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 6.53% 306.895us 47.96% 2.255ms 2.255ms 0.000us 0.00% 3.838ms 3.838ms 1
- xformers_flash3::flash_fwd 3.09% 145.243us 40.94% 1.925ms 641.651us 0.000us 0.00% 3.838ms 1.279ms 3
- flash_attn_3::fwd 1.17% 55.062us 37.85% 1.780ms 593.237us 2.865ms 100.00% 3.838ms 1.279ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.866ms 100.05% 2.866ms 2.866ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.865ms 100.00% 2.865ms 954.931us 3
- Activity Buffer Request 30.23% 1.421ms 30.23% 1.421ms 1.421ms 973.182us 33.97% 973.182us 973.182us 1
- aten::empty 0.63% 29.790us 0.63% 29.790us 4.965us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.11% 5.390us 0.11% 5.390us 1.797us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 5.70% 268.094us 5.70% 268.094us 89.365us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.19% 8.710us 0.49% 22.930us 3.822us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.30% 14.220us 0.30% 14.220us 2.370us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 52.04% 2.447ms 52.04% 2.447ms 2.447ms 0.000us 0.00% 0.000us 0.000us 1
+ xformers_meff 6.67% 311.798us 48.16% 2.253ms 2.253ms 0.000us 0.00% 3.854ms 3.854ms 1
+ xformers_flash3::flash_fwd 3.68% 172.144us 40.98% 1.917ms 638.949us 0.000us 0.00% 3.854ms 1.285ms 3
+ flash_attn_3::fwd 1.19% 55.670us 37.30% 1.745ms 581.568us 2.881ms 100.00% 3.854ms 1.285ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.883ms 100.05% 2.883ms 2.883ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.881ms 100.00% 2.881ms 960.465us 3
+ Activity Buffer Request 30.77% 1.440ms 30.77% 1.440ms 1.440ms 972.603us 33.75% 972.603us 972.603us 1
+ aten::empty 0.63% 29.580us 0.63% 29.580us 4.930us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.801us 0.12% 5.801us 1.934us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 4.58% 214.036us 4.58% 214.036us 71.345us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.19% 9.019us 0.51% 24.051us 4.009us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.32% 15.032us 0.32% 15.032us 2.505us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 51.84% 2.425ms 51.84% 2.425ms 2.425ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 4.702ms
-Self CUDA time total: 2.865ms
+Self CPU time total: 4.678ms
+Self CUDA time total: 2.881ms
@@ -4254,21 +4254,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 6.46% 328.735us 43.31% 2.206ms 2.206ms 0.000us 0.00% 4.477ms 4.477ms 1
- xformers_flash3::flash_fwd 3.06% 155.642us 36.36% 1.852ms 617.231us 0.000us 0.00% 4.477ms 1.492ms 3
- flash_attn_3::fwd 1.12% 56.881us 33.30% 1.696ms 565.350us 3.348ms 100.00% 4.477ms 1.492ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.349ms 100.04% 3.349ms 3.349ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.348ms 100.00% 3.348ms 1.116ms 3
- Activity Buffer Request 27.91% 1.421ms 27.91% 1.421ms 1.421ms 1.129ms 33.72% 1.129ms 1.129ms 1
- aten::empty 0.63% 32.251us 0.63% 32.251us 5.375us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.11% 5.740us 0.11% 5.740us 1.913us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.53% 179.913us 3.53% 179.913us 59.971us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.21% 10.692us 0.50% 25.231us 4.205us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.29% 14.539us 0.29% 14.539us 2.423us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 56.69% 2.887ms 56.69% 2.887ms 2.887ms 0.000us 0.00% 0.000us 0.000us 1
+ xformers_meff 5.88% 304.576us 42.22% 2.188ms 2.188ms 0.000us 0.00% 4.552ms 4.552ms 1
+ xformers_flash3::flash_fwd 2.84% 147.154us 35.91% 1.861ms 620.213us 0.000us 0.00% 4.552ms 1.517ms 3
+ flash_attn_3::fwd 1.02% 52.961us 33.07% 1.713ms 571.161us 3.412ms 100.00% 4.552ms 1.517ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.414ms 100.04% 3.414ms 3.414ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.412ms 100.00% 3.412ms 1.137ms 3
+ Activity Buffer Request 27.95% 1.448ms 27.95% 1.448ms 1.448ms 1.140ms 33.41% 1.140ms 1.140ms 1
+ aten::empty 0.56% 29.272us 0.56% 29.272us 4.879us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 6.180us 0.12% 6.180us 2.060us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.41% 176.624us 3.41% 176.624us 58.875us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.17% 9.052us 0.44% 22.882us 3.814us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.27% 13.830us 0.27% 13.830us 2.305us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 57.78% 2.994ms 57.78% 2.994ms 2.994ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.092ms
-Self CUDA time total: 3.348ms
+Self CPU time total: 5.182ms
+Self CUDA time total: 3.412ms
@@ -4278,37 +4278,83 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 6.24% 320.533us 43.45% 2.233ms 2.233ms 0.000us 0.00% 4.496ms 4.496ms 1
- xformers_flash3::flash_fwd 2.90% 149.124us 36.73% 1.887ms 629.094us 0.000us 0.00% 4.496ms 1.499ms 3
- flash_attn_3::fwd 1.48% 76.290us 33.83% 1.738ms 579.386us 3.368ms 100.00% 4.496ms 1.499ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.369ms 100.05% 3.369ms 3.369ms 1
-void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.368ms 100.00% 3.368ms 1.123ms 3
- Activity Buffer Request 28.33% 1.456ms 28.33% 1.456ms 1.456ms 1.129ms 33.51% 1.129ms 1.129ms 1
- aten::empty 0.58% 29.962us 0.58% 29.962us 4.994us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 6.240us 0.12% 6.240us 2.080us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.31% 169.832us 3.31% 169.832us 56.611us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.21% 10.672us 0.48% 24.873us 4.146us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.28% 14.201us 0.28% 14.201us 2.367us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 56.55% 2.906ms 56.55% 2.906ms 2.906ms 0.000us 0.00% 0.000us 0.000us 1
+ xformers_meff 5.58% 285.697us 41.87% 2.143ms 2.143ms 0.000us 0.00% 4.544ms 4.544ms 1
+ xformers_flash3::flash_fwd 2.91% 148.714us 35.83% 1.834ms 611.255us 0.000us 0.00% 4.544ms 1.515ms 3
+ flash_attn_3::fwd 1.04% 53.311us 32.92% 1.685ms 561.684us 3.402ms 100.00% 4.544ms 1.515ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.403ms 100.05% 3.403ms 3.403ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.402ms 100.00% 3.402ms 1.134ms 3
+ Activity Buffer Request 27.78% 1.422ms 27.78% 1.422ms 1.422ms 1.142ms 33.57% 1.142ms 1.142ms 1
+ aten::empty 0.58% 29.640us 0.58% 29.640us 4.940us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.990us 0.12% 5.990us 1.997us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.40% 174.134us 3.40% 174.134us 58.045us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.17% 8.543us 0.45% 23.191us 3.865us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.29% 14.648us 0.29% 14.648us 2.441us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 58.13% 2.975ms 58.13% 2.975ms 2.975ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 5.138ms
-Self CUDA time total: 3.368ms
+Self CPU time total: 5.118ms
+Self CUDA time total: 3.402ms
impl wl p50(ms) ok
-xformers_meff cuda_attn_L128_bfloat16 0.98 True
-xformers_meff cuda_attn_L256_bfloat16 1.02 True
-xformers_meff cuda_attn_L320_bfloat16 1.07 True
+xformers_meff cuda_attn_L128_bfloat16 1.00 True
+xformers_meff cuda_attn_L256_bfloat16 1.03 True
+xformers_meff cuda_attn_L320_bfloat16 1.08 True
xformers_meff cuda_attn_L384_bfloat16 1.08 True
-xformers_meff cuda_attn_L448_bfloat16 1.24 True
+xformers_meff cuda_attn_L448_bfloat16 1.25 True
xformers_meff cuda_attn_L512_bfloat16 1.23 True
+ Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+Downloading networkx (1.9MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading numpy (16.2MiB)
+Downloading torch (846.9MiB)
+Downloading setuptools (1.1MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading sympy (6.0MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading matplotlib (8.3MiB)
+Downloading triton (148.3MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
Downloading xformers (111.8MiB)
+Downloading pillow (6.7MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading setuptools
+ Downloading networkx
+ Downloading fonttools
+ Downloading pillow
+ Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading sympy
+ Downloading numpy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
Downloading xformers
-Installed 1 package in 13ms
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+Installed 38 packages in 236ms
diff --git a/flash_attn/results/artifacts/combine/latency.svg b/flash_attn/results/artifacts/combine/latency.svg
index 0f51d77bf35af08b6174bc4df17db6fe30a4e491..31d30c5dcfa68f4fc35593a1422ddd982b5374d8 100644
--- a/flash_attn/results/artifacts/combine/latency.svg
+++ b/flash_attn/results/artifacts/combine/latency.svg
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:c6390d15c17c1cced5612c62eb1fb07f7304765d3d9c2c842f634fd3107bbeaf
-size 24786
+oid sha256:520b28a43c879f6952cf0ddeade1438dbb5bd7caf01b6509254a4c68e9446ee6
+size 24783
diff --git a/flash_attn/results/combined_results.html b/flash_attn/results/combined_results.html
index dbe50dede3b447c779732c2f39dd59bfd2928e4f..0682107b1540718d4e870417450dee78797760de 100644
--- a/flash_attn/results/combined_results.html
+++ b/flash_attn/results/combined_results.html
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
- 2025-10-30T15:53:53.940454
+ 2025-10-31T20:14:18.946177
image/svg+xml
@@ -4217,96 +4217,96 @@ body[data-tool="eraser"] .main-content {
-
+
-
+
- 1.0
+ 1.0
-
+
-
+
- 1.2
+ 1.2
-
+
-
+
- 1.4
+ 1.4
-
+
-
+
- 1.6
+ 1.6
-
+
-
+
- 1.8
+ 1.8
-
+
-
+
- 2.0
+ 2.0
-
+
-
+
- 2.2
+ 2.2
@@ -4314,73 +4314,73 @@ body[data-tool="eraser"] .main-content {
-
+
-
-
-
-
-
-
+
+
+
+
+
+
-
+
-
-
-
-
-
+
+
+
+
+
-
+
-
-
-
-
-
-
+
+
+
+
+
+
-
+
-
-
-
-
-
-
+
+
+
+
+
+
-
+
-
-
-
-
-
+
+
+
+
+
@@ -4465,7 +4465,7 @@ body[data-tool="eraser"] .main-content {
▼ output
▶ uv-logs
|
-Cell: combine | 4.26s
+Cell: combine | 4.31s
| ▶ run
Copy
Raw
@@ -4572,47 +4572,47 @@ Summary: 6 found, 0 skipped, 0 missing
COMBINED BENCHMARK SUMMARY
impl wl p50(ms) ok
-hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.94 True
-hf_kernels_flash_attn cuda_attn_L256_bfloat16 0.99 True
-hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.03 True
-hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True
-hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
+hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
+hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
+hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
+hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
+hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.23 True
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
-hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True
-hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.95 True
-hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True
-hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
-hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
+hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
+hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
+hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
+hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.03 True
+hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
- Error: module 'sage_attention_cb34d81dafacbad9' has no attribute 'fwd'
+ Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
-torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
-torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
-torch_flash_ma cuda_attn_L384_bfloat16 1.30 True
-torch_flash_ma cuda_attn_L448_bfloat16 1.45 True
-torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
-torch_mem_eff cuda_attn_L128_bfloat16 1.81 True
-torch_mem_eff cuda_attn_L256_bfloat16 1.88 True
-torch_mem_eff cuda_attn_L320_bfloat16 1.97 True
-torch_mem_eff cuda_attn_L384_bfloat16 1.97 True
-torch_mem_eff cuda_attn_L448_bfloat16 2.09 True
-torch_mem_eff cuda_attn_L512_bfloat16 2.22 True
-xformers_meff cuda_attn_L128_bfloat16 0.98 True
-xformers_meff cuda_attn_L256_bfloat16 1.02 True
-xformers_meff cuda_attn_L320_bfloat16 1.07 True
+torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
+torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
+torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
+torch_flash_ma cuda_attn_L448_bfloat16 1.50 True
+torch_flash_ma cuda_attn_L512_bfloat16 1.51 True
+torch_mem_eff cuda_attn_L128_bfloat16 1.85 True
+torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
+torch_mem_eff cuda_attn_L320_bfloat16 1.99 True
+torch_mem_eff cuda_attn_L384_bfloat16 2.07 True
+torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
+torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
+xformers_meff cuda_attn_L128_bfloat16 1.00 True
+xformers_meff cuda_attn_L256_bfloat16 1.03 True
+xformers_meff cuda_attn_L320_bfloat16 1.08 True
xformers_meff cuda_attn_L384_bfloat16 1.08 True
-xformers_meff cuda_attn_L448_bfloat16 1.24 True
+xformers_meff cuda_attn_L448_bfloat16 1.25 True
xformers_meff cuda_attn_L512_bfloat16 1.23 True
GENERATING COMBINED VISUALIZATION
@@ -4637,7 +4637,7 @@ Implementations included:
-Installed 37 packages in 190ms
+Installed 37 packages in 225ms
@@ -4650,7 +4650,7 @@ Installed 37 packages in 190ms
- 2025-10-30T15:53:53.940454
+ 2025-10-31T20:14:18.946177
image/svg+xml
@@ -4760,96 +4760,96 @@ Installed 37 packages in 190ms
-
+
-
+
- 1.0
+ 1.0
-
+
-
+
- 1.2
+ 1.2
-
+
-
+
- 1.4
+ 1.4
-
+
-
+
- 1.6
+ 1.6
-
+
-
+
- 1.8
+ 1.8
-
+
-
+
- 2.0
+ 2.0
-
+
-
+
- 2.2
+ 2.2
@@ -4857,73 +4857,73 @@ Installed 37 packages in 190ms
-
+
-
-
-
-
-
-
+
+
+
+
+
+
-
+
-
-
-
-
-
+
+
+
+
+
-
+
-
-
-
-
-
-
+
+
+
+
+
+
-
+
-
-
-
-
-
-
+
+
+
+
+
+
-
+
-
-
-
-
-
+
+
+
+
+
diff --git a/index.html b/index.html
index 6d43c0f1f0a8b42c583cb3e0d6a059916ac01ccc..11cdf1eef85f4dda68d9e978af612e8aae0078bb 100644
--- a/index.html
+++ b/index.html
@@ -4097,35 +4097,54 @@ body[data-tool="eraser"] .main-content {
-
KERNELS COMMUNITY BENCHMARKS
+
+
+
+
KERNELS COMMUNITY BENCHMARKS
This report aggregates latency and performance benchmarks across core model components.
Each section includes:
- A latency visualization
- Links to detailed implementation benchmarks
TABLE OF CONTENTS
+
RUN YOURSELF
+
To run the benchmarks locally, clone the repository and use uvx to build and run the benchmarks:
+
Note: benches are made to run on a machine with a compatible NVIDIA GPU and CUDA installed; other hardware may not work as expected.
+
git clone https://github.com/huggingface/kernels-benchmarks.git
+cd kernels-benchmarks
+uvx https://github.com/drbh/uvnote.git build benches
+
+
METHODOLOGY
-
Each benchmark is run with the Kernels Benchmarking Framework and follows these principles:
+
Each benchmark is run with the
+Kernels Benchmarking Framework and follows these principles:
- a reference implementation (usually PyTorch native) is included for baseline comparison
- multiple input sizes and batch sizes are tested to reflect real-world usage
- runs are repeatable via python virtual environments and documented dependencies
- results are collected and visualized using standardized scripts
-
+
+
BENCHMARKS
Note: Latency values are measured in milliseconds (ms). Lower values indicate better performance.
-
LAYER NORMALIZATION
+
ACTIVATION FUNCTIONS
-
+
@@ -4133,32 +4152,40 @@ Each section includes:
Implementation
Description
+Source
+HF
+Bench
-HF Kernels Layer Norm
-HuggingFace kernels implementation
+HF Kernels SwiGLU
+HuggingFace kernels SwiGLU implementation
+GitHub
+HF
+Bench
-PyTorch Layer Norm
-PyTorch native implementation
+PyTorch SwiGLU
+PyTorch native SwiGLU implementation
+-
+-
+Bench
-
-
Explore Full Bench
-
ROTARY POSITION EMBEDDINGS
+
FLASH ATTENTION
-
+
@@ -4166,31 +4193,68 @@ Each section includes:
Implementation
Description
+Source
+HF
+Bench
-HF Kernels Rotary
-HuggingFace kernels implementation
+Flash Attention
+Torch SDPA Flash Attention implementation
+-
+-
+Bench
-PyTorch Rotary
-PyTorch native implementation
+HF Kernels Flash Attention 2
+HuggingFace kernels Flash Attention
+GitHub
+HF
+Bench
+
+
+HF Kernels Flash Attention 3
+HuggingFace kernels Flash Attention 3
+GitHub
+HF
+Bench
+
+
+Memory Efficient Attention
+Memory efficient attention implementation
+
+-
+Bench
+
+
+Sage Attention
+Sage attention implementation
+
+HF
+Bench
+
+
+xFormers
+xFormers attention implementation
+GitHub
+-
+Bench
Explore Full Bench
-
FLASH ATTENTION
+
DEFORMABLE DETR
-
+
@@ -4198,38 +4262,72 @@ Each section includes:
Implementation
Description
+Source
+HF
+Bench
-Flash Attention
-Flash Attention implementation
-
-
-HF Kernels Flash Attention
-HuggingFace kernels Flash Attention
+HF Kernels Deformable DETR
+HuggingFace kernels Deformable DETR implementation
+GitHub
+HF
+Bench
-HF Kernels Flash Attention 3
-HuggingFace kernels Flash Attention 3
+PyTorch Deformable DETR
+PyTorch native Deformable DETR implementation
+-
+-
+Bench
+
+
+
+
+ Explore Full Bench
+
+
+
+
+
OPENAI-STYLE MOE
+
+
+
+
+
+
-Memory Efficient Attention
-Memory efficient attention implementation
+Implementation
+Description
+Source
+HF
+Bench
+
+
-Sage Attention
-Sage attention implementation
+GptOssExperts
+GPT OSS reference OpenAI-style MoE
+
+
+Bench
-xFormers
-xFormers attention implementation
+Binned PyTorch
+Binned PyTorch OpenAI-style MoE implementation
+-
+-
+Bench
Explore Full Bench
@@ -4246,16 +4344,25 @@ Each section includes:
Implementation
Description
+Source
+HF
+Bench
HF Kernels Causal Conv1D
HuggingFace kernels implementation
+GitHub
+HF
+Bench
PyTorch Causal Conv1D
PyTorch native implementation
+-
+-
+Bench
@@ -4268,9 +4375,9 @@ Each section includes:
-
ACTIVATION FUNCTIONS
+
ROTARY POSITION EMBEDDINGS
-
+
@@ -4278,28 +4385,77 @@ Each section includes:
Implementation
Description
+Source
+HF
+Bench
-HF Kernels SwiGLU
-HuggingFace kernels SwiGLU implementation
+HF Kernels Rotary
+HuggingFace kernels implementation
+GitHub
+HF
+Bench
-PyTorch SwiGLU
-PyTorch native SwiGLU implementation
+PyTorch Rotary
+PyTorch native implementation
+-
+-
+Bench
Explore Full Bench
+
LAYER NORMALIZATION
+
+
+
+
+
+
+
+Implementation
+Description
+Source
+HF
+Bench
+
+
+
+
+HF Kernels Layer Norm
+HuggingFace kernels implementation
+GitHub
+HF
+Bench
+
+
+PyTorch Layer Norm
+PyTorch native implementation
+-
+-
+Bench
+
+
+
+
+
+ Explore Full Bench
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
Binned PyTorch - OpenAI-style MoE
+
GPU Info
+
+
+
+
+
import subprocess
+print ( subprocess . run ([ "nvidia-smi" ], capture_output = True , text = True ) . stdout )
+
+
+
+
+
+
+
Fri Oct 31 20:00:34 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
+|-----------------------------------------+------------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+========================+======================|
+| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+| N/A 34C P0 81W / 350W | 0MiB / 46068MiB | 18% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=========================================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+
+
OpenAI-style MoE Benchmark (Binned PyTorch)
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum , run_benchmark
+
+
+def binned_gather ( x , indices , bins , expert_capacity , top_k ):
+ E , H = bins . shape [ 0 ], x . shape [ 1 ]
+ out = torch . zeros (( E , expert_capacity , H ), device = x . device , dtype = x . dtype )
+ for e in range ( E ):
+ start = 0 if e == 0 else bins [ e - 1 ]
+ end = bins [ e ]
+ n = min ( end - start , expert_capacity )
+ for i in range ( n ):
+ flat_pos = indices [ start + i ]
+ tok = flat_pos // top_k
+ out [ e , i ] = x [ tok ]
+ return out
+
+
+def binned_scatter ( x , indices , weights , bins , expert_capacity , top_k ):
+ E , C , H = x . shape
+ N = indices . shape [ 0 ] // top_k
+ out = torch . zeros (( N , top_k , H ), dtype = x . dtype , device = x . device )
+ for e in range ( E ):
+ start = 0 if e == 0 else bins [ e - 1 ]
+ end = bins [ e ]
+ n = end - start
+ if n == 0 :
+ continue
+ take = min ( n , expert_capacity )
+ for i in range ( take ):
+ flat_pos = indices [ start + i ] # flattened (token, slot)
+ tok = flat_pos // top_k
+ slot = flat_pos % top_k
+ scale = weights [ flat_pos ] if weights is not None else 1.0
+ out [ tok , slot ] = x [ e , i ] * scale
+ return out . sum ( dim = 1 )
+
+
+def sort_tokens_by_expert ( router_indices , num_experts ):
+ flat_indices = router_indices . flatten ()
+ sorted_values , sorted_indices = torch . sort ( flat_indices )
+ tokens_per_expert = torch . bincount ( sorted_values , minlength = num_experts )
+ bins = torch . cumsum ( tokens_per_expert , dim = 0 )
+ return sorted_indices , sorted_values , bins , tokens_per_expert
+
+
+def binned_experts_ref (
+ hidden_states ,
+ router_indices ,
+ routing_weights ,
+ gate_up_proj ,
+ gate_up_proj_bias ,
+ down_proj ,
+ down_proj_bias ,
+ expert_capacity ,
+):
+ B , S , H = hidden_states . shape
+ E , K = routing_weights . shape [ 2 ], router_indices . shape [ 1 ]
+
+ indices , _ , bins , _ = sort_tokens_by_expert ( router_indices , E )
+ x = binned_gather ( hidden_states . view ( - 1 , H ), indices , bins , expert_capacity , K )
+
+ gate_up = torch . bmm ( x , gate_up_proj ) + gate_up_proj_bias [ ... , None , :]
+ gate , up = gate_up [ ... , :: 2 ], gate_up [ ... , 1 :: 2 ]
+
+ # clamp to limit
+ limit = 7.0
+ gate = gate . clamp ( min = None , max = limit )
+ up = up . clamp ( min =- limit , max = limit )
+
+ glu = gate * torch . sigmoid ( gate * 1.702 )
+ x = ( up + 1 ) * glu
+ x = torch . bmm ( x , down_proj ) + down_proj_bias [ ... , None , :]
+
+ # build routing weights aligned to (token, slot)
+ flat_dense = routing_weights . view ( - 1 , E ) # [B*S, E]
+ flat_router = router_indices . view ( - 1 , K ) # [B*S, K]
+ selected = torch . gather ( flat_dense , 1 , flat_router ) . reshape ( - 1 ) # [B*S*K]
+
+ # scatter back
+ y = binned_scatter ( x , indices , selected , bins , expert_capacity , K ) # [B*S, H]
+
+ return y . view ( B , S , H )
+
+
+def binned_torch_openai_moe (
+ hidden_states ,
+ router_indices ,
+ routing_weights ,
+ gate_up_proj ,
+ gate_up_proj_bias ,
+ down_proj ,
+ down_proj_bias ,
+):
+ """
+ Binned PyTorch implementation of OpenAI-style MoE.
+ Sorts tokens by expert assignment for more efficient batched processing.
+ """
+ B , S = hidden_states . shape [ 0 ], hidden_states . shape [ 1 ]
+ K = router_indices . shape [ 1 ]
+
+ # Set expert_capacity to a reasonable value (max tokens per expert)
+ # Use 2x the average to handle imbalance
+ expert_capacity = ( B * S * K * 2 ) // routing_weights . shape [ 2 ]
+
+ return binned_experts_ref (
+ hidden_states ,
+ router_indices ,
+ routing_weights ,
+ gate_up_proj ,
+ gate_up_proj_bias ,
+ down_proj ,
+ down_proj_bias ,
+ expert_capacity ,
+ )
+
+
+run_benchmark (
+ kernel_type = KernelTypeEnum . OPENAI_MOE ,
+ impl_name = "binned_torch" ,
+ impl_tags = { "family" : "pytorch" , "backend" : "eager" },
+ impl_func = binned_torch_openai_moe ,
+ dtype = "float32" ,
+)
+
+
+
+
+
+
+
Running openai_moe benchmark on cuda with 8 workloads.
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B1_S512_E2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 906.550ms 1808.50% 906.550ms 906.550ms 1
+ binned_torch 25.29% 229.728ms 100.00% 908.308ms 908.308ms 0.000us 0.00% 50.129ms 50.129ms 1
+ aten::item 1.81% 16.434ms 25.66% 233.033ms 15.186us 0.000us 0.00% 15.809ms 1.030us 15345
+ aten::_local_scalar_dense 6.08% 55.189ms 23.85% 216.599ms 14.115us 15.808ms 31.54% 15.809ms 1.030us 15345
+ Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 15.808ms 31.54% 15.808ms 1.030us 15345
+ aten::bmm 0.02% 187.925us 0.02% 226.636us 37.773us 7.688ms 15.34% 7.688ms 1.281ms 6
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.688ms 15.34% 7.688ms 1.281ms 6
+ aten::floor_divide 5.37% 48.789ms 13.13% 119.247ms 19.409us 7.554ms 15.07% 7.554ms 1.230us 6144
+ aten::copy_ 3.71% 33.699ms 9.08% 82.451ms 13.394us 6.606ms 13.18% 6.607ms 1.073us 6156
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.602ms 13.17% 6.602ms 1.073us 6153
+ aten::mul 3.08% 27.972ms 5.49% 49.893ms 16.194us 4.718ms 9.41% 4.718ms 1.531us 3081
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.471ms 8.92% 4.471ms 1.456us 3072
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.032ms 8.04% 4.032ms 1.312us 3072
+ aten::remainder 3.03% 27.567ms 4.66% 42.309ms 13.772us 3.722ms 7.42% 3.722ms 1.212us 3072
+ aten::add 2.91% 26.436ms 4.87% 44.207ms 14.575us 3.546ms 7.07% 3.546ms 1.169us 3033
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.524ms 7.03% 3.524ms 1.147us 3072
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.156ms 6.30% 3.156ms 1.042us 3030
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.964ms 3.92% 1.964ms 1.279us 1536
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.758ms 3.51% 1.758ms 1.145us 1536
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 286.305us 0.57% 286.305us 47.718us 6
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 908.315ms
+Self CUDA time total: 50.127ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B1_S512_E4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 939.657ms 1760.51% 939.657ms 939.657ms 1
+ binned_torch 24.72% 232.366ms 100.00% 940.175ms 940.175ms 0.000us 0.00% 53.379ms 53.379ms 1
+ aten::item 1.65% 15.471ms 26.56% 249.752ms 14.748us 0.000us 0.00% 17.339ms 1.024us 16935
+ aten::_local_scalar_dense 6.16% 57.893ms 24.92% 234.282ms 13.834us 17.337ms 32.48% 17.339ms 1.024us 16935
+ Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 17.337ms 32.48% 17.337ms 1.024us 16935
+ aten::bmm 0.02% 191.684us 0.02% 230.777us 38.463us 7.882ms 14.77% 7.882ms 1.314ms 6
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.882ms 14.77% 7.882ms 1.314ms 6
+ aten::floor_divide 5.10% 47.974ms 12.37% 116.337ms 18.935us 7.540ms 14.13% 7.541ms 1.227us 6144
+ aten::copy_ 3.80% 35.738ms 9.00% 84.586ms 13.740us 6.593ms 12.35% 6.595ms 1.071us 6156
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.590ms 12.35% 6.590ms 1.071us 6153
+ aten::add 4.16% 39.066ms 7.01% 65.874ms 14.342us 5.113ms 9.58% 5.113ms 1.113us 4593
+ aten::mul 2.92% 27.472ms 5.20% 48.883ms 15.866us 4.715ms 8.83% 4.715ms 1.530us 3081
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.472ms 8.38% 4.472ms 1.456us 3072
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.021ms 7.53% 4.021ms 1.309us 3072
+ aten::remainder 2.73% 25.664ms 4.27% 40.147ms 13.069us 3.707ms 6.95% 3.707ms 1.207us 3072
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.519ms 6.59% 3.519ms 1.146us 3072
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.178ms 5.95% 3.178ms 1.049us 3030
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.958ms 3.67% 1.958ms 1.275us 1536
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.749ms 3.28% 1.749ms 1.139us 1536
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.537ms 2.88% 1.537ms 0.985us 1560
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 940.182ms
+Self CUDA time total: 53.374ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B1_S1024_E2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.751s 1703.41% 1.751s 1.751s 1
+ binned_torch 24.63% 431.727ms 100.00% 1.753s 1.753s 0.000us 0.00% 102.829ms 102.829ms 1
+ aten::item 1.69% 29.621ms 25.96% 455.095ms 14.915us 0.000us 0.00% 31.387ms 1.029us 30513
+ aten::_local_scalar_dense 5.96% 104.552ms 24.27% 425.474ms 13.944us 31.383ms 30.52% 31.387ms 1.029us 30513
+ Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 31.383ms 30.52% 31.383ms 1.029us 30513
+ aten::bmm 0.01% 224.614us 0.02% 267.595us 44.599us 15.143ms 14.73% 15.143ms 2.524ms 6
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.143ms 14.73% 15.143ms 2.524ms 6
+ aten::floor_divide 5.56% 97.549ms 13.34% 233.779ms 19.025us 15.089ms 14.68% 15.090ms 1.228us 12288
+ aten::copy_ 4.01% 70.283ms 9.47% 166.011ms 13.497us 13.317ms 12.95% 13.317ms 1.083us 12300
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.313ms 12.95% 13.313ms 1.083us 12294
+ aten::mul 3.14% 55.060ms 5.66% 99.236ms 16.128us 11.295ms 10.99% 11.297ms 1.836us 6153
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.940ms 9.67% 9.940ms 1.618us 6144
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.059ms 7.84% 8.059ms 1.312us 6144
+ aten::add 2.85% 49.952ms 4.90% 85.866ms 14.522us 7.505ms 7.30% 7.506ms 1.269us 5913
+ aten::remainder 3.02% 53.015ms 4.74% 83.117ms 13.528us 7.414ms 7.21% 7.416ms 1.207us 6144
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.031ms 6.84% 7.031ms 1.144us 6144
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.224ms 6.05% 6.224ms 1.053us 5910
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.914ms 3.81% 3.914ms 1.274us 3072
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.500ms 3.40% 3.500ms 1.139us 3072
+ aten::clamp 0.00% 71.603us 0.01% 117.833us 19.639us 1.180ms 1.15% 1.180ms 196.722us 6
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.753s
+Self CUDA time total: 102.819ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B1_S1024_E4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.834s 1680.90% 1.834s 1.834s 1
+ binned_torch 24.76% 454.393ms 100.00% 1.835s 1.835s 0.000us 0.00% 109.119ms 109.119ms 1
+ aten::item 1.65% 30.229ms 26.42% 484.819ms 14.374us 0.000us 0.00% 34.734ms 1.030us 33729
+ aten::_local_scalar_dense 6.08% 111.551ms 24.77% 454.590ms 13.478us 34.731ms 31.83% 34.734ms 1.030us 33729
+ Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 34.731ms 31.83% 34.731ms 1.030us 33729
+ aten::bmm 0.01% 219.836us 0.01% 260.868us 43.478us 15.243ms 13.97% 15.243ms 2.540ms 6
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.243ms 13.97% 15.243ms 2.540ms 6
+ aten::floor_divide 5.37% 98.619ms 12.62% 231.581ms 18.846us 15.065ms 13.81% 15.065ms 1.226us 12288
+ aten::copy_ 3.65% 66.986ms 8.64% 158.623ms 12.896us 13.313ms 12.20% 13.316ms 1.083us 12300
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.309ms 12.20% 13.309ms 1.082us 12297
+ aten::mul 2.96% 54.365ms 5.27% 96.616ms 15.702us 10.967ms 10.05% 10.969ms 1.783us 6153
+ aten::add 4.05% 74.247ms 6.97% 127.934ms 14.060us 10.631ms 9.74% 10.631ms 1.168us 9099
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.613ms 8.81% 9.613ms 1.565us 6144
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.047ms 7.37% 8.047ms 1.310us 6144
+ aten::remainder 2.81% 51.641ms 4.37% 80.193ms 13.052us 7.438ms 6.82% 7.438ms 1.211us 6144
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.018ms 6.43% 7.018ms 1.142us 6144
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.225ms 5.71% 6.225ms 1.053us 5910
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.928ms 3.60% 3.928ms 1.279us 3072
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.510ms 3.22% 3.510ms 1.143us 3072
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.154ms 2.89% 3.154ms 0.990us 3186
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.835s
+Self CUDA time total: 109.111ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B4_S512_E2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.518s 1672.53% 3.518s 3.518s 1
+ binned_torch 24.37% 858.118ms 100.00% 3.521s 3.521s 0.000us 0.00% 210.357ms 210.357ms 1
+ Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 63.177ms 30.04% 63.177ms 1.026us 61586
+ aten::item 1.69% 59.432ms 26.02% 916.275ms 14.878us 0.000us 0.00% 63.177ms 1.026us 61587
+ aten::_local_scalar_dense 5.96% 209.806ms 24.34% 856.843ms 13.913us 63.176ms 30.03% 63.177ms 1.026us 61587
+ aten::floor_divide 5.42% 190.698ms 13.50% 475.217ms 19.337us 30.482ms 14.49% 30.486ms 1.240us 24576
+ aten::bmm 0.01% 235.397us 0.01% 281.998us 47.000us 29.291ms 13.93% 29.291ms 4.882ms 6
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.291ms 13.93% 29.291ms 4.882ms 6
+ aten::copy_ 3.77% 132.744ms 9.15% 322.282ms 13.107us 26.808ms 12.75% 26.810ms 1.090us 24588
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.805ms 12.74% 26.805ms 1.090us 24582
+ aten::mul 3.15% 110.895ms 5.78% 203.457ms 16.545us 25.566ms 12.15% 25.568ms 2.079us 12297
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.101ms 10.51% 22.101ms 1.799us 12288
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.470ms 7.83% 16.470ms 1.340us 12288
+ aten::add 2.99% 105.439ms 5.15% 181.211ms 14.601us 16.115ms 7.66% 16.116ms 1.298us 12411
+ aten::remainder 2.99% 105.111ms 4.72% 166.195ms 13.525us 14.836ms 7.05% 14.838ms 1.208us 12288
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.014ms 6.66% 14.014ms 1.140us 12288
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 12.996ms 6.18% 12.996ms 1.047us 12408
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 7.830ms 3.72% 7.830ms 1.274us 6144
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.006ms 3.33% 7.006ms 1.140us 6144
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.626ms 1.25% 2.626ms 437.595us 6
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 3.521s
+Self CUDA time total: 210.342ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B4_S512_E4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.742s 1679.57% 3.742s 3.742s 1
+ binned_torch 24.42% 914.204ms 100.00% 3.744s 3.744s 0.000us 0.00% 222.834ms 222.834ms 1
+ aten::item 1.73% 64.729ms 26.53% 993.125ms 14.638us 0.000us 0.00% 69.848ms 1.030us 67845
+ aten::_local_scalar_dense 6.14% 229.850ms 24.80% 928.396ms 13.684us 69.844ms 31.35% 69.848ms 1.030us 67845
+ Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 69.844ms 31.35% 69.844ms 1.030us 67841
+ aten::floor_divide 5.29% 197.931ms 12.52% 468.921ms 19.080us 30.509ms 13.69% 30.515ms 1.242us 24576
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.140ms 13.08% 29.140ms 4.857ms 6
+ aten::bmm 0.01% 232.675us 0.01% 273.538us 45.590us 29.140ms 13.08% 29.140ms 4.857ms 6
+ aten::copy_ 3.66% 136.881ms 8.73% 326.908ms 13.295us 26.646ms 11.96% 26.647ms 1.084us 24588
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.643ms 11.96% 26.643ms 1.084us 24581
+ aten::mul 2.96% 110.832ms 5.24% 196.253ms 15.959us 25.520ms 11.45% 25.522ms 2.075us 12297
+ aten::add 4.16% 155.619ms 7.13% 266.948ms 14.322us 22.169ms 9.95% 22.169ms 1.189us 18639
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.076ms 9.91% 22.076ms 1.797us 12288
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.462ms 7.39% 16.462ms 1.340us 12287
+ aten::remainder 2.77% 103.887ms 4.33% 162.240ms 13.203us 14.877ms 6.68% 14.879ms 1.211us 12288
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.047ms 6.30% 14.047ms 1.143us 12287
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 12.957ms 5.82% 12.957ms 1.044us 12407
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 7.856ms 3.53% 7.856ms 1.279us 6144
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.021ms 3.15% 7.021ms 1.143us 6144
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.109ms 2.74% 6.109ms 0.981us 6228
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 3.744s
+Self CUDA time total: 222.814ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B4_S1024_E2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 6.967s 1665.27% 6.967s 6.967s 1
+ binned_torch 24.68% 1.721s 100.00% 6.973s 6.973s 0.000us 0.00% 418.392ms 418.392ms 1
+ aten::item 1.64% 114.231ms 25.94% 1.809s 14.732us 0.000us 0.00% 125.163ms 1.020us 122763
+ aten::_local_scalar_dense 5.97% 416.624ms 24.30% 1.694s 13.802us 125.151ms 29.91% 125.163ms 1.020us 122763
+ Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 125.151ms 29.91% 125.151ms 1.019us 122762
+ aten::floor_divide 5.62% 391.846ms 13.33% 929.253ms 18.906us 61.051ms 14.59% 61.053ms 1.242us 49152
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 57.281ms 13.69% 57.281ms 9.547ms 6
+ aten::bmm 0.00% 234.996us 0.00% 276.787us 46.131us 57.281ms 13.69% 57.281ms 9.547ms 6
+ aten::copy_ 3.92% 273.517ms 9.35% 652.240ms 13.268us 53.435ms 12.77% 53.437ms 1.087us 49158
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.433ms 12.77% 53.433ms 1.087us 49154
+ aten::mul 3.15% 219.950ms 5.62% 391.612ms 15.929us 51.411ms 12.29% 51.419ms 2.091us 24585
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.451ms 10.62% 44.451ms 1.809us 24576
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 32.993ms 7.89% 32.993ms 1.343us 24576
+ aten::add 2.87% 200.428ms 4.94% 344.166ms 14.085us 31.887ms 7.62% 31.889ms 1.305us 24435
+ aten::remainder 3.00% 208.953ms 4.67% 325.902ms 13.261us 29.680ms 7.09% 29.684ms 1.208us 24576
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 28.059ms 6.71% 28.059ms 1.142us 24576
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.247ms 6.03% 25.247ms 1.033us 24431
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.667ms 3.74% 15.667ms 1.275us 12288
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.014ms 3.35% 14.014ms 1.140us 12288
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 5.233ms 1.25% 5.233ms 872.184us 6
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 6.973s
+Self CUDA time total: 418.361ms
+
+
+
+======================================================================
+PROFILE TRACE: binned_torch | cuda_B4_S1024_E4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.368s 1660.72% 7.368s 7.368s 1
+ binned_torch 24.39% 1.797s 100.00% 7.370s 7.370s 0.000us 0.00% 443.698ms 443.698ms 1
+ aten::item 1.69% 124.742ms 26.51% 1.954s 14.504us 0.000us 0.00% 137.717ms 1.022us 134715
+ aten::_local_scalar_dense 6.11% 450.407ms 24.82% 1.829s 13.577us 137.708ms 31.04% 137.717ms 1.022us 134715
+ Memcpy DtoH (Device -> Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 137.710ms 31.04% 137.710ms 1.022us 134711
+ aten::floor_divide 5.42% 399.563ms 12.65% 932.414ms 18.970us 61.071ms 13.77% 61.077ms 1.243us 49152
+ aten::bmm 0.00% 230.664us 0.00% 272.466us 45.411us 57.304ms 12.92% 57.304ms 9.551ms 6
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 57.304ms 12.92% 57.304ms 9.551ms 6
+ aten::copy_ 3.65% 269.132ms 8.67% 639.259ms 13.004us 54.065ms 12.19% 54.067ms 1.100us 49158
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 54.062ms 12.19% 54.062ms 1.100us 49153
+ aten::mul 2.96% 217.959ms 5.26% 387.551ms 15.764us 51.653ms 11.64% 51.660ms 2.101us 24585
+void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.653ms 10.06% 44.653ms 1.817us 24576
+ aten::add 4.03% 296.962ms 6.96% 512.647ms 14.100us 43.690ms 9.85% 43.694ms 1.202us 36357
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 32.954ms 7.43% 32.954ms 1.341us 24575
+ aten::remainder 2.83% 208.527ms 4.40% 323.906ms 13.180us 29.662ms 6.69% 29.664ms 1.207us 24576
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 28.119ms 6.34% 28.119ms 1.144us 24576
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.409ms 5.73% 25.409ms 1.040us 24431
+void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.666ms 3.53% 15.666ms 1.275us 12288
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 13.995ms 3.15% 13.995ms 1.139us 12288
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.644ms 2.62% 11.644ms 0.977us 11922
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 7.370s
+Self CUDA time total: 443.660ms
+
+
+impl wl p50(ms) ok
+binned_torch cuda_B1_S1024_E2 372.79 True
+binned_torch cuda_B1_S1024_E4 382.68 True
+binned_torch cuda_B1_S512_E2 150.05 True
+binned_torch cuda_B1_S512_E4 200.26 True
+binned_torch cuda_B4_S1024_E2 1486.48 True
+binned_torch cuda_B4_S1024_E4 1524.50 True
+binned_torch cuda_B4_S512_E2 742.02 True
+binned_torch cuda_B4_S512_E4 801.90 True
+
+
+
+
+
+
+
+