diff --git a/activation/impls/artifacts/benchmark/activation.jsonl b/activation/impls/artifacts/benchmark/activation.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..cc4da7fcd25da93cbac6f187979dc308423e4df7 --- /dev/null +++ b/activation/impls/artifacts/benchmark/activation.jsonl @@ -0,0 +1,9 @@ +{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.022950000015953265, "p50": 0.023951000002853107, "p90": 0.0245499999778076, "mean": 0.02414040001212925, "iqr": 0.0010899999551838846, "raw_times": [0.02579100004140855, 0.0245499999778076, 0.023951000002853107, 0.022950000015953265, 0.023460000022623717], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031180999997104664, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02659000000448941, "p50": 0.03026100000624865, "p90": 0.03163099995617813, "mean": 0.03016299999671901, "iqr": 0.001709999935428641, "raw_times": [0.02659000000448941, 0.03026100000624865, 0.02992100002074949, 0.03163099995617813, 0.032411999995929364], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03256100001181039, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02795999995441889, "p50": 0.0293610000312583, "p90": 0.02937200002861573, "mean": 0.029306999988421012, "iqr": 9.100006082007894e-05, "raw_times": [0.02795999995441889, 0.03056099996001649, 0.0293610000312583, 0.02928099996779565, 0.02937200002861573], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03265100002636245, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02837199997429707, "p50": 0.029151000035199104, "p90": 0.0292910000325719, "mean": 0.028971200003979902, "iqr": 0.0007500000265281415, "raw_times": [0.02854100000604376, 0.0292910000325719, 0.029500999971787678, 0.029151000035199104, 0.02837199997429707], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03205100000513994, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0284509999914917, "p50": 0.02926099995192999, "p90": 0.029411000014079036, "mean": 0.029144599977826147, "iqr": 0.0005010000450056395, "raw_times": [0.028909999969073397, 0.029689999962556612, 0.029411000014079036, 0.0284509999914917, 0.02926099995192999], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031930999966789386, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027061000025696558, "p50": 0.028121000013925368, "p90": 0.02836999999544787, "mean": 0.027967000005446607, "iqr": 0.0005990000317979138, "raw_times": [0.027770999963649956, 0.028512000028513285, 0.028121000013925368, 0.02836999999544787, 0.027061000025696558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030291000030047144, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02748099996097153, "p50": 0.029001000029893476, "p90": 0.030041000002256624, "mean": 0.029116999996858794, "iqr": 0.0011299999869152089, "raw_times": [0.02748099996097153, 0.030150999975830928, 0.030041000002256624, 0.029001000029893476, 0.028911000015341415], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031200999956126907, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028581000037775084, "p50": 0.028771000017968618, "p90": 0.02886099997567726, "mean": 0.028774800000519463, "iqr": 0.00020999999605919584, "raw_times": [0.028581000037775084, 0.02900999999155829, 0.028771000017968618, 0.028650999979618064, 0.02886099997567726], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03162100000508872, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028431000032469456, "p50": 0.029390999998213374, "p90": 0.029580999978406908, "mean": 0.029274800010625768, "iqr": 0.00035999994452140527, "raw_times": [0.028431000032469456, 0.029221000033885502, 0.0297500000101536, 0.029390999998213374, 0.029580999978406908], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030401000003621448, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null} diff --git a/activation/impls/cells/benchmark.py b/activation/impls/cells/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..04f9df27c14acf429b58dba6cf0677c00cbbbced --- /dev/null +++ b/activation/impls/cells/benchmark.py @@ -0,0 +1,34 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# "kernels", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +import torch +import sys +from kernels_benchmark_tools import KernelTypeEnum, run_benchmark +from kernels import get_kernel + +# Load the activation kernel +activation = get_kernel("kernels-community/activation") + + +def hf_kernels_swiglu(input_tensor): + hidden_dim = input_tensor.shape[-1] // 2 + out_shape = input_tensor.shape[:-1] + (hidden_dim,) + out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device) + return activation.silu_and_mul(out, input_tensor) + + +run_benchmark( + kernel_type=KernelTypeEnum.ACTIVATION, + impl_name="hf_kernels_swiglu", + impl_tags={"family": "hf-kernels", "backend": "cuda"}, + impl_func=hf_kernels_swiglu, +) \ No newline at end of file diff --git a/activation/impls/cells/nv.py b/activation/impls/cells/nv.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5 --- /dev/null +++ b/activation/impls/cells/nv.py @@ -0,0 +1,2 @@ +import subprocess +print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout) \ No newline at end of file diff --git a/activation/impls/hf_kernels_swiglu.html b/activation/impls/hf_kernels_swiglu.html new file mode 100644 index 0000000000000000000000000000000000000000..acb55b041fa0c36b529ec1b92a7fddcfe345e099 --- /dev/null +++ b/activation/impls/hf_kernels_swiglu.html @@ -0,0 +1,4181 @@ + + + + + + hf_kernels_swiglu + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

HF Kernels - SwiGLU Activation

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.21s + | + +Raw +GitHub +
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Wed Oct 29 00:36:01 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   29C    P0             77W /  350W |       0MiB /  46068MiB |      0%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

SwiGLU Benchmark

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 4.27s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the activation kernel
+activation = get_kernel("kernels-community/activation")
+
+
+def hf_kernels_swiglu(input_tensor):
+    hidden_dim = input_tensor.shape[-1] // 2
+    out_shape = input_tensor.shape[:-1] + (hidden_dim,)
+    out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
+    return activation.silu_and_mul(out, input_tensor)
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.ACTIVATION,
+    impl_name="hf_kernels_swiglu",
+    impl_tags={"family": "hf-kernels", "backend": "cuda"},
+    impl_func=hf_kernels_swiglu,
+)
+
+ +
+
+
+
+
Running activation benchmark on cuda with 9 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      74.624us      1850.79%      74.624us      74.624us             1  
+                                      hf_kernels_swiglu        11.04%     191.977us        99.56%       1.732ms       1.732ms       0.000us         0.00%       5.440us       5.440us             1  
+                      _activation_beeaae6::silu_and_mul         1.14%      19.900us        85.86%       1.493ms     497.784us       4.032us       100.00%       5.440us       1.813us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.032us       100.00%       4.032us       1.344us             3  
+                                Activity Buffer Request        82.36%       1.432ms        82.36%       1.432ms       1.432ms       1.408us        34.92%       1.408us       1.408us             1  
+                                            aten::empty         2.66%      46.201us         2.66%      46.201us      15.400us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.36%      41.042us         2.36%      41.042us      13.681us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.44%       7.690us         0.44%       7.690us       7.690us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.739ms
+Self CUDA time total: 4.032us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      58.016us      1462.10%      58.016us      58.016us             1  
+                                      hf_kernels_swiglu         6.64%     105.933us        99.68%       1.591ms       1.591ms       0.000us         0.00%       5.280us       5.280us             1  
+                      _activation_beeaae6::silu_and_mul         1.34%      21.350us        91.75%       1.465ms     488.260us       3.968us       100.00%       5.280us       1.760us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       3.968us       100.00%       3.968us       1.323us             3  
+                                Activity Buffer Request        88.86%       1.419ms        88.86%       1.419ms       1.419ms       1.312us        33.06%       1.312us       1.312us             1  
+                                            aten::empty         1.30%      20.712us         1.30%      20.712us       6.904us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.56%      24.841us         1.56%      24.841us       8.280us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.32%       5.080us         0.32%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.597ms
+Self CUDA time total: 3.968us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      67.167us      1371.87%      67.167us      67.167us             1  
+                                      hf_kernels_swiglu         6.20%     101.314us        99.65%       1.628ms       1.628ms       0.000us         0.00%       6.560us       6.560us             1  
+                      _activation_beeaae6::silu_and_mul         1.28%      20.850us        92.18%       1.506ms     501.997us       4.896us       100.00%       6.560us       2.187us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.896us       100.00%       4.896us       1.632us             3  
+                                Activity Buffer Request        89.24%       1.458ms        89.24%       1.458ms       1.458ms       1.664us        33.99%       1.664us       1.664us             1  
+                                            aten::empty         1.26%      20.660us         1.26%      20.660us       6.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.67%      27.252us         1.67%      27.252us       9.084us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.35%       5.710us         0.35%       5.710us       5.710us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.634ms
+Self CUDA time total: 4.896us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      69.055us      1610.42%      69.055us      69.055us             1  
+                                      hf_kernels_swiglu         5.98%     106.323us        99.73%       1.773ms       1.773ms       0.000us         0.00%       5.728us       5.728us             1  
+                      _activation_beeaae6::silu_and_mul         1.23%      21.902us        92.63%       1.646ms     548.829us       4.288us       100.00%       5.728us       1.909us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       4.288us       100.00%       4.288us       1.429us             3  
+                                Activity Buffer Request        80.11%       1.424ms        80.11%       1.424ms       1.424ms       1.440us        33.58%       1.440us       1.440us             1  
+                                            aten::empty         1.11%      19.750us         1.11%      19.750us       6.583us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        11.30%     200.767us        11.30%     200.767us      66.922us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.870us         0.27%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.777ms
+Self CUDA time total: 4.288us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      61.438us      1043.62%      61.438us      61.438us             1  
+                                      hf_kernels_swiglu        19.33%      85.364us        98.97%     437.156us     437.156us       0.000us         0.00%       7.871us       7.871us             1  
+                      _activation_beeaae6::silu_and_mul         4.88%      21.551us        75.28%     332.532us     110.844us       5.887us       100.00%       7.871us       2.624us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       5.887us       100.00%       5.887us       1.962us             3  
+                                Activity Buffer Request        35.23%     155.635us        35.23%     155.635us     155.635us       1.984us        33.70%       1.984us       1.984us             1  
+                                            aten::empty         4.36%      19.260us         4.36%      19.260us       6.420us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        35.17%     155.346us        35.17%     155.346us      51.782us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.03%       4.560us         1.03%       4.560us       4.560us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 441.716us
+Self CUDA time total: 5.887us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      64.160us       828.30%      64.160us      64.160us             1  
+                                      hf_kernels_swiglu         7.42%     129.826us        99.74%       1.746ms       1.746ms       0.000us         0.00%      10.339us      10.339us             1  
+                      _activation_beeaae6::silu_and_mul         1.16%      20.220us        91.25%       1.597ms     532.391us       7.746us       100.00%      10.339us       3.446us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       7.746us       100.00%       7.746us       2.582us             3  
+                                Activity Buffer Request        81.29%       1.423ms        81.29%       1.423ms       1.423ms       2.593us        33.48%       2.593us       2.593us             1  
+                                            aten::empty         1.08%      18.840us         1.08%      18.840us       6.280us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.81%     154.125us         8.81%     154.125us      51.375us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.481us         0.26%       4.481us       4.481us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.750ms
+Self CUDA time total: 7.746us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      70.847us      1069.55%      70.847us      70.847us             1  
+                                      hf_kernels_swiglu         6.38%     111.683us        99.73%       1.745ms       1.745ms       0.000us         0.00%       8.832us       8.832us             1  
+                      _activation_beeaae6::silu_and_mul         1.20%      21.011us        92.19%       1.613ms     537.758us       6.624us       100.00%       8.832us       2.944us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       6.624us       100.00%       6.624us       2.208us             3  
+                                Activity Buffer Request        82.19%       1.438ms        82.19%       1.438ms       1.438ms       2.208us        33.33%       2.208us       2.208us             1  
+                                            aten::empty         1.16%      20.281us         1.16%      20.281us       6.760us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.80%     153.915us         8.80%     153.915us      51.305us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.700us         0.27%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.750ms
+Self CUDA time total: 6.624us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.070us       668.11%      63.070us      63.070us             1  
+                                      hf_kernels_swiglu        18.75%      87.072us        98.86%     459.026us     459.026us       0.000us         0.00%      12.608us      12.608us             1  
+                      _activation_beeaae6::silu_and_mul         4.59%      21.321us        76.16%     353.653us     117.884us       9.440us       100.00%      12.608us       4.203us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us       9.440us       100.00%       9.440us       3.147us             3  
+                                Activity Buffer Request        38.99%     181.046us        38.99%     181.046us     181.046us       3.168us        33.56%       3.168us       3.168us             1  
+                                            aten::empty         3.94%      18.301us         3.94%      18.301us       6.100us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.58%     151.286us        32.58%     151.286us      50.429us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.14%       5.310us         1.14%       5.310us       5.310us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 464.336us
+Self CUDA time total: 9.440us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_swiglu         0.00%       0.000us         0.00%       0.000us       0.000us      63.326us       483.85%      63.326us      63.326us             1  
+                                      hf_kernels_swiglu        16.17%     100.313us        99.24%     615.771us     615.771us       0.000us         0.00%      17.472us      17.472us             1  
+                      _activation_beeaae6::silu_and_mul         3.48%      21.570us        80.17%     497.486us     165.829us      13.088us       100.00%      17.472us       5.824us             3  
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::...         0.00%       0.000us         0.00%       0.000us       0.000us      13.088us       100.00%      13.088us       4.363us             3  
+                                Activity Buffer Request        52.45%     325.441us        52.45%     325.441us     325.441us       4.384us        33.50%       4.384us       4.384us             1  
+                                            aten::empty         2.90%      17.972us         2.90%      17.972us       5.991us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        24.25%     150.475us        24.25%     150.475us      50.158us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.76%       4.730us         0.76%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 620.501us
+Self CUDA time total: 13.088us
+
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_swiglu        cuda_T128_D1024        0.03  True
+hf_kernels_swiglu        cuda_T128_D2048        0.03  True
+hf_kernels_swiglu        cuda_T128_D768         0.02  True
+hf_kernels_swiglu        cuda_T256_D1024        0.03  True
+hf_kernels_swiglu        cuda_T256_D2048        0.03  True
+hf_kernels_swiglu        cuda_T256_D768         0.03  True
+hf_kernels_swiglu        cuda_T512_D1024        0.03  True
+hf_kernels_swiglu        cuda_T512_D2048        0.03  True
+hf_kernels_swiglu        cuda_T512_D768         0.03  True
+
+
+
▶ UV Install Logs
+ +
+
Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s] +Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 12.38it/s] +Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 17.32it/s]
+
+

Artifacts:

+activation.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/activation/impls/index.html b/activation/impls/index.html new file mode 100644 index 0000000000000000000000000000000000000000..02d457f5814d7ec7515a6c7ef12f11b92d7783cf --- /dev/null +++ b/activation/impls/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /activation/impls + + + +
+ ← back +
+

Index of /activation/impls

+ + + \ No newline at end of file diff --git a/activation/impls/torch_swiglu.html b/activation/impls/torch_swiglu.html new file mode 100644 index 0000000000000000000000000000000000000000..61a6a78129e9c9547448c27bfc80f0f6b42b18ce --- /dev/null +++ b/activation/impls/torch_swiglu.html @@ -0,0 +1,4199 @@ + + + + + + torch_swiglu + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

PyTorch Native - SwiGLU Activation

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.21s + | + +Raw +GitHub +
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Wed Oct 29 00:36:01 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   29C    P0             77W /  350W |       0MiB /  46068MiB |      0%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

SwiGLU Benchmark (PyTorch Native)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 6.96s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+import torch, torch.nn.functional as F
+
+
+def swiglu_eager(x):
+    d = x.shape[-1] // 2
+    return F.silu(x[..., :d]) * x[..., d:]
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.ACTIVATION,
+    impl_name="torch_eager",
+    impl_tags={"family":"hf-kernels", "backend":"eager"},
+    impl_func=swiglu_eager,
+)
+
+ +
+
+
+
+
Running activation benchmark on cuda with 9 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T128_D768
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     208.254us      1623.18%     208.254us     208.254us             1  
+                                            torch_eager        11.63%     222.938us        99.53%       1.908ms       1.908ms       0.000us         0.00%      15.165us      15.165us             1  
+                                             aten::silu         3.35%      64.173us        81.27%       1.558ms     519.434us       6.558us        51.11%       8.893us       2.964us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.558us        51.11%       6.558us       2.186us             3  
+                                              aten::mul         2.01%      38.591us         3.22%      61.711us      20.570us       6.272us        48.89%       6.272us       2.091us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        48.89%       6.272us       2.091us             3  
+                                Activity Buffer Request        75.51%       1.448ms        75.51%       1.448ms       1.448ms       2.335us        18.20%       2.335us       2.335us             1  
+                                            aten::slice         2.75%      52.771us         3.41%      65.422us      10.904us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.66%      12.651us         0.66%      12.651us       2.108us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.62%      69.391us         3.62%      69.391us      11.565us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.47%       9.050us         0.47%       9.050us       9.050us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.917ms
+Self CUDA time total: 12.830us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T128_D1024
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     151.071us      1219.79%     151.071us     151.071us             1  
+                                            torch_eager         7.39%     126.424us        99.65%       1.704ms       1.704ms       0.000us         0.00%      14.561us      14.561us             1  
+                                             aten::silu         2.37%      40.550us        87.76%       1.501ms     500.240us       6.400us        51.68%       8.576us       2.859us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.400us        51.68%       6.400us       2.133us             3  
+                                              aten::mul         1.49%      25.470us         2.58%      44.190us      14.730us       5.985us        48.32%       5.985us       1.995us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.985us        48.32%       5.985us       1.995us             3  
+                                Activity Buffer Request        83.86%       1.434ms        83.86%       1.434ms       1.434ms       2.176us        17.57%       2.176us       2.176us             1  
+                                            aten::slice         1.55%      26.493us         1.91%      32.623us       5.437us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.36%       6.130us         0.36%       6.130us       1.022us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.63%      44.922us         2.63%      44.922us       7.487us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.35%       5.980us         0.35%       5.980us       5.980us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.710ms
+Self CUDA time total: 12.385us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T128_D2048
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     154.943us      1178.09%     154.943us     154.943us             1  
+                                            torch_eager         7.25%     123.104us        99.64%       1.692ms       1.692ms       0.000us         0.00%      15.424us      15.424us             1  
+                                             aten::silu         2.33%      39.532us        87.79%       1.491ms     496.854us       6.784us        51.58%       9.056us       3.019us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        51.58%       6.784us       2.261us             3  
+                                              aten::mul         1.58%      26.910us         2.71%      46.021us      15.340us       6.368us        48.42%       6.368us       2.123us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.368us        48.42%       6.368us       2.123us             3  
+                                Activity Buffer Request        83.90%       1.424ms        83.90%       1.424ms       1.424ms       2.272us        17.27%       2.272us       2.272us             1  
+                                            aten::slice         1.53%      26.021us         1.89%      32.121us       5.353us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.36%       6.100us         0.36%       6.100us       1.017us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         2.69%      45.642us         2.69%      45.642us       7.607us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.36%       6.080us         0.36%       6.080us       6.080us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.698ms
+Self CUDA time total: 13.152us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T256_D768
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     180.768us      1415.79%     180.768us     180.768us             1  
+                                            torch_eager         7.93%     123.526us        99.68%       1.554ms       1.554ms       0.000us         0.00%      14.976us      14.976us             1  
+                                             aten::silu         3.24%      50.441us        85.53%       1.333ms     444.348us       6.592us        51.63%       8.800us       2.933us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us        51.63%       6.592us       2.197us             3  
+                                              aten::mul         1.75%      27.260us         4.09%      63.791us      21.264us       6.176us        48.37%       6.176us       2.059us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.176us        48.37%       6.176us       2.059us             3  
+                                Activity Buffer Request        67.46%       1.051ms        67.46%       1.051ms       1.051ms       2.208us        17.29%       2.208us       2.208us             1  
+                                            aten::slice         1.70%      26.549us         2.13%      33.261us       5.543us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.43%       6.712us         0.43%       6.712us       1.119us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        17.18%     267.779us        17.18%     267.779us      44.630us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.32%       4.940us         0.32%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.559ms
+Self CUDA time total: 12.768us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T256_D1024
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     150.816us      1138.41%     150.816us     150.816us             1  
+                                            torch_eager         6.24%     117.054us        99.74%       1.872ms       1.872ms       0.000us         0.00%      15.520us      15.520us             1  
+                                             aten::silu         2.12%      39.802us        89.47%       1.679ms     559.729us       6.784us        51.21%       9.056us       3.019us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        51.21%       6.784us       2.261us             3  
+                                              aten::mul         1.34%      25.111us         2.35%      44.062us      14.687us       6.464us        48.79%       6.464us       2.155us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.464us        48.79%       6.464us       2.155us             3  
+                                Activity Buffer Request        75.90%       1.425ms        75.90%       1.425ms       1.425ms       2.272us        17.15%       2.272us       2.272us             1  
+                                            aten::slice         1.36%      25.472us         1.68%      31.591us       5.265us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.33%       6.119us         0.33%       6.119us       1.020us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.46%     233.778us        12.46%     233.778us      38.963us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       4.950us         0.26%       4.950us       4.950us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.877ms
+Self CUDA time total: 13.248us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T256_D2048
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     143.615us       923.45%     143.615us     143.615us             1  
+                                            torch_eager        17.00%     110.812us        99.16%     646.262us     646.262us       0.000us         0.00%      18.240us      18.240us             1  
+                                             aten::silu         6.35%      41.393us        70.99%     462.667us     154.222us       7.936us        51.03%      10.624us       3.541us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        51.03%       7.936us       2.645us             3  
+                                              aten::mul         3.56%      23.221us         6.51%      42.412us      14.137us       7.616us        48.97%       7.616us       2.539us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.616us        48.97%       7.616us       2.539us             3  
+                                Activity Buffer Request        32.67%     212.907us        32.67%     212.907us     212.907us       2.688us        17.28%       2.688us       2.688us             1  
+                                            aten::slice         3.77%      24.551us         4.66%      30.371us       5.062us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.89%       5.820us         0.89%       5.820us       0.970us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        34.91%     227.558us        34.91%     227.558us      37.926us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.84%       5.490us         0.84%       5.490us       5.490us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 651.752us
+Self CUDA time total: 15.552us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T512_D768
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     155.197us      1080.16%     155.197us     155.197us             1  
+                                            torch_eager         6.30%     118.195us        99.70%       1.872ms       1.872ms       0.000us         0.00%      16.864us      16.864us             1  
+                                             aten::silu         2.16%      40.640us        89.31%       1.677ms     558.889us       7.360us        51.22%       9.856us       3.285us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        51.22%       7.360us       2.453us             3  
+                                              aten::mul         1.39%      26.190us         2.47%      46.331us      15.444us       7.008us        48.78%       7.008us       2.336us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.008us        48.78%       7.008us       2.336us             3  
+                                Activity Buffer Request        76.28%       1.432ms        76.28%       1.432ms       1.432ms       2.496us        17.37%       2.496us       2.496us             1  
+                                            aten::slice         1.31%      24.671us         1.64%      30.721us       5.120us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.32%       6.050us         0.32%       6.050us       1.008us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.93%     224.049us        11.93%     224.049us      37.341us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.30%       5.540us         0.30%       5.540us       5.540us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.877ms
+Self CUDA time total: 14.368us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T512_D1024
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     144.252us       927.61%     144.252us     144.252us             1  
+                                            torch_eager        18.42%     116.554us        99.16%     627.471us     627.471us       0.000us         0.00%      18.239us      18.239us             1  
+                                             aten::silu         6.52%      41.251us        69.31%     438.595us     146.198us       7.968us        51.24%      10.656us       3.552us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.968us        51.24%       7.968us       2.656us             3  
+                                              aten::mul         3.66%      23.182us         6.58%      41.632us      13.877us       7.583us        48.76%       7.583us       2.528us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.583us        48.76%       7.583us       2.528us             3  
+                                Activity Buffer Request        30.96%     195.937us        30.96%     195.937us     195.937us       2.688us        17.29%       2.688us       2.688us             1  
+                                            aten::slice         3.89%      24.640us         4.85%      30.690us       5.115us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.96%       6.050us         0.96%       6.050us       1.008us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        34.74%     219.857us        34.74%     219.857us      36.643us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.84%       5.310us         0.84%       5.310us       5.310us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 632.781us
+Self CUDA time total: 15.551us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T512_D2048
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     150.463us       665.09%     150.463us     150.463us             1  
+                                            torch_eager         5.93%     109.544us        99.69%       1.842ms       1.842ms       0.000us         0.00%      26.527us      26.527us             1  
+                                             aten::silu         2.24%      41.413us        89.69%       1.657ms     552.422us      11.584us        51.20%      15.488us       5.163us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.584us        51.20%      11.584us       3.861us             3  
+                                              aten::mul         1.32%      24.310us         2.35%      43.432us      14.477us      11.039us        48.80%      11.039us       3.680us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      11.039us        48.80%      11.039us       3.680us             3  
+                                Activity Buffer Request        76.49%       1.413ms        76.49%       1.413ms       1.413ms       3.904us        17.26%       3.904us       3.904us             1  
+                                            aten::slice         1.39%      25.640us         1.72%      31.740us       5.290us       0.000us         0.00%       0.000us       0.000us             6  
+                                       aten::as_strided         0.33%       6.100us         0.33%       6.100us       1.017us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.00%     221.728us        12.00%     221.728us      36.955us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.31%       5.690us         0.31%       5.690us       5.690us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.848ms
+Self CUDA time total: 22.623us
+
+
+impl                     wl                  p50(ms)  ok
+torch_eager              cuda_T128_D1024        0.05  True
+torch_eager              cuda_T128_D2048        0.05  True
+torch_eager              cuda_T128_D768         0.04  True
+torch_eager              cuda_T256_D1024        0.05  True
+torch_eager              cuda_T256_D2048        0.05  True
+torch_eager              cuda_T256_D768         0.05  True
+torch_eager              cuda_T512_D1024        0.05  True
+torch_eager              cuda_T512_D2048        0.05  True
+torch_eager              cuda_T512_D768         0.05  True
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+activation.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/activation/index.html b/activation/index.html new file mode 100644 index 0000000000000000000000000000000000000000..ddb801226b9a0aa8d81788bef013e946eb8554ed --- /dev/null +++ b/activation/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /activation + + + +
+ ← back +
+

Index of /activation

+ + + \ No newline at end of file diff --git a/activation/results/artifacts/combine/latency.svg b/activation/results/artifacts/combine/latency.svg new file mode 100644 index 0000000000000000000000000000000000000000..cc4b0932cb850954c8d6b0adb27feeabe3e3f7f4 --- /dev/null +++ b/activation/results/artifacts/combine/latency.svg @@ -0,0 +1,318 @@ + + + + + + + 2025-10-29T00:37:20.527749 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_T128_D768 + + + + + + + + + + + + + cuda_T128_D1024 + + + + + + + + + + + + + cuda_T128_D2048 + + + + + + + + + + + + + cuda_T256_D768 + + + + + + + + + + + + + cuda_T256_D1024 + + + + + + + + + + + + + cuda_T256_D2048 + + + + + + + + + + + + + cuda_T512_D768 + + + + + + + + + + + + + cuda_T512_D1024 + + + + + + + + + + + + + cuda_T512_D2048 + + + + Workload + + + + + + + + + + + + + + + + + 0.025 + + + + + + + + + + + + + 0.030 + + + + + + + + + + + + + 0.035 + + + + + + + + + + + + + 0.040 + + + + + + + + + + + + + 0.045 + + + + + + + + + + + + + 0.050 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + hf_kernels_swiglu + + + + + + + + + torch_eager + + + + + + + + + + \ No newline at end of file diff --git a/activation/results/cells/combine.py b/activation/results/cells/combine.py new file mode 100644 index 0000000000000000000000000000000000000000..c27fc2d1dc911098e8feb19f8e4a7ed33d851a12 --- /dev/null +++ b/activation/results/cells/combine.py @@ -0,0 +1,27 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# "matplotlib", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +from kernels_benchmark_tools.core.visuals import generate_combined_results + +# Map display names to uvnote environment variables +cache_env_map = { + "HF Kernels SwiGLU": "UVNOTE_FILE_HF_KERNELS_SWIGLU_BENCHMARK", + "PyTorch SwiGLU": "UVNOTE_FILE_TORCH_SWIGLU_BENCHMARK", + # "Compiled SwiGLU": "UVNOTE_FILE_COMPILED_SWIGLU_BENCHMARK", +} + +# Generate combined results with visualization +generate_combined_results( + cache_env_map=cache_env_map, + output_filename="activation.jsonl", + svg_filename="latency.svg" +) \ No newline at end of file diff --git a/activation/results/combined_results.html b/activation/results/combined_results.html new file mode 100644 index 0000000000000000000000000000000000000000..432464a5a72bec48d82e3ed28b902538354c87bf --- /dev/null +++ b/activation/results/combined_results.html @@ -0,0 +1,4654 @@ + + + + + + SwiGLU Activation Benchmark - Combined Results + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

SwiGLU Activation Benchmarks - Aggregated Results

+

This document combines benchmark results from multiple SwiGLU activation implementations.

+

Combined Summary and Visualization

+
+ + + + + + + 2025-10-29T00:37:20.527749 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_T128_D768 + + + + + + + + + + + + + cuda_T128_D1024 + + + + + + + + + + + + + cuda_T128_D2048 + + + + + + + + + + + + + cuda_T256_D768 + + + + + + + + + + + + + cuda_T256_D1024 + + + + + + + + + + + + + cuda_T256_D2048 + + + + + + + + + + + + + cuda_T512_D768 + + + + + + + + + + + + + cuda_T512_D1024 + + + + + + + + + + + + + cuda_T512_D2048 + + + + Workload + + + + + + + + + + + + + + + + + 0.025 + + + + + + + + + + + + + 0.030 + + + + + + + + + + + + + 0.035 + + + + + + + + + + + + + 0.040 + + + + + + + + + + + + + 0.045 + + + + + + + + + + + + + 0.050 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + hf_kernels_swiglu + + + + + + + + + torch_eager + + + + + + + + + + +
+ +
+
+ +▶ code +▼ output + ▶ uv-logs + | +Cell: combine | 4.29s + | + +Raw +
+ +
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ HF Kernels SwiGLU             : /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/2775e6386f1caf1fda935a997130c06dcaf7641efb0db21560c35301fdabfd9b
+✓ PyTorch SwiGLU                : /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/661ca38adec8893d7c284140e922da661f0afcea4aaff6a3bf48a6494ce7c6eb
+
+  ✓ Found HF Kernels SwiGLU
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/2775e6386f1caf1fda935a997130c06dcaf7641efb0db21560c35301fdabfd9b/activation.jsonl
+  ✓ Found PyTorch SwiGLU
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/661ca38adec8893d7c284140e922da661f0afcea4aaff6a3bf48a6494ce7c6eb/activation.jsonl
+
+======================================================================
+Summary: 2 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_swiglu        cuda_T128_D1024        0.03  True
+hf_kernels_swiglu        cuda_T128_D2048        0.03  True
+hf_kernels_swiglu        cuda_T128_D768         0.02  True
+hf_kernels_swiglu        cuda_T256_D1024        0.03  True
+hf_kernels_swiglu        cuda_T256_D2048        0.03  True
+hf_kernels_swiglu        cuda_T256_D768         0.03  True
+hf_kernels_swiglu        cuda_T512_D1024        0.03  True
+hf_kernels_swiglu        cuda_T512_D2048        0.03  True
+hf_kernels_swiglu        cuda_T512_D768         0.03  True
+torch_eager              cuda_T128_D1024        0.05  True
+torch_eager              cuda_T128_D2048        0.05  True
+torch_eager              cuda_T128_D768         0.04  True
+torch_eager              cuda_T256_D1024        0.05  True
+torch_eager              cuda_T256_D2048        0.05  True
+torch_eager              cuda_T256_D768         0.05  True
+torch_eager              cuda_T512_D1024        0.05  True
+torch_eager              cuda_T512_D2048        0.05  True
+torch_eager              cuda_T512_D768         0.05  True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 18 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 2
+
+Implementations included:
+  ✓ HF Kernels SwiGLU
+  ✓ PyTorch SwiGLU
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+latency.svg +
+ + + + + + + 2025-10-29T00:37:20.527749 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_T128_D768 + + + + + + + + + + + + + cuda_T128_D1024 + + + + + + + + + + + + + cuda_T128_D2048 + + + + + + + + + + + + + cuda_T256_D768 + + + + + + + + + + + + + cuda_T256_D1024 + + + + + + + + + + + + + cuda_T256_D2048 + + + + + + + + + + + + + cuda_T512_D768 + + + + + + + + + + + + + cuda_T512_D1024 + + + + + + + + + + + + + cuda_T512_D2048 + + + + Workload + + + + + + + + + + + + + + + + + 0.025 + + + + + + + + + + + + + 0.030 + + + + + + + + + + + + + 0.035 + + + + + + + + + + + + + 0.040 + + + + + + + + + + + + + 0.045 + + + + + + + + + + + + + 0.050 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + hf_kernels_swiglu + + + + + + + + + torch_eager + + + + + + + + + + +
+
+
+
+
+ + + \ No newline at end of file diff --git a/activation/results/index.html b/activation/results/index.html new file mode 100644 index 0000000000000000000000000000000000000000..5c60fe94ab1a86a4d9f299448a7d8a5b85027447 --- /dev/null +++ b/activation/results/index.html @@ -0,0 +1,88 @@ + + + + + + Index of /activation/results + + + +
+ ← back +
+

Index of /activation/results

+ + + \ No newline at end of file diff --git a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..42d266f1bc1ac47e5d0d53656b57da93664a039f --- /dev/null +++ b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl @@ -0,0 +1,24 @@ +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.046111000017390325, "p50": 0.046270999973785365, "p90": 0.04740100001754399, "mean": 0.04670720001058726, "iqr": 0.001160000010713702, "raw_times": [0.047512000037386315, 0.04740100001754399, 0.04624100000683029, 0.046270999973785365, 0.046111000017390325], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05871199999774035, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05225199998903918, "p50": 0.053462000039417035, "p90": 0.053592000028857, "mean": 0.05365380001194353, "iqr": 0.0002100000529026147, "raw_times": [0.053462000039417035, 0.055581000026450056, 0.053592000028857, 0.053381999975954386, 0.05225199998903918], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0581319999923835, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05121200001667603, "p50": 0.05470199999990655, "p90": 0.05482099999198908, "mean": 0.05431980000594194, "iqr": 0.0013289999856169743, "raw_times": [0.05121200001667603, 0.057372000014765945, 0.05470199999990655, 0.05482099999198908, 0.05349200000637211], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056541999981618574, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05210199998373355, "p50": 0.05333199999313365, "p90": 0.05396199998131124, "mean": 0.05322599998862643, "iqr": 0.0016399999935856613, "raw_times": [0.05210199998373355, 0.05333199999313365, 0.05396199998131124, 0.052321999987725576, 0.05441199999722812], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09094299997514099, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05103099999814731, "p50": 0.05309199997327596, "p90": 0.053381999975954386, "mean": 0.05291379998197954, "iqr": 0.0004199999921183917, "raw_times": [0.053381999975954386, 0.052961999983835994, 0.05103099999814731, 0.05309199997327596, 0.054101999978684034], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05603199997494812, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051181999992877536, "p50": 0.05189199998767435, "p90": 0.05201199996918149, "mean": 0.052023999978700886, "iqr": 0.0004999999987376214, "raw_times": [0.05151199997044387, 0.05352199997332718, 0.05189199998767435, 0.05201199996918149, 0.051181999992877536], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055981999992127385, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05042200001525998, "p50": 0.052002000018092076, "p90": 0.05382199998393844, "mean": 0.05366420000427752, "iqr": 0.00333999997792489, "raw_times": [0.05048200000601355, 0.05042200001525998, 0.052002000018092076, 0.06159299999808354, 0.05382199998393844], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05433199999060889, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0522220000220841, "p50": 0.053632000003744906, "p90": 0.05870200004665094, "mean": 0.056078200009324064, "iqr": 0.005690000079994206, "raw_times": [0.0522220000220841, 0.06282300000748364, 0.053632000003744906, 0.05301199996665673, 0.05870200004665094], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.055741999972269696, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05032100000335049, "p50": 0.050921000024573004, "p90": 0.05318199998782802, "mean": 0.05303959999309882, "iqr": 0.0023800000121809717, "raw_times": [0.05080199997564705, 0.050921000024573004, 0.05032100000335049, 0.059971999974095525, 0.05318199998782802], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0550720000092042, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05211199999166638, "p50": 0.05235200001152407, "p90": 0.053132000005007285, "mean": 0.05707820000679931, "iqr": 0.0008700000080352766, "raw_times": [0.05235200001152407, 0.05226199999697201, 0.053132000005007285, 0.07553300002882679, 0.05211199999166638], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05610199997363452, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0512720000074296, "p50": 0.0524320000181433, "p90": 0.05278200001157529, "mean": 0.05529400000341411, "iqr": 0.000919999990856013, "raw_times": [0.05278200001157529, 0.0524320000181433, 0.0512720000074296, 0.0681219999592031, 0.05186200002071928], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05547199998545693, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05112100001269937, "p50": 0.051342000006115995, "p90": 0.05172099997707846, "mean": 0.053885599993463984, "iqr": 0.00040899999476096127, "raw_times": [0.05112100001269937, 0.06393199998910859, 0.05172099997707846, 0.0513119999823175, 0.051342000006115995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055091999968226446, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050531999988834286, "p50": 0.05176199999823439, "p90": 0.051821999988987955, "mean": 0.05163600000059887, "iqr": 0.0003099999617006688, "raw_times": [0.050531999988834286, 0.05176199999823439, 0.052551999999650434, 0.051821999988987955, 0.051512000027287286], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055182000039621926, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05124200004047452, "p50": 0.05148200000348879, "p90": 0.05251200002476253, "mean": 0.051918000008299714, "iqr": 0.0011100000278929656, "raw_times": [0.05251200002476253, 0.05295199997590316, 0.05148200000348879, 0.05140199999686956, 0.05124200004047452], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05506200000127137, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05025200005093211, "p50": 0.05105200000343757, "p90": 0.05146199998762313, "mean": 0.05136380001431462, "iqr": 0.0005399999736255268, "raw_times": [0.05146199998762313, 0.053131000015582686, 0.050922000013997604, 0.05025200005093211, 0.05105200000343757], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0684330000240152, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051181999992877536, "p50": 0.052152000023397704, "p90": 0.05241200000227764, "mean": 0.05240600000888662, "iqr": 0.00034999999343199306, "raw_times": [0.052152000023397704, 0.05422200001703459, 0.05241200000227764, 0.051181999992877536, 0.052062000008845644], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05490099999860831, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05016099999011203, "p50": 0.05225199998903918, "p90": 0.05251199996791911, "mean": 0.05182779999586273, "iqr": 0.001349999934063817, "raw_times": [0.05016099999011203, 0.053051999998388055, 0.05116200003385529, 0.05251199996791911, 0.05225199998903918], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05627199999480581, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05154100000481776, "p50": 0.0524320000181433, "p90": 0.05299099996136647, "mean": 0.05266959998380116, "iqr": 0.0006189999908201571, "raw_times": [0.05154100000481776, 0.054011999964131974, 0.05299099996136647, 0.0524320000181433, 0.05237199997054631], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05572200001324745, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05098199994790775, "p50": 0.05128100002593783, "p90": 0.052071999959935056, "mean": 0.05161159999715892, "iqr": 0.0008409999168179638, "raw_times": [0.05098199994790775, 0.052071999959935056, 0.05128100002593783, 0.05123100004311709, 0.052492000008896866], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055401999986770534, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050202000011267955, "p50": 0.05295199997590316, "p90": 0.05307200001425372, "mean": 0.052619999996750266, "iqr": 0.00046000002384971594, "raw_times": [0.050202000011267955, 0.05307200001425372, 0.054261999991922494, 0.05295199997590316, 0.052611999990404], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05440200004613871, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05220100001679384, "p50": 0.052891999985149596, "p90": 0.05323199997064876, "mean": 0.05431980000594194, "iqr": 0.0007509999591093219, "raw_times": [0.05220100001679384, 0.052891999985149596, 0.05323199997064876, 0.06079300004557808, 0.052481000011539436], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0552820000052634, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05108200002723606, "p50": 0.05157200001804085, "p90": 0.053041000001030625, "mean": 0.051985800007514626, "iqr": 0.0018490000002202578, "raw_times": [0.05157200001804085, 0.05108200002723606, 0.053041000001030625, 0.05119200000081037, 0.053041999990455224], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05657200000541707, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05095099999152808, "p50": 0.0515919999770631, "p90": 0.05208099997844329, "mean": 0.05173159999003474, "iqr": 0.0006789999815737247, "raw_times": [0.0515919999770631, 0.05208099997844329, 0.052632000006269664, 0.05095099999152808, 0.05140199999686956], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056392000033156364, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} +{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05110099999683371, "p50": 0.051662000032592914, "p90": 0.051741999982368725, "mean": 0.05161380000799909, "iqr": 0.00010999997357430402, "raw_times": [0.05163200000879442, 0.05110099999683371, 0.051741999982368725, 0.051662000032592914, 0.05193200001940568], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05588200002648591, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null} diff --git a/causal_conv1d/impls/cells/benchmark.py b/causal_conv1d/impls/cells/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..725b12c4018e4eec05c5ddccb0c88a8eae6f150d --- /dev/null +++ b/causal_conv1d/impls/cells/benchmark.py @@ -0,0 +1,31 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# "kernels", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +import torch +import sys +from kernels_benchmark_tools import KernelTypeEnum, run_benchmark +from kernels import get_kernel + +# Load the causal conv1d kernel +causal_conv1d = get_kernel("kernels-community/causal-conv1d") + + +def hf_kernels_causal_conv1d(input_tensor, weight, bias): + return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias) + + +run_benchmark( + kernel_type=KernelTypeEnum.CAUSAL_CONV1D, + impl_name="hf_kernels_causal_conv1d", + impl_tags={"family": "hf-kernels", "backend": "cuda"}, + impl_func=hf_kernels_causal_conv1d, +) \ No newline at end of file diff --git a/causal_conv1d/impls/cells/nv.py b/causal_conv1d/impls/cells/nv.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5 --- /dev/null +++ b/causal_conv1d/impls/cells/nv.py @@ -0,0 +1,2 @@ +import subprocess +print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout) \ No newline at end of file diff --git a/causal_conv1d/impls/hf_kernels_causal_conv1d.html b/causal_conv1d/impls/hf_kernels_causal_conv1d.html new file mode 100644 index 0000000000000000000000000000000000000000..4299d818b5235441dc2b29a2515c8e5718a75f01 --- /dev/null +++ b/causal_conv1d/impls/hf_kernels_causal_conv1d.html @@ -0,0 +1,4542 @@ + + + + + + hf_kernels_causal_conv1d + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

HF Kernels - Causal Conv1D

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.23s + | + +Raw +GitHub +
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Wed Oct 29 00:36:08 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   30C    P0             87W /  350W |       0MiB /  46068MiB |     18%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

Causal Conv1D Benchmark

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 6.07s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the causal conv1d kernel
+causal_conv1d = get_kernel("kernels-community/causal-conv1d")
+
+
+def hf_kernels_causal_conv1d(input_tensor, weight, bias):
+    return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
+    impl_name="hf_kernels_causal_conv1d",
+    impl_tags={"family": "hf-kernels", "backend": "cuda"},
+    impl_func=hf_kernels_causal_conv1d,
+)
+
+ +
+
+
+
+
Running causal_conv1d benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     144.543us      3556.67%     144.543us     144.543us             1  
+                               hf_kernels_causal_conv1d         8.86%     163.685us        99.64%       1.842ms       1.842ms       0.000us         0.00%       5.504us       5.504us             1  
+                                         CausalConv1dFn         5.76%     106.513us        90.78%       1.678ms     559.289us       0.000us         0.00%       5.504us       1.835us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         2.24%      41.381us        81.42%       1.505ms     501.611us       4.064us       100.00%       5.504us       1.835us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
+                                Activity Buffer Request        76.78%       1.419ms        76.78%       1.419ms       1.419ms       1.440us        35.43%       1.440us       1.440us             1  
+                                       aten::empty_like         0.97%      17.931us         3.60%      66.522us      22.174us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         2.63%      48.591us         2.63%      48.591us      16.197us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.40%      44.403us         2.40%      44.403us      14.801us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.36%       6.650us         0.36%       6.650us       6.650us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.848ms
+Self CUDA time total: 4.064us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.574us      3378.19%     128.574us     128.574us             1  
+                               hf_kernels_causal_conv1d         6.39%     108.804us        99.67%       1.696ms       1.696ms       0.000us         0.00%       5.085us       5.085us             1  
+                                         CausalConv1dFn         4.62%      78.561us        93.27%       1.588ms     529.188us       0.000us         0.00%       5.085us       1.695us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.51%      25.693us        86.85%       1.478ms     492.734us       3.806us       100.00%       5.085us       1.695us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.806us       100.00%       3.806us       1.269us             3  
+                                Activity Buffer Request        83.54%       1.422ms        83.54%       1.422ms       1.422ms       1.279us        33.60%       1.279us       1.279us             1  
+                                       aten::empty_like         0.47%       8.001us         1.81%      30.802us      10.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.34%      22.801us         1.34%      22.801us       7.600us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.80%      30.601us         1.80%      30.601us      10.200us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.33%       5.681us         0.33%       5.681us       5.681us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.702ms
+Self CUDA time total: 3.806us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.111us      3339.80%     126.111us     126.111us             1  
+                               hf_kernels_causal_conv1d         5.63%      95.933us        99.70%       1.698ms       1.698ms       0.000us         0.00%       5.056us       5.056us             1  
+                                         CausalConv1dFn         4.46%      75.892us        94.07%       1.602ms     534.022us       0.000us         0.00%       5.056us       1.685us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.40%      23.785us        87.77%       1.495ms     498.271us       3.776us       100.00%       5.056us       1.685us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.776us       100.00%       3.776us       1.259us             3  
+                                Activity Buffer Request        84.61%       1.441ms        84.61%       1.441ms       1.441ms       1.280us        33.90%       1.280us       1.280us             1  
+                                       aten::empty_like         0.49%       8.320us         1.84%      31.360us      10.453us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.35%      23.040us         1.35%      23.040us       7.680us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.77%      30.070us         1.77%      30.070us      10.023us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.30%       5.061us         0.30%       5.061us       5.061us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.703ms
+Self CUDA time total: 3.776us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     128.703us      3352.51%     128.703us     128.703us             1  
+                               hf_kernels_causal_conv1d         5.02%      93.431us        99.72%       1.856ms       1.856ms       0.000us         0.00%       5.119us       5.119us             1  
+                                         CausalConv1dFn         4.18%      77.825us        94.70%       1.762ms     587.414us       0.000us         0.00%       5.119us       1.706us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.36%      25.311us        88.83%       1.653ms     551.005us       3.839us       100.00%       5.119us       1.706us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.839us       100.00%       3.839us       1.280us             3  
+                                Activity Buffer Request        76.83%       1.430ms        76.83%       1.430ms       1.430ms       1.280us        33.34%       1.280us       1.280us             1  
+                                       aten::empty_like         0.45%       8.401us         1.69%      31.401us      10.467us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.24%      23.000us         1.24%      23.000us       7.667us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        10.65%     198.147us        10.65%     198.147us      66.049us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.120us         0.28%       5.120us       5.120us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.861ms
+Self CUDA time total: 3.839us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     125.247us      2609.31%     125.247us     125.247us             1  
+                               hf_kernels_causal_conv1d         5.46%      99.082us        99.73%       1.809ms       1.809ms       0.000us         0.00%       6.432us       6.432us             1  
+                                         CausalConv1dFn         4.18%      75.835us        94.27%       1.709ms     569.830us       0.000us         0.00%       6.432us       2.144us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.40%      25.379us        88.34%       1.602ms     533.975us       4.800us       100.00%       6.432us       2.144us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.800us       100.00%       4.800us       1.600us             3  
+                                Activity Buffer Request        77.97%       1.414ms        77.97%       1.414ms       1.414ms       1.632us        34.00%       1.632us       1.632us             1  
+                                       aten::empty_like         0.46%       8.420us         1.75%      31.730us      10.577us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.29%      23.310us         1.29%      23.310us       7.770us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.97%     162.627us         8.97%     162.627us      54.209us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.860us         0.27%       4.860us       4.860us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.813ms
+Self CUDA time total: 4.800us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.190us      2487.38%     120.190us     120.190us             1  
+                               hf_kernels_causal_conv1d        14.45%      80.914us        99.14%     554.970us     554.970us       0.000us         0.00%       6.464us       6.464us             1  
+                                         CausalConv1dFn        12.94%      72.432us        84.69%     474.056us     158.019us       0.000us         0.00%       6.464us       2.155us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.57%      25.572us        66.53%     372.404us     124.135us       4.832us       100.00%       6.464us       2.155us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.832us       100.00%       4.832us       1.611us             3  
+                                Activity Buffer Request        34.22%     191.566us        34.22%     191.566us     191.566us       1.632us        33.77%       1.632us       1.632us             1  
+                                       aten::empty_like         1.40%       7.860us         5.22%      29.220us       9.740us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.82%      21.360us         3.82%      21.360us       7.120us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.74%     155.266us        27.74%     155.266us      51.755us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.86%       4.800us         0.86%       4.800us       4.800us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 559.770us
+Self CUDA time total: 4.832us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     133.279us      1243.27%     133.279us     133.279us             1  
+                               hf_kernels_causal_conv1d         5.54%     100.182us        99.73%       1.805ms       1.805ms       0.000us         0.00%      14.336us      14.336us             1  
+                                         CausalConv1dFn         4.54%      82.173us        94.20%       1.705ms     568.267us       0.000us         0.00%      14.336us       4.779us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.47%      26.531us        87.96%       1.592ms     530.609us      10.720us       100.00%      14.336us       4.779us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.720us       100.00%      10.720us       3.573us             3  
+                                Activity Buffer Request        77.82%       1.408ms        77.82%       1.408ms       1.408ms       3.616us        33.73%       3.616us       3.616us             1  
+                                       aten::empty_like         0.46%       8.260us         1.70%      30.801us      10.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.25%      22.541us         1.25%      22.541us       7.514us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.67%     156.947us         8.67%     156.947us      52.316us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.830us         0.27%       4.830us       4.830us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.810ms
+Self CUDA time total: 10.720us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.037us      1127.54%     123.037us     123.037us             1  
+                               hf_kernels_causal_conv1d        20.63%     102.765us        99.04%     493.397us     493.397us       0.000us         0.00%      14.592us      14.592us             1  
+                                         CausalConv1dFn        14.78%      73.650us        78.41%     390.632us     130.211us       0.000us         0.00%      14.592us       4.864us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.23%      26.041us        57.43%     286.091us      95.364us      10.912us       100.00%      14.592us       4.864us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      10.912us       100.00%      10.912us       3.637us             3  
+                                Activity Buffer Request        21.15%     105.364us        21.15%     105.364us     105.364us       3.680us        33.72%       3.680us       3.680us             1  
+                                       aten::empty_like         1.51%       7.510us         6.20%      30.891us      10.297us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.69%      23.381us         4.69%      23.381us       7.794us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.05%     154.686us        31.05%     154.686us      51.562us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.96%       4.790us         0.96%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 498.187us
+Self CUDA time total: 10.912us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.944us      1189.53%     130.944us     130.944us             1  
+                               hf_kernels_causal_conv1d         5.42%      97.593us        99.72%       1.796ms       1.796ms       0.000us         0.00%      14.720us      14.720us             1  
+                                         CausalConv1dFn         4.08%      73.404us        94.31%       1.699ms     566.233us       0.000us         0.00%      14.720us       4.907us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.44%      26.001us        88.45%       1.593ms     531.068us      11.008us       100.00%      14.720us       4.907us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.008us       100.00%      11.008us       3.669us             3  
+                                Activity Buffer Request        78.36%       1.411ms        78.36%       1.411ms       1.411ms       3.712us        33.72%       3.712us       3.712us             1  
+                                       aten::empty_like         0.46%       8.350us         1.78%      32.090us      10.697us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.32%      23.740us         1.32%      23.740us       7.913us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.65%     155.786us         8.65%     155.786us      51.929us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       4.990us         0.28%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.801ms
+Self CUDA time total: 11.008us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     122.014us      1080.15%     122.014us     122.014us             1  
+                               hf_kernels_causal_conv1d        12.40%      73.852us        99.19%     590.511us     590.511us       0.000us         0.00%      15.104us      15.104us             1  
+                                         CausalConv1dFn        12.35%      73.524us        86.78%     516.659us     172.220us       0.000us         0.00%      15.104us       5.035us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.03%      24.020us        69.45%     413.474us     137.825us      11.296us       100.00%      15.104us       5.035us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      11.296us       100.00%      11.296us       3.765us             3  
+                                Activity Buffer Request        38.81%     231.068us        38.81%     231.068us     231.068us       3.808us        33.71%       3.808us       3.808us             1  
+                                       aten::empty_like         1.25%       7.459us         4.98%      29.661us       9.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.73%      22.202us         3.73%      22.202us       7.401us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        26.60%     158.386us        26.60%     158.386us      52.795us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.81%       4.840us         0.81%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 595.351us
+Self CUDA time total: 11.296us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     135.582us       269.70%     135.582us     135.582us             1  
+                               hf_kernels_causal_conv1d        12.51%      76.722us        99.20%     608.371us     608.371us       0.000us         0.00%      83.711us      83.711us             1  
+                                         CausalConv1dFn        13.24%      81.202us        86.69%     531.649us     177.216us       0.000us         0.00%      83.711us      27.904us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.12%      25.291us        68.50%     420.085us     140.028us      50.271us       100.00%      83.711us      27.904us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      50.271us       100.00%      50.271us      16.757us             3  
+                                Activity Buffer Request        38.84%     238.229us        38.84%     238.229us     238.229us      33.440us        66.52%      33.440us      33.440us             1  
+                                       aten::empty_like         1.27%       7.790us         4.95%      30.362us      10.121us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.68%      22.572us         3.68%      22.572us       7.524us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        25.53%     156.565us        25.53%     156.565us      52.188us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.80%       4.910us         0.80%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 613.281us
+Self CUDA time total: 50.271us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     127.740us       248.41%     127.740us     127.740us             1  
+                               hf_kernels_causal_conv1d        15.37%      77.574us        99.04%     499.998us     499.998us       0.000us         0.00%      85.854us      85.854us             1  
+                                         CausalConv1dFn        14.63%      73.842us        83.68%     422.424us     140.808us       0.000us         0.00%      85.854us      28.618us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.23%      26.412us        63.27%     319.402us     106.467us      51.423us       100.00%      85.854us      28.618us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      51.423us       100.00%      51.423us      17.141us             3  
+                                Activity Buffer Request        27.23%     137.484us        27.23%     137.484us     137.484us      34.431us        66.96%      34.431us      34.431us             1  
+                                       aten::empty_like         1.41%       7.140us         5.78%      29.180us       9.727us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.37%      22.040us         4.37%      22.040us       7.347us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.80%     155.506us        30.80%     155.506us      51.835us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.96%       4.831us         0.96%       4.831us       4.831us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 504.829us
+Self CUDA time total: 51.423us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     117.437us      3008.12%     117.437us     117.437us             1  
+                               hf_kernels_causal_conv1d        12.18%      74.242us        99.17%     604.340us     604.340us       0.000us         0.00%       5.152us       5.152us             1  
+                                         CausalConv1dFn        11.66%      71.062us        86.99%     530.098us     176.699us       0.000us         0.00%       5.152us       1.717us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.18%      25.499us        70.51%     429.675us     143.225us       3.904us       100.00%       5.152us       1.717us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.904us       100.00%       3.904us       1.301us             3  
+                                Activity Buffer Request        41.02%     249.979us        41.02%     249.979us     249.979us       1.248us        31.97%       1.248us       1.248us             1  
+                                       aten::empty_like         1.33%       8.110us         4.82%      29.361us       9.787us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.49%      21.251us         3.49%      21.251us       7.084us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        25.30%     154.197us        25.30%     154.197us      51.399us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.83%       5.050us         0.83%       5.050us       5.050us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 609.390us
+Self CUDA time total: 3.904us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     157.920us      4012.20%     157.920us     157.920us             1  
+                               hf_kernels_causal_conv1d        19.90%     106.583us        99.11%     530.709us     530.709us       0.000us         0.00%       5.216us       5.216us             1  
+                                         CausalConv1dFn        15.55%      83.245us        79.21%     424.126us     141.375us       0.000us         0.00%       5.216us       1.739us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.02%      26.862us        57.76%     309.281us     103.094us       3.936us       100.00%       5.216us       1.739us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       3.936us       100.00%       3.936us       1.312us             3  
+                                Activity Buffer Request        22.25%     119.154us        22.25%     119.154us     119.154us       1.280us        32.52%       1.280us       1.280us             1  
+                                       aten::empty_like         1.55%       8.320us         5.90%      31.600us      10.533us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.35%      23.280us         4.35%      23.280us       7.760us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        30.49%     163.265us        30.49%     163.265us      54.422us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.89%       4.750us         0.89%       4.750us       4.750us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 535.459us
+Self CUDA time total: 3.936us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.777us      2998.47%     123.777us     123.777us             1  
+                               hf_kernels_causal_conv1d        13.54%      78.054us        99.15%     571.700us     571.700us       0.000us         0.00%       5.504us       5.504us             1  
+                                         CausalConv1dFn        12.84%      74.051us        85.62%     493.646us     164.549us       0.000us         0.00%       5.504us       1.835us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.19%      24.152us        67.43%     388.784us     129.595us       4.128us       100.00%       5.504us       1.835us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.128us       100.00%       4.128us       1.376us             3  
+                                Activity Buffer Request        36.14%     208.368us        36.14%     208.368us     208.368us       1.376us        33.33%       1.376us       1.376us             1  
+                                       aten::empty_like         1.37%       7.901us         5.34%      30.811us      10.270us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.97%      22.910us         3.97%      22.910us       7.637us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        27.10%     156.264us        27.10%     156.264us      52.088us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.85%       4.881us         0.85%       4.881us       4.881us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 576.581us
+Self CUDA time total: 4.128us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     118.875us      2925.07%     118.875us     118.875us             1  
+                               hf_kernels_causal_conv1d        17.67%      83.134us        98.92%     465.527us     465.527us       0.000us         0.00%       5.440us       5.440us             1  
+                                         CausalConv1dFn        15.04%      70.762us        81.26%     382.393us     127.464us       0.000us         0.00%       5.440us       1.813us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         4.98%      23.432us        59.87%     281.731us      93.910us       4.064us       100.00%       5.440us       1.813us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       4.064us       100.00%       4.064us       1.355us             3  
+                                Activity Buffer Request        22.07%     103.873us        22.07%     103.873us     103.873us       1.376us        33.86%       1.376us       1.376us             1  
+                                       aten::empty_like         1.61%       7.590us         6.35%      29.900us       9.967us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.74%      22.310us         4.74%      22.310us       7.437us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.82%     154.426us        32.82%     154.426us      51.475us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.08%       5.061us         1.08%       5.061us       5.061us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 470.588us
+Self CUDA time total: 4.064us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     126.722us      2343.23%     126.722us     126.722us             1  
+                               hf_kernels_causal_conv1d        12.92%     104.393us        99.42%     803.188us     803.188us       0.000us         0.00%       7.264us       7.264us             1  
+                                         CausalConv1dFn         9.39%      75.863us        86.50%     698.795us     232.932us       0.000us         0.00%       7.264us       2.421us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         3.09%      24.969us        73.13%     590.770us     196.923us       5.408us       100.00%       7.264us       2.421us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.408us       100.00%       5.408us       1.803us             3  
+                                Activity Buffer Request        49.73%     401.794us        49.73%     401.794us     401.794us       1.856us        34.32%       1.856us       1.856us             1  
+                                       aten::empty_like         0.96%       7.780us         3.98%      32.162us      10.721us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         3.02%      24.382us         3.02%      24.382us       8.127us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        20.30%     164.007us        20.30%     164.007us      54.669us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.58%       4.700us         0.58%       4.700us       4.700us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 807.888us
+Self CUDA time total: 5.408us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     120.446us      2201.13%     120.446us     120.446us             1  
+                               hf_kernels_causal_conv1d        18.67%      89.551us        99.02%     474.966us     474.966us       0.000us         0.00%       7.328us       7.328us             1  
+                                         CausalConv1dFn        15.56%      74.654us        80.35%     385.415us     128.472us       0.000us         0.00%       7.328us       2.443us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.05%      24.231us        58.47%     280.459us      93.486us       5.472us       100.00%       7.328us       2.443us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us       5.472us       100.00%       5.472us       1.824us             3  
+                                Activity Buffer Request        20.97%     100.573us        20.97%     100.573us     100.573us       1.856us        33.92%       1.856us       1.856us             1  
+                                       aten::empty_like         1.52%       7.312us         6.32%      30.302us      10.101us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.79%      22.990us         4.79%      22.990us       7.663us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.45%     155.655us        32.45%     155.655us      51.885us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.98%       4.720us         0.98%       4.720us       4.720us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 479.686us
+Self CUDA time total: 5.472us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.208us       742.52%     130.208us     130.208us             1  
+                               hf_kernels_causal_conv1d         5.57%     103.684us        99.74%       1.855ms       1.855ms       0.000us         0.00%      23.424us      23.424us             1  
+                                         CausalConv1dFn         4.08%      75.922us        94.16%       1.751ms     583.780us       0.000us         0.00%      23.424us       7.808us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.22%      22.672us        88.43%       1.645ms     548.249us      17.536us       100.00%      23.424us       7.808us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.536us       100.00%      17.536us       5.845us             3  
+                                Activity Buffer Request        78.77%       1.465ms        78.77%       1.465ms       1.465ms       5.888us        33.58%       5.888us       5.888us             1  
+                                       aten::empty_like         0.43%       7.931us         1.65%      30.671us      10.224us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.22%      22.740us         1.22%      22.740us       7.580us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.44%     157.016us         8.44%     157.016us      52.339us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.26%       4.860us         0.26%       4.860us       4.860us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.860ms
+Self CUDA time total: 17.536us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.970us       691.76%     123.970us     123.970us             1  
+                               hf_kernels_causal_conv1d        18.87%      88.734us        98.86%     464.856us     464.856us       0.000us         0.00%      23.905us      23.905us             1  
+                                         CausalConv1dFn        15.17%      71.352us        79.99%     376.122us     125.374us       0.000us         0.00%      23.905us       7.968us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.25%      24.691us        58.28%     274.030us      91.343us      17.921us       100.00%      23.905us       7.968us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      17.921us       100.00%      17.921us       5.974us             3  
+                                Activity Buffer Request        19.83%      93.233us        19.83%      93.233us      93.233us       5.984us        33.39%       5.984us       5.984us             1  
+                                       aten::empty_like         1.60%       7.540us         6.54%      30.740us      10.247us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.93%      23.200us         4.93%      23.200us       7.733us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        33.20%     156.106us        33.20%     156.106us      52.035us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.14%       5.350us         1.14%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 470.206us
+Self CUDA time total: 17.921us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     130.879us       726.50%     130.879us     130.879us             1  
+                               hf_kernels_causal_conv1d         5.43%      99.212us        99.73%       1.824ms       1.824ms       0.000us         0.00%      24.063us      24.063us             1  
+                                         CausalConv1dFn         4.16%      76.013us        94.31%       1.725ms     574.860us       0.000us         0.00%      24.063us       8.021us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.28%      23.352us        88.43%       1.617ms     539.055us      18.015us       100.00%      24.063us       8.021us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.015us       100.00%      18.015us       6.005us             3  
+                                Activity Buffer Request        78.67%       1.439ms        78.67%       1.439ms       1.439ms       6.048us        33.57%       6.048us       6.048us             1  
+                                       aten::empty_like         0.41%       7.570us         1.72%      31.401us      10.467us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.30%      23.831us         1.30%      23.831us       7.944us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         8.49%     155.235us         8.49%     155.235us      51.745us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.27%       4.890us         0.27%       4.890us       4.890us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.829ms
+Self CUDA time total: 18.015us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     123.645us       665.08%     123.645us     123.645us             1  
+                               hf_kernels_causal_conv1d        22.59%     109.155us        99.05%     478.537us     478.537us       0.000us         0.00%      24.830us      24.830us             1  
+                                         CausalConv1dFn        15.84%      76.521us        76.45%     369.382us     123.127us       0.000us         0.00%      24.830us       8.277us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.13%      24.791us        54.61%     263.860us      87.953us      18.591us       100.00%      24.830us       8.277us             3  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      18.591us       100.00%      18.591us       6.197us             3  
+                                Activity Buffer Request        17.60%      85.023us        17.60%      85.023us      85.023us       6.239us        33.56%       6.239us       6.239us             1  
+                                       aten::empty_like         1.53%       7.411us         6.00%      29.001us       9.667us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.47%      21.590us         4.47%      21.590us       7.197us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        31.88%     154.046us        31.88%     154.046us      51.349us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.95%       4.601us         0.95%       4.601us       4.601us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 483.138us
+Self CUDA time total: 18.591us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d         5.67%     104.074us        99.72%       1.829ms       1.829ms       0.000us         0.00%     162.623us     162.623us             1  
+                                         CausalConv1dFn         4.47%      81.893us        94.05%       1.725ms     574.926us       0.000us         0.00%     162.623us      54.208us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         1.47%      26.950us        87.82%       1.611ms     536.865us      97.823us       100.00%     162.623us      54.208us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     146.719us       149.98%     146.719us     146.719us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      97.823us       100.00%      97.823us      32.608us             3  
+                                Activity Buffer Request        77.02%       1.413ms        77.02%       1.413ms       1.413ms      64.800us        66.24%      64.800us      64.800us             1  
+                                       aten::empty_like         0.45%       8.219us         1.76%      32.292us      10.764us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         1.31%      24.073us         1.31%      24.073us       8.024us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         9.33%     171.076us         9.33%     171.076us      57.025us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         0.28%       5.071us         0.28%       5.071us       5.071us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.834ms
+Self CUDA time total: 97.823us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                               hf_kernels_causal_conv1d        19.41%      95.622us        98.95%     487.536us     487.536us       0.000us         0.00%     165.309us     165.309us             1  
+                                         CausalConv1dFn        15.06%      74.214us        79.54%     391.914us     130.638us       0.000us         0.00%     165.309us      55.103us             3  
+              _causal_conv1d_90f5a60::causal_conv1d_fwd         5.22%      25.702us        58.53%     288.390us      96.130us      99.646us       100.00%     165.309us      55.103us             3  
+                               hf_kernels_causal_conv1d         0.00%       0.000us         0.00%       0.000us       0.000us     134.941us       135.42%     134.941us     134.941us             1  
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern...         0.00%       0.000us         0.00%       0.000us       0.000us      99.646us       100.00%      99.646us      33.215us             3  
+                                Activity Buffer Request        20.90%     102.993us        20.90%     102.993us     102.993us      65.663us        65.90%      65.663us      65.663us             1  
+                                       aten::empty_like         1.51%       7.430us         5.95%      29.310us       9.770us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         4.44%      21.880us         4.44%      21.880us       7.293us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel        32.41%     159.695us        32.41%     159.695us      53.232us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize         1.05%       5.180us         1.05%       5.180us       5.180us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 492.716us
+Self CUDA time total: 99.646us
+
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_causal_conv1d cuda_B2_D2048_S128_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D2048_S128_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D2048_S512_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D2048_S512_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S128_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S128_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
+
+
+
▶ UV Install Logs
+ +
+
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] +Fetching 11 files: 18%|█▊ | 2/11 [00:00<00:00, 14.15it/s] +Fetching 11 files: 64%|██████▎ | 7/11 [00:02<00:01, 3.20it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:02<00:00, 5.38it/s]
+
+

Artifacts:

+causal_conv1d.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/causal_conv1d/impls/index.html b/causal_conv1d/impls/index.html new file mode 100644 index 0000000000000000000000000000000000000000..53e1498a3461d88697955f51c8d45218113d21f0 --- /dev/null +++ b/causal_conv1d/impls/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /causal_conv1d/impls + + + +
+ ← back +
+

Index of /causal_conv1d/impls

+ + + \ No newline at end of file diff --git a/causal_conv1d/impls/torch_causal_conv1d.html b/causal_conv1d/impls/torch_causal_conv1d.html new file mode 100644 index 0000000000000000000000000000000000000000..ff8df9ac4df5c3feb7604ba7136a718d0e2413fd --- /dev/null +++ b/causal_conv1d/impls/torch_causal_conv1d.html @@ -0,0 +1,4787 @@ + + + + + + torch_causal_conv1d + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

PyTorch Native - Causal Conv1D

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.23s + | + +Raw +GitHub +
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Wed Oct 29 00:36:08 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   30C    P0             87W /  350W |       0MiB /  46068MiB |     18%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

Causal Conv1D Benchmark (PyTorch Native)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 7.30s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import torch.nn.functional as F
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_causal_conv1d(input_tensor, weight, bias):
+    # Convert to weight dtype for computation
+    x = input_tensor.to(weight.dtype)
+    dim = weight.shape[0]
+    width = weight.shape[1]
+    seqlen = input_tensor.shape[-1]
+
+    # Depthwise causal conv1d using PyTorch
+    out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
+
+    # Truncate to original sequence length
+    out = out[..., :seqlen]
+
+    # Convert back to original dtype
+    return out.to(input_tensor.dtype)
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
+    impl_name="torch_eager",
+    impl_tags={"family": "pytorch", "backend": "eager"},
+    impl_func=torch_causal_conv1d,
+)
+
+ +
+
+
+
+
Running causal_conv1d benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     460.509us      2386.43%     460.509us     460.509us             1  
+                                            torch_eager        10.46%     229.787us        99.65%       2.189ms       2.189ms       0.000us         0.00%      21.633us      21.633us             1  
+                                               aten::to         0.59%      12.913us        79.38%       1.743ms     290.578us       0.000us         0.00%      14.272us       2.379us             6  
+                                         aten::_to_copy         1.99%      43.750us        78.79%       1.731ms     288.426us       0.000us         0.00%      14.272us       2.379us             6  
+                                            aten::copy_         2.89%      63.562us        74.16%       1.629ms     271.469us      11.936us        61.85%      14.272us       2.379us             6  
+                                           aten::conv1d         0.44%       9.671us         7.66%     168.306us      56.102us       0.000us         0.00%       7.361us       2.454us             3  
+                                      aten::convolution         0.72%      15.890us         7.22%     158.635us      52.878us       0.000us         0.00%       7.361us       2.454us             3  
+                                     aten::_convolution         1.69%      37.102us         6.50%     142.745us      47.582us       0.000us         0.00%       7.361us       2.454us             3  
+                                aten::_conv_depthwise2d         1.60%      35.230us         3.77%      82.773us      27.591us       7.361us        38.15%       7.361us       2.454us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.361us        38.15%       7.361us       2.454us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        32.50%       6.272us       2.091us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.35%       5.664us       1.888us             3  
+                                Activity Buffer Request        68.26%       1.499ms        68.26%       1.499ms       1.499ms       2.336us        12.11%       2.336us       2.336us             1  
+                                    aten::empty_strided         2.64%      57.992us         2.64%      57.992us       9.665us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         4.12%      90.443us         4.12%      90.443us      10.049us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.47%      32.392us         1.88%      41.212us       4.579us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.64%      14.011us         0.64%      14.011us       0.934us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      12.120us         0.55%      12.120us       4.040us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.50%      10.961us         0.50%      10.961us       3.654us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.43%       9.410us         0.51%      11.220us       3.740us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.196ms
+Self CUDA time total: 19.297us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     350.557us      1795.89%     350.557us     350.557us             1  
+                                            torch_eager         6.82%     130.236us        99.71%       1.905ms       1.905ms       0.000us         0.00%      21.632us      21.632us             1  
+                                               aten::to         0.35%       6.597us        84.97%       1.623ms     270.580us       0.000us         0.00%      13.728us       2.288us             6  
+                                         aten::_to_copy         1.27%      24.323us        84.63%       1.617ms     269.481us       0.000us         0.00%      13.728us       2.288us             6  
+                                            aten::copy_         2.68%      51.130us        81.67%       1.560ms     260.072us      11.616us        59.51%      13.728us       2.288us             6  
+                                           aten::conv1d         0.33%       6.400us         6.43%     122.914us      40.971us       0.000us         0.00%       7.904us       2.635us             3  
+                                      aten::convolution         0.52%       9.901us         6.10%     116.514us      38.838us       0.000us         0.00%       7.904us       2.635us             3  
+                                     aten::_convolution         1.28%      24.410us         5.58%     106.613us      35.538us       0.000us         0.00%       7.904us       2.635us             3  
+                                aten::_conv_depthwise2d         1.25%      23.932us         3.35%      63.983us      21.328us       7.904us        40.49%       7.904us       2.635us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        40.49%       7.904us       2.635us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.080us        31.15%       6.080us       2.027us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.536us        28.36%       5.536us       1.845us             3  
+                                Activity Buffer Request        76.19%       1.456ms        76.19%       1.456ms       1.456ms       2.112us        10.82%       2.112us       2.112us             1  
+                                    aten::empty_strided         1.68%      32.131us         1.68%      32.131us       5.355us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.93%      75.003us         3.93%      75.003us       8.334us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.97%      18.540us         1.29%      24.620us       2.736us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.51%       9.711us         0.51%       9.711us       0.647us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.51%       9.650us         0.51%       9.650us       3.217us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.47%       9.000us         0.47%       9.000us       3.000us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.37%       7.100us         0.45%       8.560us       2.853us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.911ms
+Self CUDA time total: 19.520us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     379.390us      2047.55%     379.390us     379.390us             1  
+                                            torch_eager         8.20%     159.835us        99.65%       1.942ms       1.942ms       0.000us         0.00%      20.449us      20.449us             1  
+                                               aten::to         0.37%       7.179us        83.32%       1.624ms     270.686us       0.000us         0.00%      13.536us       2.256us             6  
+                                         aten::_to_copy         1.40%      27.213us        82.96%       1.617ms     269.489us       0.000us         0.00%      13.536us       2.256us             6  
+                                            aten::copy_         2.62%      51.160us        79.92%       1.558ms     259.635us      11.616us        62.69%      13.536us       2.256us             6  
+                                           aten::conv1d         0.34%       6.560us         6.49%     126.453us      42.151us       0.000us         0.00%       6.913us       2.304us             3  
+                                      aten::convolution         0.57%      11.119us         6.15%     119.893us      39.964us       0.000us         0.00%       6.913us       2.304us             3  
+                                     aten::_convolution         1.29%      25.191us         5.58%     108.774us      36.258us       0.000us         0.00%       6.913us       2.304us             3  
+                                aten::_conv_depthwise2d         1.16%      22.580us         3.36%      65.502us      21.834us       6.913us        37.31%       6.913us       2.304us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.913us        37.31%       6.913us       2.304us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        31.95%       5.920us       1.973us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.74%       5.696us       1.899us             3  
+                                Activity Buffer Request        74.82%       1.458ms        74.82%       1.458ms       1.458ms       1.920us        10.36%       1.920us       1.920us             1  
+                                    aten::empty_strided         1.64%      31.911us         1.64%      31.911us       5.319us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.59%      70.043us         3.59%      70.043us       7.783us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.01%      19.612us         1.35%      26.392us       2.932us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.55%      10.750us         0.55%      10.750us       0.717us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.62%      12.182us         0.62%      12.182us       4.061us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.46%       8.910us         0.46%       8.910us       2.970us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.35%       6.890us         0.42%       8.260us       2.753us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.949ms
+Self CUDA time total: 18.529us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.058us      1736.41%     340.058us     340.058us             1  
+                                            torch_eager         6.15%     129.375us        99.74%       2.097ms       2.097ms       0.000us         0.00%      21.760us      21.760us             1  
+                                               aten::to         0.32%       6.700us        86.45%       1.818ms     303.002us       0.000us         0.00%      14.112us       2.352us             6  
+                                         aten::_to_copy         1.17%      24.651us        86.13%       1.811ms     301.886us       0.000us         0.00%      14.112us       2.352us             6  
+                                            aten::copy_         2.42%      50.883us        83.54%       1.757ms     292.785us      11.936us        60.95%      14.112us       2.352us             6  
+                                           aten::conv1d         0.30%       6.290us         5.74%     120.803us      40.268us       0.000us         0.00%       7.648us       2.549us             3  
+                                      aten::convolution         0.48%      10.020us         5.45%     114.513us      38.171us       0.000us         0.00%       7.648us       2.549us             3  
+                                     aten::_convolution         1.15%      24.209us         4.97%     104.493us      34.831us       0.000us         0.00%       7.648us       2.549us             3  
+                                aten::_conv_depthwise2d         1.00%      21.080us         2.93%      61.691us      20.564us       7.648us        39.05%       7.648us       2.549us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.648us        39.05%       7.648us       2.549us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.208us        31.70%       6.208us       2.069us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.25%       5.728us       1.909us             3  
+                                Activity Buffer Request        71.15%       1.496ms        71.15%       1.496ms       1.496ms       2.176us        11.11%       2.176us       2.176us             1  
+                                    aten::empty_strided         1.42%      29.951us         1.42%      29.951us       4.992us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.98%     230.807us        10.98%     230.807us      25.645us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.94%      19.863us         1.21%      25.543us       2.838us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.46%       9.630us         0.46%       9.630us       0.642us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.50%      10.541us         0.50%      10.541us       3.514us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.42%       8.810us         0.42%       8.810us       2.937us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.35%       7.411us         0.44%       9.201us       3.067us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.103ms
+Self CUDA time total: 19.584us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.070us      1381.53%     339.070us     339.070us             1  
+                                            torch_eager         6.44%     132.135us        99.72%       2.045ms       2.045ms       0.000us         0.00%      26.814us      26.814us             1  
+                                               aten::to         0.33%       6.722us        86.08%       1.765ms     294.155us       0.000us         0.00%      15.262us       2.544us             6  
+                                         aten::_to_copy         1.20%      24.702us        85.75%       1.758ms     293.035us       0.000us         0.00%      15.262us       2.544us             6  
+                                            aten::copy_         2.39%      49.030us        83.04%       1.702ms     283.750us      12.991us        52.93%      15.262us       2.544us             6  
+                                           aten::conv1d         0.29%       5.850us         5.78%     118.603us      39.534us       0.000us         0.00%      11.552us       3.851us             3  
+                                      aten::convolution         0.55%      11.220us         5.50%     112.753us      37.584us       0.000us         0.00%      11.552us       3.851us             3  
+                                     aten::_convolution         1.18%      24.170us         4.95%     101.533us      33.844us       0.000us         0.00%      11.552us       3.851us             3  
+                                aten::_conv_depthwise2d         1.08%      22.212us         2.99%      61.273us      20.424us      11.552us        47.07%      11.552us       3.851us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        47.07%      11.552us       3.851us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.655us        27.12%       6.655us       2.218us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        25.82%       6.336us       2.112us             3  
+                                Activity Buffer Request        71.25%       1.461ms        71.25%       1.461ms       1.461ms       2.271us         9.25%       2.271us       2.271us             1  
+                                    aten::empty_strided         1.51%      31.010us         1.51%      31.010us       5.168us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.41%     213.527us        10.41%     213.527us      23.725us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      18.350us         1.15%      23.660us       2.629us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.131us         0.45%       9.131us       0.609us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.46%       9.481us         0.46%       9.481us       3.160us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.43%       8.760us         0.43%       8.760us       2.920us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       5.520us         0.33%       6.850us       2.283us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.050ms
+Self CUDA time total: 24.543us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.129us      1305.15%     339.129us     339.129us             1  
+                                            torch_eager         6.29%     128.886us        99.74%       2.043ms       2.043ms       0.000us         0.00%      28.224us      28.224us             1  
+                                               aten::to         0.34%       6.902us        86.10%       1.763ms     293.882us       0.000us         0.00%      15.168us       2.528us             6  
+                                         aten::_to_copy         1.23%      25.190us        85.76%       1.756ms     292.731us       0.000us         0.00%      15.168us       2.528us             6  
+                                            aten::copy_         2.41%      49.270us        83.08%       1.701ms     283.571us      12.928us        49.75%      15.168us       2.528us             6  
+                                           aten::conv1d         0.31%       6.370us         5.92%     121.333us      40.444us       0.000us         0.00%      13.056us       4.352us             3  
+                                      aten::convolution         0.49%      10.120us         5.61%     114.963us      38.321us       0.000us         0.00%      13.056us       4.352us             3  
+                                     aten::_convolution         1.25%      25.500us         5.12%     104.843us      34.948us       0.000us         0.00%      13.056us       4.352us             3  
+                                aten::_conv_depthwise2d         1.08%      22.212us         3.04%      62.243us      20.748us      13.056us        50.25%      13.056us       4.352us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.056us        50.25%      13.056us       4.352us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us        25.37%       6.592us       2.197us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        24.38%       6.336us       2.112us             3  
+                                Activity Buffer Request        71.41%       1.463ms        71.41%       1.463ms       1.463ms       2.240us         8.62%       2.240us       2.240us             1  
+                                    aten::empty_strided         1.45%      29.770us         1.45%      29.770us       4.962us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.25%     209.968us        10.25%     209.968us      23.330us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      18.870us         1.21%      24.780us       2.753us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.47%       9.601us         0.47%       9.601us       0.640us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.51%      10.510us         0.51%      10.510us       3.503us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.45%       9.181us         0.45%       9.181us       3.060us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       6.640us         0.40%       8.140us       2.713us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.048ms
+Self CUDA time total: 25.984us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     362.270us       942.63%     362.270us     362.270us             1  
+                                            torch_eager         7.50%     163.876us        99.75%       2.180ms       2.180ms       0.000us         0.00%      40.993us      40.993us             1  
+                                           aten::conv1d         0.34%       7.388us         5.94%     129.794us      43.265us       0.000us         0.00%      22.464us       7.488us             3  
+                                      aten::convolution         0.56%      12.301us         5.60%     122.406us      40.802us       0.000us         0.00%      22.464us       7.488us             3  
+                                     aten::_convolution         1.18%      25.829us         5.04%     110.105us      36.702us       0.000us         0.00%      22.464us       7.488us             3  
+                                aten::_conv_depthwise2d         1.07%      23.371us         2.94%      64.311us      21.437us      22.464us        58.45%      22.464us       7.488us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.464us        58.45%      22.464us       7.488us             3  
+                                               aten::to         0.36%       7.830us        84.95%       1.856ms     309.406us       0.000us         0.00%      18.529us       3.088us             6  
+                                         aten::_to_copy         1.44%      31.560us        84.59%       1.849ms     308.101us       0.000us         0.00%      18.529us       3.088us             6  
+                                            aten::copy_         2.41%      52.633us        81.64%       1.784ms     297.326us      15.968us        41.55%      18.529us       3.088us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.609us        22.40%       8.609us       2.870us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.359us        19.15%       7.359us       2.453us             3  
+                                Activity Buffer Request        65.39%       1.429ms        65.39%       1.429ms       1.429ms       2.561us         6.66%       2.561us       2.561us             1  
+                                    aten::empty_strided         1.51%      33.091us         1.51%      33.091us       5.515us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        14.87%     325.052us        14.87%     325.052us      36.117us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.00%      21.833us         1.21%      26.523us       2.947us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.39%       8.492us         0.39%       8.492us       0.566us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.44%       9.570us         0.44%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.40%       8.750us         0.40%       8.750us       2.917us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.37%       7.980us         0.45%       9.772us       3.257us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.185ms
+Self CUDA time total: 38.432us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.836us       827.74%     339.836us     339.836us             1  
+                                            torch_eager         6.54%     141.434us        99.74%       2.158ms       2.158ms       0.000us         0.00%      43.648us      43.648us             1  
+                                           aten::conv1d         0.28%       6.090us         5.53%     119.574us      39.858us       0.000us         0.00%      25.407us       8.469us             3  
+                                      aten::convolution         0.46%       9.939us         5.25%     113.484us      37.828us       0.000us         0.00%      25.407us       8.469us             3  
+                                     aten::_convolution         1.12%      24.214us         4.79%     103.545us      34.515us       0.000us         0.00%      25.407us       8.469us             3  
+                                aten::_conv_depthwise2d         1.05%      22.612us         2.94%      63.593us      21.198us      25.407us        61.88%      25.407us       8.469us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.407us        61.88%      25.407us       8.469us             3  
+                                               aten::to         0.29%       6.201us        86.38%       1.869ms     311.424us       0.000us         0.00%      18.241us       3.040us             6  
+                                         aten::_to_copy         1.18%      25.424us        86.09%       1.862ms     310.391us       0.000us         0.00%      18.241us       3.040us             6  
+                                            aten::copy_         2.40%      51.862us        83.52%       1.807ms     301.107us      15.649us        38.12%      18.241us       3.040us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.320us        20.27%       8.320us       2.773us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.329us        17.85%       7.329us       2.443us             3  
+                                Activity Buffer Request        68.07%       1.472ms        68.07%       1.472ms       1.472ms       2.592us         6.31%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.40%      30.280us         1.40%      30.280us       5.047us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        14.06%     304.169us        14.06%     304.169us      33.797us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.84%      18.230us         1.08%      23.418us       2.602us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.40%       8.619us         0.40%       8.619us       0.575us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%      10.370us         0.48%      10.370us       3.457us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       8.770us         0.41%       8.770us       2.923us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.26%       5.659us         0.32%       6.990us       2.330us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.163ms
+Self CUDA time total: 41.056us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     338.560us       329.80%     338.560us     338.560us             1  
+                                            torch_eager         6.25%     131.427us        99.74%       2.098ms       2.098ms       0.000us         0.00%     108.608us     108.608us             1  
+                                           aten::conv1d         0.29%       6.110us         5.71%     120.083us      40.028us       0.000us         0.00%      70.496us      23.499us             3  
+                                      aten::convolution         0.47%       9.940us         5.42%     113.973us      37.991us       0.000us         0.00%      70.496us      23.499us             3  
+                                     aten::_convolution         1.11%      23.441us         4.94%     104.033us      34.678us       0.000us         0.00%      70.496us      23.499us             3  
+                                aten::_conv_depthwise2d         1.04%      21.830us         2.93%      61.652us      20.551us      70.496us        68.67%      70.496us      23.499us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.496us        68.67%      70.496us      23.499us             3  
+                                               aten::to         0.30%       6.292us        86.43%       1.818ms     303.059us       0.000us         0.00%      38.112us       6.352us             6  
+                                         aten::_to_copy         1.17%      24.539us        86.13%       1.812ms     302.010us       0.000us         0.00%      38.112us       6.352us             6  
+                                            aten::copy_         2.47%      51.869us        83.58%       1.758ms     293.072us      32.160us        31.33%      38.112us       6.352us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.568us        17.11%      17.568us       5.856us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.592us        14.21%      14.592us       4.864us             3  
+                                Activity Buffer Request        67.63%       1.423ms        67.63%       1.423ms       1.423ms       5.952us         5.80%       5.952us       5.952us             1  
+                                    aten::empty_strided         1.38%      29.091us         1.38%      29.091us       4.849us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        14.47%     304.542us        14.47%     304.542us      33.838us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.91%      19.049us         1.17%      24.579us       2.731us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       9.070us         0.43%       9.070us       0.605us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.49%      10.351us         0.49%      10.351us       3.450us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       8.621us         0.41%       8.621us       2.874us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.38%       8.050us         0.45%       9.470us       3.157us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.104ms
+Self CUDA time total: 102.656us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.578us       301.93%     340.578us     340.578us             1  
+                                            torch_eager         6.29%     133.214us        99.74%       2.113ms       2.113ms       0.000us         0.00%     118.752us     118.752us             1  
+                                           aten::conv1d         0.31%       6.499us         5.66%     119.974us      39.991us       0.000us         0.00%      80.576us      26.859us             3  
+                                      aten::convolution         0.47%       9.880us         5.36%     113.475us      37.825us       0.000us         0.00%      80.576us      26.859us             3  
+                                     aten::_convolution         1.21%      25.730us         4.89%     103.595us      34.532us       0.000us         0.00%      80.576us      26.859us             3  
+                                aten::_conv_depthwise2d         1.01%      21.361us         2.87%      60.832us      20.277us      80.576us        71.43%      80.576us      26.859us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.576us        71.43%      80.576us      26.859us             3  
+                                               aten::to         0.33%       7.060us        86.42%       1.831ms     305.149us       0.000us         0.00%      38.176us       6.363us             6  
+                                         aten::_to_copy         1.15%      24.352us        86.09%       1.824ms     303.972us       0.000us         0.00%      38.176us       6.363us             6  
+                                            aten::copy_         2.34%      49.642us        83.57%       1.770ms     295.075us      32.224us        28.57%      38.176us       6.363us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.664us        15.66%      17.664us       5.888us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.560us        12.91%      14.560us       4.853us             3  
+                                Activity Buffer Request        68.62%       1.454ms        68.62%       1.454ms       1.454ms       5.952us         5.28%       5.952us       5.952us             1  
+                                    aten::empty_strided         1.37%      29.031us         1.37%      29.031us       4.838us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        13.59%     287.970us        13.59%     287.970us      31.997us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      18.772us         1.17%      24.871us       2.763us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.520us         0.45%       9.520us       0.635us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.46%       9.850us         0.46%       9.850us       3.283us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       8.670us         0.41%       8.670us       2.890us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       6.821us         0.38%       8.112us       2.704us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.119ms
+Self CUDA time total: 112.800us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         6.32%     133.665us        99.60%       2.106ms       2.106ms       0.000us         0.00%     433.181us     433.181us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     423.869us       107.93%     423.869us     423.869us             1  
+                                           aten::conv1d         0.30%       6.441us         5.98%     126.475us      42.158us       0.000us         0.00%     252.190us      84.063us             3  
+                                      aten::convolution         0.49%      10.391us         5.68%     120.034us      40.011us       0.000us         0.00%     252.190us      84.063us             3  
+                                     aten::_convolution         1.19%      25.110us         5.19%     109.643us      36.548us       0.000us         0.00%     252.190us      84.063us             3  
+                                aten::_conv_depthwise2d         1.07%      22.550us         3.14%      66.363us      22.121us     252.190us        64.21%     252.190us      84.063us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     252.190us        64.21%     252.190us      84.063us             3  
+                                               aten::to         0.33%       6.989us        85.86%       1.815ms     302.520us       0.000us         0.00%     180.991us      30.165us             6  
+                                         aten::_to_copy         1.18%      24.921us        85.53%       1.808ms     301.355us       0.000us         0.00%     180.991us      30.165us             6  
+                                            aten::copy_         2.39%      50.532us        82.93%       1.753ms     292.204us     140.543us        35.79%     180.991us      30.165us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     100.768us        25.66%     100.768us      33.589us             3  
+                                Activity Buffer Request        67.47%       1.426ms        67.47%       1.426ms       1.426ms      40.448us        10.30%      40.448us      40.448us             1  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.775us        10.13%      39.775us      13.258us             3  
+                                    aten::empty_strided         1.42%      29.990us         1.42%      29.990us       4.998us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        14.15%     299.142us        14.15%     299.142us      33.238us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      19.400us         1.21%      25.500us       2.833us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.49%      10.430us         0.49%      10.430us       0.695us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      11.580us         0.55%      11.580us       3.860us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%       9.361us         0.44%       9.361us       3.120us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.34%       7.110us         0.42%       8.900us       2.967us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.114ms
+Self CUDA time total: 392.733us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         6.65%     143.166us        97.03%       2.090ms       2.090ms       0.000us         0.00%     486.301us     486.301us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     477.853us       106.88%     477.853us     477.853us             1  
+                                           aten::conv1d         0.33%       7.110us         5.88%     126.575us      42.192us       0.000us         0.00%     298.557us      99.519us             3  
+                                      aten::convolution         0.51%      11.062us         5.55%     119.465us      39.822us       0.000us         0.00%     298.557us      99.519us             3  
+                                     aten::_convolution         1.16%      25.071us         5.03%     108.403us      36.134us       0.000us         0.00%     298.557us      99.519us             3  
+                                aten::_conv_depthwise2d         1.05%      22.671us         3.05%      65.592us      21.864us     298.557us        66.78%     298.557us      99.519us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.557us        66.78%     298.557us      99.519us             3  
+                                               aten::to         0.33%       7.030us        83.12%       1.790ms     298.407us       0.000us         0.00%     187.744us      31.291us             6  
+                                         aten::_to_copy         1.22%      26.183us        82.80%       1.783ms     297.235us       0.000us         0.00%     187.744us      31.291us             6  
+                                            aten::copy_         2.41%      51.979us        80.11%       1.726ms     287.603us     148.544us        33.22%     187.744us      31.291us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     108.768us        24.33%     108.768us      36.256us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.776us         8.90%      39.776us      13.259us             3  
+                                Activity Buffer Request        66.10%       1.424ms        66.10%       1.424ms       1.424ms      39.200us         8.77%      39.200us      39.200us             1  
+                                    aten::empty_strided         1.47%      31.611us         1.47%      31.611us       5.268us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.61%     271.569us        12.61%     271.569us      30.174us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.93%      19.971us         1.21%      26.011us       2.890us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.711us         0.45%       9.711us       0.647us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%      10.061us         0.47%      10.061us       3.354us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%      11.040us         0.51%      11.040us       3.680us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.950us         0.34%       7.400us       2.467us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.154ms
+Self CUDA time total: 447.101us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     355.165us      1897.25%     355.165us     355.165us             1  
+                                            torch_eager        15.24%     136.376us        99.32%     888.600us     888.600us       0.000us         0.00%      20.608us      20.608us             1  
+                                               aten::to         0.80%       7.121us        66.93%     598.831us      99.805us       0.000us         0.00%      13.376us       2.229us             6  
+                                         aten::_to_copy         2.95%      26.380us        66.13%     591.710us      98.618us       0.000us         0.00%      13.376us       2.229us             6  
+                                            aten::copy_         5.90%      52.793us        59.34%     530.948us      88.491us      11.488us        61.37%      13.376us       2.229us             6  
+                                           aten::conv1d         0.68%       6.050us        13.88%     124.163us      41.388us       0.000us         0.00%       7.232us       2.411us             3  
+                                      aten::convolution         1.23%      10.987us        13.20%     118.113us      39.371us       0.000us         0.00%       7.232us       2.411us             3  
+                                     aten::_convolution         2.78%      24.854us        11.97%     107.126us      35.709us       0.000us         0.00%       7.232us       2.411us             3  
+                                aten::_conv_depthwise2d         2.73%      24.470us         7.32%      65.481us      21.827us       7.232us        38.63%       7.232us       2.411us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.232us        38.63%       7.232us       2.411us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        31.62%       5.920us       1.973us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        29.74%       5.568us       1.856us             3  
+                                Activity Buffer Request        26.68%     238.708us        26.68%     238.708us     238.708us       1.888us        10.09%       1.888us       1.888us             1  
+                                    aten::empty_strided         3.84%      34.382us         3.84%      34.382us       5.730us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        29.10%     260.398us        29.10%     260.398us      28.933us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.02%      18.071us         2.57%      22.961us       2.551us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.97%       8.709us         0.97%       8.709us       0.581us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.22%      10.910us         1.22%      10.910us       3.637us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.02%       9.150us         1.02%       9.150us       3.050us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.75%       6.751us         0.92%       8.220us       2.740us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 894.710us
+Self CUDA time total: 18.720us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.578us      1674.05%     323.578us     323.578us             1  
+                                            torch_eager        14.45%     120.436us        99.39%     828.559us     828.559us       0.000us         0.00%      21.217us      21.217us             1  
+                                               aten::to         0.75%       6.271us        67.77%     564.939us      94.156us       0.000us         0.00%      13.377us       2.230us             6  
+                                         aten::_to_copy         2.76%      22.992us        67.02%     558.668us      93.111us       0.000us         0.00%      13.377us       2.230us             6  
+                                            aten::copy_         5.96%      49.722us        60.74%     506.327us      84.388us      11.489us        59.44%      13.377us       2.230us             6  
+                                           aten::conv1d         0.75%       6.211us        13.83%     115.254us      38.418us       0.000us         0.00%       7.840us       2.613us             3  
+                                      aten::convolution         1.19%       9.930us        13.08%     109.043us      36.348us       0.000us         0.00%       7.840us       2.613us             3  
+                                     aten::_convolution         2.77%      23.131us        11.89%      99.113us      33.038us       0.000us         0.00%       7.840us       2.613us             3  
+                                aten::_conv_depthwise2d         2.53%      21.092us         7.21%      60.132us      20.044us       7.840us        40.56%       7.840us       2.613us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us        40.56%       7.840us       2.613us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.857us        30.30%       5.857us       1.952us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.632us        29.14%       5.632us       1.877us             3  
+                                Activity Buffer Request        27.26%     227.207us        27.26%     227.207us     227.207us       1.888us         9.77%       1.888us       1.888us             1  
+                                    aten::empty_strided         3.52%      29.349us         3.52%      29.349us       4.891us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        29.92%     249.418us        29.92%     249.418us      27.713us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.13%      17.749us         2.80%      23.370us       2.597us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.11%       9.261us         1.11%       9.261us       0.617us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.16%       9.660us         1.16%       9.660us       3.220us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.12%       9.360us         1.12%       9.360us       3.120us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.70%       5.810us         0.88%       7.370us       2.457us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 833.619us
+Self CUDA time total: 19.329us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     326.394us      1677.60%     326.394us     326.394us             1  
+                                            torch_eager        14.78%     122.914us        99.34%     825.919us     825.919us       0.000us         0.00%      21.632us      21.632us             1  
+                                               aten::to         0.79%       6.552us        67.16%     558.381us      93.064us       0.000us         0.00%      14.368us       2.395us             6  
+                                         aten::_to_copy         2.94%      24.430us        66.37%     551.829us      91.971us       0.000us         0.00%      14.368us       2.395us             6  
+                                            aten::copy_         5.83%      48.462us        59.95%     498.427us      83.071us      12.192us        62.66%      14.368us       2.395us             6  
+                                           aten::conv1d         0.71%       5.939us        14.00%     116.404us      38.801us       0.000us         0.00%       7.264us       2.421us             3  
+                                      aten::convolution         1.18%       9.811us        13.29%     110.465us      36.822us       0.000us         0.00%       7.264us       2.421us             3  
+                                     aten::_convolution         2.85%      23.732us        12.11%     100.654us      33.551us       0.000us         0.00%       7.264us       2.421us             3  
+                                aten::_conv_depthwise2d         2.52%      20.910us         7.24%      60.232us      20.077us       7.264us        37.34%       7.264us       2.421us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.264us        37.34%       7.264us       2.421us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.40%       6.304us       2.101us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us        30.26%       5.888us       1.963us             3  
+                                Activity Buffer Request        26.68%     221.788us        26.68%     221.788us     221.788us       2.176us        11.18%       2.176us       2.176us             1  
+                                    aten::empty_strided         3.48%      28.972us         3.48%      28.972us       4.829us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        30.05%     249.819us        30.05%     249.819us      27.758us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.04%      16.929us         2.67%      22.200us       2.467us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       8.901us         1.07%       8.901us       0.593us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.15%       9.570us         1.15%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.98%       8.110us         0.98%       8.110us       2.703us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.86%       7.190us         1.02%       8.500us       2.833us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 831.399us
+Self CUDA time total: 19.456us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     356.696us      1774.96%     356.696us     356.696us             1  
+                                            torch_eager        13.86%     123.804us        99.36%     887.440us     887.440us       0.000us         0.00%      22.272us      22.272us             1  
+                                               aten::to         0.71%       6.320us        66.62%     595.061us      99.177us       0.000us         0.00%      14.368us       2.395us             6  
+                                         aten::_to_copy         2.82%      25.151us        65.92%     588.741us      98.124us       0.000us         0.00%      14.368us       2.395us             6  
+                                            aten::copy_         5.73%      51.172us        59.67%     532.958us      88.826us      12.192us        60.67%      14.368us       2.395us             6  
+                                           aten::conv1d         0.70%       6.210us        15.70%     140.195us      46.732us       0.000us         0.00%       7.904us       2.635us             3  
+                                      aten::convolution         1.11%       9.881us        15.00%     133.985us      44.662us       0.000us         0.00%       7.904us       2.635us             3  
+                                     aten::_convolution         2.74%      24.510us        13.89%     124.104us      41.368us       0.000us         0.00%       7.904us       2.635us             3  
+                                aten::_conv_depthwise2d         2.70%      24.090us         9.26%      82.742us      27.581us       7.904us        39.33%       7.904us       2.635us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        39.33%       7.904us       2.635us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.240us        31.05%       6.240us       2.080us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.952us        29.62%       5.952us       1.984us             3  
+                                Activity Buffer Request        28.94%     258.459us        28.94%     258.459us     258.459us       2.176us        10.83%       2.176us       2.176us             1  
+                                    aten::empty_strided         3.43%      30.632us         3.43%      30.632us       5.105us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        29.46%     263.129us        29.46%     263.129us      29.237us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.97%      17.620us         2.61%      23.310us       2.590us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       9.580us         1.07%       9.580us       0.639us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.09%       9.720us         1.09%       9.720us       3.240us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.02%       9.130us         1.02%       9.130us       3.043us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.75%       6.702us         0.94%       8.422us       2.807us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 893.171us
+Self CUDA time total: 20.096us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.730us       926.72%     332.730us     332.730us             1  
+                                            torch_eager        14.27%     126.064us        99.42%     878.341us     878.341us       0.000us         0.00%      38.496us      38.496us             1  
+                                           aten::conv1d         0.64%       5.671us        13.39%     118.255us      39.418us       0.000us         0.00%      20.096us       6.699us             3  
+                                      aten::convolution         1.11%       9.840us        12.74%     112.584us      37.528us       0.000us         0.00%      20.096us       6.699us             3  
+                                     aten::_convolution         2.79%      24.681us        11.63%     102.744us      34.248us       0.000us         0.00%      20.096us       6.699us             3  
+                                aten::_conv_depthwise2d         2.42%      21.390us         7.02%      62.061us      20.687us      20.096us        55.97%      20.096us       6.699us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.096us        55.97%      20.096us       6.699us             3  
+                                               aten::to         0.72%       6.320us        68.61%     606.182us     101.030us       0.000us         0.00%      18.400us       3.067us             6  
+                                         aten::_to_copy         2.82%      24.900us        67.90%     599.862us      99.977us       0.000us         0.00%      18.400us       3.067us             6  
+                                            aten::copy_         5.62%      49.645us        61.77%     545.702us      90.950us      15.808us        44.03%      18.400us       3.067us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        23.53%       8.448us       2.816us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        20.50%       7.360us       2.453us             3  
+                                Activity Buffer Request        29.42%     259.919us        29.42%     259.919us     259.919us       2.592us         7.22%       2.592us       2.592us             1  
+                                    aten::empty_strided         3.31%      29.260us         3.31%      29.260us       4.877us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        29.15%     257.559us        29.15%     257.559us      28.618us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.02%      17.842us         2.68%      23.662us       2.629us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.05%       9.271us         1.05%       9.271us       0.618us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.19%      10.540us         1.19%      10.540us       3.513us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.99%       8.710us         0.99%       8.710us       2.903us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.65%       5.719us         0.80%       7.050us       2.350us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 883.481us
+Self CUDA time total: 35.904us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     337.888us       888.80%     337.888us     337.888us             1  
+                                            torch_eager         6.31%     128.615us        99.74%       2.033ms       2.033ms       0.000us         0.00%      40.576us      40.576us             1  
+                                           aten::conv1d         0.31%       6.349us         5.98%     121.885us      40.628us       0.000us         0.00%      22.304us       7.435us             3  
+                                      aten::convolution         0.53%      10.852us         5.67%     115.536us      38.512us       0.000us         0.00%      22.304us       7.435us             3  
+                                     aten::_convolution         1.24%      25.291us         5.14%     104.684us      34.895us       0.000us         0.00%      22.304us       7.435us             3  
+                                aten::_conv_depthwise2d         1.08%      22.031us         3.01%      61.431us      20.477us      22.304us        58.67%      22.304us       7.435us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.304us        58.67%      22.304us       7.435us             3  
+                                               aten::to         0.34%       6.829us        86.09%       1.755ms     292.477us       0.000us         0.00%      18.272us       3.045us             6  
+                                         aten::_to_copy         1.20%      24.424us        85.75%       1.748ms     291.339us       0.000us         0.00%      18.272us       3.045us             6  
+                                            aten::copy_         2.48%      50.501us        83.10%       1.694ms     282.331us      15.712us        41.33%      18.272us       3.045us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.320us        21.89%       8.320us       2.773us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        19.44%       7.392us       2.464us             3  
+                                Activity Buffer Request        69.75%       1.422ms        69.75%       1.422ms       1.422ms       2.560us         6.73%       2.560us       2.560us             1  
+                                    aten::empty_strided         1.45%      29.621us         1.45%      29.621us       4.937us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.90%     242.506us        11.90%     242.506us      26.945us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      18.701us         1.17%      23.851us       2.650us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       8.710us         0.43%       8.710us       0.581us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.800us         0.48%       9.800us       3.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.43%       8.710us         0.43%       8.710us       2.903us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.35%       7.191us         0.42%       8.621us       2.874us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.038ms
+Self CUDA time total: 38.016us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     362.972us       567.16%     362.972us     362.972us             1  
+                                            torch_eager        14.84%     128.544us        99.34%     860.680us     860.680us       0.000us         0.00%      68.061us      68.061us             1  
+                                           aten::conv1d         0.70%       6.079us        16.52%     143.165us      47.722us       0.000us         0.00%      41.728us      13.909us             3  
+                                      aten::convolution         3.42%      29.613us        15.82%     137.086us      45.695us       0.000us         0.00%      41.728us      13.909us             3  
+                                     aten::_convolution         2.86%      24.759us        12.40%     107.473us      35.824us       0.000us         0.00%      41.728us      13.909us             3  
+                                aten::_conv_depthwise2d         2.59%      22.439us         7.67%      66.492us      22.164us      41.728us        65.20%      41.728us      13.909us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.728us        65.20%      41.728us      13.909us             3  
+                                               aten::to         0.77%       6.631us        64.71%     560.621us      93.437us       0.000us         0.00%      26.333us       4.389us             6  
+                                         aten::_to_copy         2.80%      24.253us        63.94%     553.990us      92.332us       0.000us         0.00%      26.333us       4.389us             6  
+                                            aten::copy_         5.80%      50.240us        57.50%     498.196us      83.033us      22.270us        34.80%      26.333us       4.389us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.903us        18.60%      11.903us       3.968us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.367us        16.20%      10.367us       3.456us             3  
+                                Activity Buffer Request        26.05%     225.728us        26.05%     225.728us     225.728us       4.063us         6.35%       4.063us       4.063us             1  
+                                    aten::empty_strided         3.64%      31.541us         3.64%      31.541us       5.257us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        28.31%     245.279us        28.31%     245.279us      27.253us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.11%      18.263us         2.74%      23.752us       2.639us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.06%       9.199us         1.06%       9.199us       0.613us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.26%      10.941us         1.26%      10.941us       3.647us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.16%      10.061us         1.16%      10.061us       3.354us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.66%       5.740us         0.85%       7.330us       2.443us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 866.380us
+Self CUDA time total: 63.998us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     357.311us       512.91%     357.311us     357.311us             1  
+                                            torch_eager        20.96%     191.619us        99.38%     908.662us     908.662us       0.000us         0.00%      73.696us      73.696us             1  
+                                           aten::conv1d         0.63%       5.760us        15.23%     139.294us      46.431us       0.000us         0.00%      47.296us      15.765us             3  
+                                      aten::convolution         2.87%      26.271us        14.60%     133.534us      44.511us       0.000us         0.00%      47.296us      15.765us             3  
+                                     aten::_convolution         2.77%      25.360us        11.73%     107.263us      35.754us       0.000us         0.00%      47.296us      15.765us             3  
+                                aten::_conv_depthwise2d         2.38%      21.722us         7.17%      65.523us      21.841us      47.296us        67.89%      47.296us      15.765us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.296us        67.89%      47.296us      15.765us             3  
+                                               aten::to         0.73%       6.650us        60.08%     549.318us      91.553us       0.000us         0.00%      26.400us       4.400us             6  
+                                         aten::_to_copy         2.63%      24.032us        59.35%     542.668us      90.445us       0.000us         0.00%      26.400us       4.400us             6  
+                                            aten::copy_         5.57%      50.922us        53.46%     488.786us      81.464us      22.368us        32.11%      26.400us       4.400us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.872us        17.04%      11.872us       3.957us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.496us        15.07%      10.496us       3.499us             3  
+                                Activity Buffer Request        23.91%     218.617us        23.91%     218.617us     218.617us       4.032us         5.79%       4.032us       4.032us             1  
+                                    aten::empty_strided         3.26%      29.850us         3.26%      29.850us       4.975us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.57%     242.937us        26.57%     242.937us      26.993us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.04%      18.652us         2.65%      24.251us       2.695us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.01%       9.230us         1.01%       9.230us       0.615us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.08%       9.870us         1.08%       9.870us       3.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.12%      10.241us         1.12%      10.241us       3.414us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.63%       5.780us         0.80%       7.270us       2.423us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 914.323us
+Self CUDA time total: 69.664us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     348.092us       187.26%     348.092us     348.092us             1  
+                                            torch_eager        14.76%     124.374us        99.29%     836.558us     836.558us       0.000us         0.00%     195.870us     195.870us             1  
+                                           aten::conv1d         0.70%       5.900us        14.42%     121.504us      40.501us       0.000us         0.00%     133.406us      44.469us             3  
+                                      aten::convolution         1.14%       9.610us        13.72%     115.604us      38.535us       0.000us         0.00%     133.406us      44.469us             3  
+                                     aten::_convolution         2.88%      24.263us        12.58%     105.994us      35.331us       0.000us         0.00%     133.406us      44.469us             3  
+                                aten::_conv_depthwise2d         2.73%      23.010us         7.80%      65.750us      21.917us     133.406us        71.77%     133.406us      44.469us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.406us        71.77%     133.406us      44.469us             3  
+                                               aten::to         0.74%       6.220us        66.83%     563.060us      93.843us       0.000us         0.00%      62.464us      10.411us             6  
+                                         aten::_to_copy         2.83%      23.861us        66.09%     556.840us      92.807us       0.000us         0.00%      62.464us      10.411us             6  
+                                            aten::copy_         6.03%      50.810us        59.73%     503.287us      83.881us      52.480us        28.23%      62.464us      10.411us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.600us        15.92%      29.600us       9.867us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.880us        12.31%      22.880us       7.627us             3  
+                                Activity Buffer Request        25.69%     216.468us        25.69%     216.468us     216.468us       9.984us         5.37%       9.984us       9.984us             1  
+                                    aten::empty_strided         3.52%      29.692us         3.52%      29.692us       4.949us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        30.59%     257.739us        30.59%     257.739us      28.638us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.08%      17.540us         2.73%      23.000us       2.556us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.12%       9.412us         1.12%       9.412us       0.627us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.20%      10.110us         1.20%      10.110us       3.370us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.29%      10.900us         1.29%      10.900us       3.633us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.68%       5.719us         0.88%       7.451us       2.484us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 842.539us
+Self CUDA time total: 185.886us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     348.403us       166.18%     348.403us     348.403us             1  
+                                            torch_eager        14.60%     122.924us        99.33%     836.209us     836.209us       0.000us         0.00%     223.383us     223.383us             1  
+                                           aten::conv1d         0.69%       5.779us        14.01%     117.955us      39.318us       0.000us         0.00%     153.883us      51.294us             3  
+                                      aten::convolution         1.25%      10.491us        13.32%     112.176us      37.392us       0.000us         0.00%     153.883us      51.294us             3  
+                                     aten::_convolution         2.91%      24.484us        12.08%     101.685us      33.895us       0.000us         0.00%     153.883us      51.294us             3  
+                                aten::_conv_depthwise2d         2.49%      20.928us         7.14%      60.070us      20.023us     153.883us        73.40%     153.883us      51.294us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     153.883us        73.40%     153.883us      51.294us             3  
+                                               aten::to         0.73%       6.179us        67.37%     567.200us      94.533us       0.000us         0.00%      69.500us      11.583us             6  
+                                         aten::_to_copy         2.75%      23.132us        66.64%     561.021us      93.504us       0.000us         0.00%      69.500us      11.583us             6  
+                                            aten::copy_         5.91%      49.740us        60.39%     508.377us      84.729us      55.773us        26.60%      69.500us      11.583us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.927us        15.71%      32.927us      10.976us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.846us        10.90%      22.846us       7.615us             3  
+                                Activity Buffer Request        29.09%     244.869us        29.09%     244.869us     244.869us      13.727us         6.55%      13.727us      13.727us             1  
+                                    aten::empty_strided         3.51%      29.512us         3.51%      29.512us       4.919us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        27.84%     234.420us        27.84%     234.420us      26.047us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.13%      17.973us         2.77%      23.320us       2.591us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       9.167us         1.09%       9.167us       0.611us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.12%       9.440us         1.12%       9.440us       3.147us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.07%       9.050us         1.07%       9.050us       3.017us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.85%       7.121us         1.02%       8.601us       2.867us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 841.880us
+Self CUDA time total: 209.656us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         7.22%     135.785us        57.39%       1.079ms       1.079ms       0.000us         0.00%       1.518ms       1.518ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.419ms       100.41%       1.419ms       1.419ms             1  
+                                               aten::to         0.37%       6.901us        40.86%     768.526us     128.088us       0.000us         0.00%     823.221us     137.204us             6  
+                                         aten::_to_copy         1.63%      30.742us        40.49%     761.625us     126.938us       0.000us         0.00%     823.221us     137.204us             6  
+                                            aten::copy_         2.94%      55.302us        27.81%     523.157us      87.193us     717.942us        50.81%     823.221us     137.204us             6  
+                                           aten::conv1d         0.33%       6.280us         6.71%     126.144us      42.048us       0.000us         0.00%     695.094us     231.698us             3  
+                                      aten::convolution         0.57%      10.750us         6.37%     119.864us      39.955us       0.000us         0.00%     695.094us     231.698us             3  
+                                     aten::_convolution         1.35%      25.400us         5.80%     109.114us      36.371us       0.000us         0.00%     695.094us     231.698us             3  
+                                aten::_conv_depthwise2d         1.19%      22.332us         3.55%      66.763us      22.254us     695.094us        49.19%     695.094us     231.698us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     695.094us        49.19%     695.094us     231.698us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     411.706us        29.14%     411.706us     137.235us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     306.236us        21.67%     306.236us     102.079us             3  
+                                Activity Buffer Request        12.99%     244.238us        12.99%     244.238us     244.238us     105.279us         7.45%     105.279us     105.279us             1  
+                                    aten::empty_strided         2.17%      40.811us        11.04%     207.726us      34.621us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        13.13%     246.997us        13.13%     246.997us      27.444us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.97%      37.133us         2.36%      44.413us       4.935us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.58%      10.889us         0.58%      10.889us       0.726us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.53%      10.051us         0.53%      10.051us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.58%      11.000us         0.58%      11.000us       3.667us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.34%       6.350us         0.41%       7.700us       2.567us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.881ms
+Self CUDA time total: 1.413ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         4.25%     132.984us        66.63%       2.083ms       2.083ms       0.000us         0.00%       1.503ms       1.503ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.434ms       100.41%       1.434ms       1.434ms             1  
+                                               aten::to         0.21%       6.470us        57.53%       1.798ms     299.656us       0.000us         0.00%     765.147us     127.524us             6  
+                                         aten::_to_copy         0.80%      25.009us        57.32%       1.791ms     298.577us       0.000us         0.00%     765.147us     127.524us             6  
+                                            aten::copy_         1.51%      47.155us        55.55%       1.736ms     289.360us     690.492us        48.35%     765.147us     127.524us             6  
+                                           aten::conv1d         0.20%       6.231us         3.91%     122.325us      40.775us       0.000us         0.00%     737.724us     245.908us             3  
+                                      aten::convolution         0.32%       9.920us         3.71%     116.094us      38.698us       0.000us         0.00%     737.724us     245.908us             3  
+                                     aten::_convolution         0.82%      25.623us         3.40%     106.174us      35.391us       0.000us         0.00%     737.724us     245.908us             3  
+                                aten::_conv_depthwise2d         0.70%      21.899us         1.98%      62.011us      20.670us     737.724us        51.65%     737.724us     245.908us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     737.724us        51.65%     737.724us     245.908us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     398.046us        27.87%     398.046us     132.682us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     292.446us        20.48%     292.446us      97.482us             3  
+                                Activity Buffer Request        47.19%       1.475ms        47.19%       1.475ms       1.475ms      74.655us         5.23%      74.655us      74.655us             1  
+                                    aten::empty_strided         0.97%      30.293us         0.97%      30.293us       5.049us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         7.52%     235.026us         7.52%     235.026us      26.114us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.60%      18.740us         0.79%      24.820us       2.758us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.32%      10.019us         0.32%      10.019us       0.668us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.32%       9.882us         0.32%       9.882us       3.294us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.29%       9.220us         0.29%       9.220us       3.073us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       7.471us         0.29%       9.160us       3.053us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 3.125ms
+Self CUDA time total: 1.428ms
+
+
+impl                     wl                  p50(ms)  ok
+torch_eager              cuda_B2_D2048_S128_W2     0.08  True
+torch_eager              cuda_B2_D2048_S128_W4     0.09  True
+torch_eager              cuda_B2_D2048_S2048_W2     0.15  True
+torch_eager              cuda_B2_D2048_S2048_W4     0.16  True
+torch_eager              cuda_B2_D2048_S512_W2     0.08  True
+torch_eager              cuda_B2_D2048_S512_W4     0.08  True
+torch_eager              cuda_B2_D64_S128_W2     0.07  True
+torch_eager              cuda_B2_D64_S128_W4     0.09  True
+torch_eager              cuda_B2_D64_S2048_W2     0.09  True
+torch_eager              cuda_B2_D64_S2048_W4     0.08  True
+torch_eager              cuda_B2_D64_S512_W2     0.09  True
+torch_eager              cuda_B2_D64_S512_W4     0.09  True
+torch_eager              cuda_B4_D2048_S128_W2     0.09  True
+torch_eager              cuda_B4_D2048_S128_W4     0.08  True
+torch_eager              cuda_B4_D2048_S2048_W2     0.49  True
+torch_eager              cuda_B4_D2048_S2048_W4     0.50  True
+torch_eager              cuda_B4_D2048_S512_W2     0.09  True
+torch_eager              cuda_B4_D2048_S512_W4     0.10  True
+torch_eager              cuda_B4_D64_S128_W2     0.08  True
+torch_eager              cuda_B4_D64_S128_W4     0.08  True
+torch_eager              cuda_B4_D64_S2048_W2     0.08  True
+torch_eager              cuda_B4_D64_S2048_W4     0.09  True
+torch_eager              cuda_B4_D64_S512_W2     0.08  True
+torch_eager              cuda_B4_D64_S512_W4     0.08  True
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+causal_conv1d.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/causal_conv1d/index.html b/causal_conv1d/index.html new file mode 100644 index 0000000000000000000000000000000000000000..41313e21299f746daf8b9b76fbbb22687cf02763 --- /dev/null +++ b/causal_conv1d/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /causal_conv1d + + + +
+ ← back +
+

Index of /causal_conv1d

+ + + \ No newline at end of file diff --git a/causal_conv1d/results/artifacts/combine/latency.svg b/causal_conv1d/results/artifacts/combine/latency.svg new file mode 100644 index 0000000000000000000000000000000000000000..73369f1354843caf2bfe58e9b9dbb5f0a7c81b2a --- /dev/null +++ b/causal_conv1d/results/artifacts/combine/latency.svg @@ -0,0 +1,530 @@ + + + + + + + 2025-10-29T00:37:16.145885 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_B2_D64_S128_W2 + + + + + + + + + + + + + cuda_B2_D64_S128_W4 + + + + + + + + + + + + + cuda_B2_D64_S512_W2 + + + + + + + + + + + + + cuda_B2_D64_S512_W4 + + + + + + + + + + + + + cuda_B2_D64_S2048_W2 + + + + + + + + + + + + + cuda_B2_D64_S2048_W4 + + + + + + + + + + + + + cuda_B2_D2048_S128_W2 + + + + + + + + + + + + + cuda_B2_D2048_S128_W4 + + + + + + + + + + + + + cuda_B2_D2048_S512_W2 + + + + + + + + + + + + + cuda_B2_D2048_S512_W4 + + + + + + + + + + + + + cuda_B2_D2048_S2048_W2 + + + + + + + + + + + + + cuda_B2_D2048_S2048_W4 + + + + + + + + + + + + + cuda_B4_D64_S128_W2 + + + + + + + + + + + + + cuda_B4_D64_S128_W4 + + + + + + + + + + + + + cuda_B4_D64_S512_W2 + + + + + + + + + + + + + cuda_B4_D64_S512_W4 + + + + + + + + + + + + + cuda_B4_D64_S2048_W2 + + + + + + + + + + + + + cuda_B4_D64_S2048_W4 + + + + + + + + + + + + + cuda_B4_D2048_S128_W2 + + + + + + + + + + + + + cuda_B4_D2048_S128_W4 + + + + + + + + + + + + + cuda_B4_D2048_S512_W2 + + + + + + + + + + + + + cuda_B4_D2048_S512_W4 + + + + + + + + + + + + + cuda_B4_D2048_S2048_W2 + + + + + + + + + + + + + cuda_B4_D2048_S2048_W4 + + + + Workload + + + + + + + + + + + + + + + + + 0.1 + + + + + + + + + + + + + 0.2 + + + + + + + + + + + + + 0.3 + + + + + + + + + + + + + 0.4 + + + + + + + + + + + + + 0.5 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + hf_kernels_causal_conv1d + + + + + + + + + torch_eager + + + + + + + + + + \ No newline at end of file diff --git a/causal_conv1d/results/cells/combine.py b/causal_conv1d/results/cells/combine.py new file mode 100644 index 0000000000000000000000000000000000000000..bb0f868d7f37b547e4fd981763d6af8a3bca13dd --- /dev/null +++ b/causal_conv1d/results/cells/combine.py @@ -0,0 +1,26 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# "matplotlib", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +from kernels_benchmark_tools.core.visuals import generate_combined_results + +# Map display names to uvnote environment variables +cache_env_map = { + "HF Kernels Causal Conv1D": "UVNOTE_FILE_HF_KERNELS_CAUSAL_CONV1D_BENCHMARK", + "PyTorch Causal Conv1D": "UVNOTE_FILE_TORCH_CAUSAL_CONV1D_BENCHMARK", +} + +# Generate combined results with visualization +generate_combined_results( + cache_env_map=cache_env_map, + output_filename="causal_conv1d.jsonl", + svg_filename="latency.svg" +) \ No newline at end of file diff --git a/causal_conv1d/results/combined_results.html b/causal_conv1d/results/combined_results.html new file mode 100644 index 0000000000000000000000000000000000000000..b113ec63f04aaf43b21a9692d5686dbfeb8f2510 --- /dev/null +++ b/causal_conv1d/results/combined_results.html @@ -0,0 +1,5106 @@ + + + + + + Causal Conv1D Benchmark - Combined Results + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

Causal Conv1D Benchmarks - Aggregated Results

+

This document combines benchmark results from multiple Causal Conv1D implementations.

+

Combined Summary and Visualization

+
+ + + + + + + 2025-10-29T00:37:16.145885 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_B2_D64_S128_W2 + + + + + + + + + + + + + cuda_B2_D64_S128_W4 + + + + + + + + + + + + + cuda_B2_D64_S512_W2 + + + + + + + + + + + + + cuda_B2_D64_S512_W4 + + + + + + + + + + + + + cuda_B2_D64_S2048_W2 + + + + + + + + + + + + + cuda_B2_D64_S2048_W4 + + + + + + + + + + + + + cuda_B2_D2048_S128_W2 + + + + + + + + + + + + + cuda_B2_D2048_S128_W4 + + + + + + + + + + + + + cuda_B2_D2048_S512_W2 + + + + + + + + + + + + + cuda_B2_D2048_S512_W4 + + + + + + + + + + + + + cuda_B2_D2048_S2048_W2 + + + + + + + + + + + + + cuda_B2_D2048_S2048_W4 + + + + + + + + + + + + + cuda_B4_D64_S128_W2 + + + + + + + + + + + + + cuda_B4_D64_S128_W4 + + + + + + + + + + + + + cuda_B4_D64_S512_W2 + + + + + + + + + + + + + cuda_B4_D64_S512_W4 + + + + + + + + + + + + + cuda_B4_D64_S2048_W2 + + + + + + + + + + + + + cuda_B4_D64_S2048_W4 + + + + + + + + + + + + + cuda_B4_D2048_S128_W2 + + + + + + + + + + + + + cuda_B4_D2048_S128_W4 + + + + + + + + + + + + + cuda_B4_D2048_S512_W2 + + + + + + + + + + + + + cuda_B4_D2048_S512_W4 + + + + + + + + + + + + + cuda_B4_D2048_S2048_W2 + + + + + + + + + + + + + cuda_B4_D2048_S2048_W4 + + + + Workload + + + + + + + + + + + + + + + + + 0.1 + + + + + + + + + + + + + 0.2 + + + + + + + + + + + + + 0.3 + + + + + + + + + + + + + 0.4 + + + + + + + + + + + + + 0.5 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + hf_kernels_causal_conv1d + + + + + + + + + torch_eager + + + + + + + + + + +
+ +
+
+ +▶ code +▼ output + ▶ uv-logs + | +Cell: combine | 4.43s + | + +Raw +
+ +
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ HF Kernels Causal Conv1D      : /__w/kernels-benchmarks/kernels-benchmarks/benches/causal_conv1d/impls/.uvnote/cache/7a691bd653e23c412c5d29fbc92ea1454823ea437864cf9473fc561b116ef3d9
+✓ PyTorch Causal Conv1D         : /__w/kernels-benchmarks/kernels-benchmarks/benches/causal_conv1d/impls/.uvnote/cache/70757e27f2df1dfde4905a24527bb4ca6f0f8df7dac2e2ecaa0ddc359c7d5e64
+
+  ✓ Found HF Kernels Causal Conv1D
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/causal_conv1d/impls/.uvnote/cache/7a691bd653e23c412c5d29fbc92ea1454823ea437864cf9473fc561b116ef3d9/causal_conv1d.jsonl
+  ✓ Found PyTorch Causal Conv1D
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/causal_conv1d/impls/.uvnote/cache/70757e27f2df1dfde4905a24527bb4ca6f0f8df7dac2e2ecaa0ddc359c7d5e64/causal_conv1d.jsonl
+
+======================================================================
+Summary: 2 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_causal_conv1d cuda_B2_D2048_S128_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D2048_S128_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D2048_S512_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D2048_S512_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S128_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S128_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W2     0.05  True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W4     0.05  True
+torch_eager              cuda_B2_D2048_S128_W2     0.08  True
+torch_eager              cuda_B2_D2048_S128_W4     0.09  True
+torch_eager              cuda_B2_D2048_S2048_W2     0.15  True
+torch_eager              cuda_B2_D2048_S2048_W4     0.16  True
+torch_eager              cuda_B2_D2048_S512_W2     0.08  True
+torch_eager              cuda_B2_D2048_S512_W4     0.08  True
+torch_eager              cuda_B2_D64_S128_W2     0.07  True
+torch_eager              cuda_B2_D64_S128_W4     0.09  True
+torch_eager              cuda_B2_D64_S2048_W2     0.09  True
+torch_eager              cuda_B2_D64_S2048_W4     0.08  True
+torch_eager              cuda_B2_D64_S512_W2     0.09  True
+torch_eager              cuda_B2_D64_S512_W4     0.09  True
+torch_eager              cuda_B4_D2048_S128_W2     0.09  True
+torch_eager              cuda_B4_D2048_S128_W4     0.08  True
+torch_eager              cuda_B4_D2048_S2048_W2     0.49  True
+torch_eager              cuda_B4_D2048_S2048_W4     0.50  True
+torch_eager              cuda_B4_D2048_S512_W2     0.09  True
+torch_eager              cuda_B4_D2048_S512_W4     0.10  True
+torch_eager              cuda_B4_D64_S128_W2     0.08  True
+torch_eager              cuda_B4_D64_S128_W4     0.08  True
+torch_eager              cuda_B4_D64_S2048_W2     0.08  True
+torch_eager              cuda_B4_D64_S2048_W4     0.09  True
+torch_eager              cuda_B4_D64_S512_W2     0.08  True
+torch_eager              cuda_B4_D64_S512_W4     0.08  True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 48 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 2
+
+Implementations included:
+  ✓ HF Kernels Causal Conv1D
+  ✓ PyTorch Causal Conv1D
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+latency.svg +
+ + + + + + + 2025-10-29T00:37:16.145885 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_B2_D64_S128_W2 + + + + + + + + + + + + + cuda_B2_D64_S128_W4 + + + + + + + + + + + + + cuda_B2_D64_S512_W2 + + + + + + + + + + + + + cuda_B2_D64_S512_W4 + + + + + + + + + + + + + cuda_B2_D64_S2048_W2 + + + + + + + + + + + + + cuda_B2_D64_S2048_W4 + + + + + + + + + + + + + cuda_B2_D2048_S128_W2 + + + + + + + + + + + + + cuda_B2_D2048_S128_W4 + + + + + + + + + + + + + cuda_B2_D2048_S512_W2 + + + + + + + + + + + + + cuda_B2_D2048_S512_W4 + + + + + + + + + + + + + cuda_B2_D2048_S2048_W2 + + + + + + + + + + + + + cuda_B2_D2048_S2048_W4 + + + + + + + + + + + + + cuda_B4_D64_S128_W2 + + + + + + + + + + + + + cuda_B4_D64_S128_W4 + + + + + + + + + + + + + cuda_B4_D64_S512_W2 + + + + + + + + + + + + + cuda_B4_D64_S512_W4 + + + + + + + + + + + + + cuda_B4_D64_S2048_W2 + + + + + + + + + + + + + cuda_B4_D64_S2048_W4 + + + + + + + + + + + + + cuda_B4_D2048_S128_W2 + + + + + + + + + + + + + cuda_B4_D2048_S128_W4 + + + + + + + + + + + + + cuda_B4_D2048_S512_W2 + + + + + + + + + + + + + cuda_B4_D2048_S512_W4 + + + + + + + + + + + + + cuda_B4_D2048_S2048_W2 + + + + + + + + + + + + + cuda_B4_D2048_S2048_W4 + + + + Workload + + + + + + + + + + + + + + + + + 0.1 + + + + + + + + + + + + + 0.2 + + + + + + + + + + + + + 0.3 + + + + + + + + + + + + + 0.4 + + + + + + + + + + + + + 0.5 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + hf_kernels_causal_conv1d + + + + + + + + + torch_eager + + + + + + + + + + +
+
+
+
+
+ + + \ No newline at end of file diff --git a/causal_conv1d/results/index.html b/causal_conv1d/results/index.html new file mode 100644 index 0000000000000000000000000000000000000000..ad7d58a5d5aa5f369cbfdb2b39008b8deb1383b1 --- /dev/null +++ b/causal_conv1d/results/index.html @@ -0,0 +1,88 @@ + + + + + + Index of /causal_conv1d/results + + + +
+ ← back +
+

Index of /causal_conv1d/results

+ + + \ No newline at end of file diff --git a/flash_attn/impls/artifacts/benchmark/attention.jsonl b/flash_attn/impls/artifacts/benchmark/attention.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..43081d2c5637964960d306de705532da87d93bb1 --- /dev/null +++ b/flash_attn/impls/artifacts/benchmark/attention.jsonl @@ -0,0 +1,6 @@ +{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9715130000245153, "p50": 0.9773340000265307, "p90": 0.9788430000412518, "mean": 0.976309200018477, "iqr": 0.005310000005920301, "raw_times": [0.9735330000353315, 0.9773340000265307, 0.9803229999647556, 0.9788430000412518, 0.9715130000245153], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9926440000072034, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0154749999742307, "p50": 1.0199449999959143, "p90": 1.0278160000325443, "mean": 1.0223952000046665, "iqr": 0.010921000011876458, "raw_times": [1.0278160000325443, 1.0168950000206678, 1.0318449999999757, 1.0154749999742307, 1.0199449999959143], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0225849999869752, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0612160000391668, "p50": 1.0721770000259312, "p90": 1.075397000022349, "mean": 1.0706886000093618, "iqr": 0.009251000051335723, "raw_times": [1.0612160000391668, 1.0721770000259312, 1.0661459999710132, 1.075397000022349, 1.078506999988349], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0771669999485312, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.075485999990633, "p50": 1.0823069999901236, "p90": 1.084176999995634, "mean": 1.0827727999981107, "iqr": 0.0021099999685247894, "raw_times": [1.075485999990633, 1.0820670000271093, 1.0823069999901236, 1.0898269999870536, 1.084176999995634], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1057869999717695, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2330920000067636, "p50": 1.237381999999343, "p90": 1.239422999958606, "mean": 1.2375224000038543, "iqr": 0.002220999931523693, "raw_times": [1.2405130000274767, 1.2372020000270822, 1.2330920000067636, 1.237381999999343, 1.239422999958606], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.22687200001792, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} +{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2296720000222194, "p50": 1.230811999960224, "p90": 1.236231999996562, "mean": 1.2357499999893662, "iqr": 0.005929999986165058, "raw_times": [1.236231999996562, 1.2517319999574283, 1.230811999960224, 1.230302000010397, 1.2296720000222194], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2250920000269616, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..04ae262009c3d6e33aaa3e392d28c903f24c287c --- /dev/null +++ b/flash_attn/impls/cells/benchmark.py @@ -0,0 +1,30 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# "xformers", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +import torch +import sys +from kernels_benchmark_tools import KernelTypeEnum, run_benchmark +import xformers.ops as xops + + +def xformers_attention(q, k, v): + """xFormers memory efficient attention""" + # xFormers expects [batch, seq_len, heads, head_dim] + return xops.memory_efficient_attention(q, k, v) + + +run_benchmark( + kernel_type=KernelTypeEnum.ATTENTION, + impl_name="xformers_meff", + impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"}, + impl_func=xformers_attention, +) \ No newline at end of file diff --git a/flash_attn/impls/cells/nv.py b/flash_attn/impls/cells/nv.py new file mode 100644 index 0000000000000000000000000000000000000000..80eef60a7536ed875fb21731ab2d059458bd20b4 --- /dev/null +++ b/flash_attn/impls/cells/nv.py @@ -0,0 +1,3 @@ +import subprocess + +print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout) \ No newline at end of file diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html new file mode 100644 index 0000000000000000000000000000000000000000..b4d18851c72f81eb2cee29787f66e13729c42139 --- /dev/null +++ b/flash_attn/impls/flash_attention.html @@ -0,0 +1,4177 @@ + + + + + + flash_attention + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

Flash Attention Implementation

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.26s + | + +Raw +GitHub +
+
+
+
import subprocess
+
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Wed Oct 29 00:36:31 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   32C    P0            151W /  350W |       0MiB /  46068MiB |     86%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

Flash Attention Benchmark

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 3.81s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_flash(q, k, v):
+    qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+    with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+        o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+    return o.transpose(1, 2).contiguous()
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.ATTENTION,
+    impl_name="torch_flash_ma",
+    impl_tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
+    impl_func=torch_flash,
+)
+
+ +
+
+
+
+
Running attention benchmark on cuda with 6 workloads.
+
+======================================================================
+PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.578ms       102.17%       3.578ms       3.578ms             1  
+                                         torch_flash_ma         6.87%     353.422us        46.38%       2.386ms       2.386ms       0.000us         0.00%       3.542ms       3.542ms             1  
+                     aten::scaled_dot_product_attention         0.81%      41.691us         4.31%     221.887us      73.962us       0.000us         0.00%       2.788ms     929.262us             3  
+              aten::_scaled_dot_product_flash_attention         0.53%      27.420us         3.50%     180.196us      60.065us       0.000us         0.00%       2.788ms     929.262us             3  
+                         aten::_flash_attention_forward         0.77%      39.803us         2.56%     131.456us      43.819us       2.788ms        79.61%       2.788ms     929.262us             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.788ms        79.61%       2.788ms     929.262us             3  
+                                       aten::contiguous         0.28%      14.581us        33.97%       1.748ms     145.626us       0.000us         0.00%     754.272us      62.856us            12  
+                                            aten::clone         0.77%      39.360us        33.69%       1.733ms     144.411us       0.000us         0.00%     754.272us      62.856us            12  
+                                            aten::copy_         1.64%      84.313us        31.38%       1.614ms     134.494us     713.920us        20.39%     754.272us      62.856us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     713.920us        20.39%     713.920us      59.493us            12  
+                                Activity Buffer Request        27.68%       1.424ms        27.68%       1.424ms       1.424ms      40.352us         1.15%      40.352us      40.352us             1  
+                                        aten::transpose         1.22%      62.617us         1.64%      84.135us       3.506us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.42%      21.518us         0.42%      21.518us       0.897us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.49%      25.079us         1.99%     102.243us       6.816us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.77%      91.033us         1.77%      91.033us       3.793us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.57%     132.402us         2.57%     132.402us       8.827us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.32%      16.702us         0.32%      16.702us       5.567us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.05%       2.750us         0.05%       2.750us       0.458us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.17%       9.001us         0.17%       9.001us       3.000us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        53.62%       2.758ms        53.62%       2.758ms       2.758ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 5.144ms
+Self CUDA time total: 3.502ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                         torch_flash_ma         4.93%     257.698us        42.06%       2.199ms       2.199ms       0.000us         0.00%       3.742ms       3.742ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.698ms       100.30%       3.698ms       3.698ms             1  
+                     aten::scaled_dot_product_attention         0.48%      25.212us         3.48%     182.067us      60.689us       0.000us         0.00%       2.929ms     976.488us             3  
+              aten::_scaled_dot_product_flash_attention         0.39%      20.471us         3.00%     156.855us      52.285us       0.000us         0.00%       2.929ms     976.488us             3  
+                         aten::_flash_attention_forward         0.74%      38.430us         2.18%     114.074us      38.025us       2.929ms        79.45%       2.929ms     976.488us             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.929ms        79.45%       2.929ms     976.488us             3  
+                                       aten::contiguous         0.17%       9.122us        32.76%       1.713ms     142.713us       0.000us         0.00%     812.318us      67.693us            12  
+                                            aten::clone         0.59%      31.068us        32.59%       1.703ms     141.953us       0.000us         0.00%     812.318us      67.693us            12  
+                                            aten::copy_         1.50%      78.513us        30.83%       1.612ms     134.315us     757.726us        20.55%     812.318us      67.693us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     757.726us        20.55%     757.726us      63.144us            12  
+                                Activity Buffer Request        27.74%       1.450ms        27.74%       1.450ms       1.450ms      54.592us         1.48%      54.592us      54.592us             1  
+                                        aten::transpose         0.99%      51.637us         1.32%      68.781us       2.866us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.33%      17.144us         0.33%      17.144us       0.714us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.41%      21.274us         1.52%      79.248us       5.283us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.40%      73.206us         1.40%      73.206us       3.050us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         2.03%     106.061us         2.03%     106.061us       7.071us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.26%      13.410us         0.26%      13.410us       4.470us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.04%       1.900us         0.04%       1.900us       0.317us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.830us         0.07%       3.830us       1.277us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        57.94%       3.028ms        57.94%       3.028ms       3.028ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 5.227ms
+Self CUDA time total: 3.687ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                         torch_flash_ma         4.92%     259.759us        41.31%       2.182ms       2.182ms       0.000us         0.00%       3.825ms       3.825ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.778ms       100.30%       3.778ms       3.778ms             1  
+                     aten::scaled_dot_product_attention         0.46%      24.480us         3.48%     183.685us      61.228us       0.000us         0.00%       2.990ms     996.566us             3  
+              aten::_scaled_dot_product_flash_attention         0.36%      18.972us         3.01%     159.205us      53.068us       0.000us         0.00%       2.990ms     996.566us             3  
+                         aten::_flash_attention_forward         0.75%      39.470us         2.21%     116.583us      38.861us       2.990ms        79.38%       2.990ms     996.566us             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       2.990ms        79.38%       2.990ms     996.566us             3  
+                                       aten::contiguous         0.20%      10.370us        32.06%       1.693ms     141.118us       0.000us         0.00%     835.605us      69.634us            12  
+                                            aten::clone         0.56%      29.562us        31.86%       1.683ms     140.254us       0.000us         0.00%     835.605us      69.634us            12  
+                                            aten::copy_         1.55%      81.613us        30.00%       1.585ms     132.057us     776.758us        20.62%     835.605us      69.634us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     776.758us        20.62%     776.758us      64.730us            12  
+                                Activity Buffer Request        26.94%       1.423ms        26.94%       1.423ms       1.423ms      58.847us         1.56%      58.847us      58.847us             1  
+                                        aten::transpose         0.97%      51.460us         1.30%      68.660us       2.861us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.33%      17.200us         0.33%      17.200us       0.717us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.39%      20.693us         1.67%      88.333us       5.889us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.54%      81.451us         1.54%      81.451us       3.394us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         1.97%     104.004us         1.97%     104.004us       6.934us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.28%      14.530us         0.28%      14.530us       4.843us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.04%       1.902us         0.04%       1.902us       0.317us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.600us         0.07%       3.600us       1.200us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.69%       3.100ms        58.69%       3.100ms       3.100ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 5.282ms
+Self CUDA time total: 3.766ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                         torch_flash_ma         4.63%     260.119us        43.14%       2.422ms       2.422ms       0.000us         0.00%       3.911ms       3.911ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       3.865ms       100.31%       3.865ms       3.865ms             1  
+                     aten::scaled_dot_product_attention         0.43%      24.361us         3.22%     180.586us      60.195us       0.000us         0.00%       3.069ms       1.023ms             3  
+              aten::_scaled_dot_product_flash_attention         0.35%      19.401us         2.78%     156.225us      52.075us       0.000us         0.00%       3.069ms       1.023ms             3  
+                         aten::_flash_attention_forward         0.68%      38.111us         2.03%     114.053us      38.018us       3.069ms        79.64%       3.069ms       1.023ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.069ms        79.64%       3.069ms       1.023ms             3  
+                                       aten::contiguous         0.17%       9.669us        34.46%       1.935ms     161.211us       0.000us         0.00%     842.147us      70.179us            12  
+                                            aten::clone         0.54%      30.453us        34.29%       1.925ms     160.405us       0.000us         0.00%     842.147us      70.179us            12  
+                                            aten::copy_         1.42%      79.471us        32.63%       1.832ms     152.656us     784.675us        20.36%     842.147us      70.179us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     784.675us        20.36%     784.675us      65.390us            12  
+                                Activity Buffer Request        26.20%       1.471ms        26.20%       1.471ms       1.471ms      57.472us         1.49%      57.472us      57.472us             1  
+                                        aten::transpose         0.92%      51.697us         1.23%      69.261us       2.886us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.31%      17.564us         0.31%      17.564us       0.732us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.36%      20.299us         1.45%      81.452us       5.430us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.34%      75.405us         1.34%      75.405us       3.142us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         5.43%     304.654us         5.43%     304.654us      20.310us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.25%      13.960us         0.25%      13.960us       4.653us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.839us         0.03%       1.839us       0.306us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.07%       3.750us         0.07%       3.750us       1.250us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        56.86%       3.192ms        56.86%       3.192ms       3.192ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 5.614ms
+Self CUDA time total: 3.854ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                         torch_flash_ma         5.20%     312.192us        40.27%       2.420ms       2.420ms       0.000us         0.00%       4.370ms       4.370ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.320ms       100.26%       4.320ms       4.320ms             1  
+                     aten::scaled_dot_product_attention         0.42%      25.401us         3.13%     188.317us      62.772us       0.000us         0.00%       3.499ms       1.166ms             3  
+              aten::_scaled_dot_product_flash_attention         0.34%      20.373us         2.71%     162.916us      54.305us       0.000us         0.00%       3.499ms       1.166ms             3  
+                         aten::_flash_attention_forward         0.70%      41.822us         1.99%     119.463us      39.821us       3.499ms        81.21%       3.499ms       1.166ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.499ms        81.21%       3.499ms       1.166ms             3  
+                                       aten::contiguous         0.17%      10.061us        31.18%       1.873ms     156.120us       0.000us         0.00%     870.813us      72.568us            12  
+                                            aten::clone         0.51%      30.510us        31.01%       1.863ms     155.281us       0.000us         0.00%     870.813us      72.568us            12  
+                                            aten::copy_         1.32%      79.253us        29.46%       1.770ms     147.488us     809.726us        18.79%     870.813us      72.568us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     809.726us        18.79%     809.726us      67.477us            12  
+                                Activity Buffer Request        23.71%       1.425ms        23.71%       1.425ms       1.425ms      61.087us         1.42%      61.087us      61.087us             1  
+                                        aten::transpose         0.85%      51.371us         1.15%      68.940us       2.873us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.29%      17.569us         0.29%      17.569us       0.732us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.34%      20.420us         1.39%      83.415us       5.561us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.27%      76.235us         1.27%      76.235us       3.176us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         4.81%     288.717us         4.81%     288.717us      19.248us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.26%      15.360us         0.26%      15.360us       5.120us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.980us         0.03%       1.980us       0.330us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.06%       3.780us         0.06%       3.780us       1.260us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        59.73%       3.589ms        59.73%       3.589ms       3.589ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 6.009ms
+Self CUDA time total: 4.309ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                         torch_flash_ma         4.62%     283.749us        39.30%       2.416ms       2.416ms       0.000us         0.00%       4.488ms       4.488ms             1  
+                                         torch_flash_ma         0.00%       0.000us         0.00%       0.000us       0.000us       4.437ms       100.26%       4.437ms       4.437ms             1  
+                     aten::scaled_dot_product_attention         0.41%      25.050us         2.99%     183.606us      61.202us       0.000us         0.00%       3.606ms       1.202ms             3  
+              aten::_scaled_dot_product_flash_attention         0.32%      19.512us         2.58%     158.556us      52.852us       0.000us         0.00%       3.606ms       1.202ms             3  
+                         aten::_flash_attention_forward         0.64%      39.583us         1.89%     116.223us      38.741us       3.606ms        81.47%       3.606ms       1.202ms             3  
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...         0.00%       0.000us         0.00%       0.000us       0.000us       3.606ms        81.47%       3.606ms       1.202ms             3  
+                                       aten::contiguous         0.16%       9.930us        30.93%       1.901ms     158.420us       0.000us         0.00%     882.206us      73.517us            12  
+                                            aten::clone         0.49%      30.220us        30.76%       1.891ms     157.592us       0.000us         0.00%     882.206us      73.517us            12  
+                                            aten::copy_         1.34%      82.326us        29.23%       1.797ms     149.726us     820.351us        18.53%     882.206us      73.517us            12  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     820.351us        18.53%     820.351us      68.363us            12  
+                                Activity Buffer Request        23.42%       1.439ms        23.42%       1.439ms       1.439ms      61.855us         1.40%      61.855us      61.855us             1  
+                                        aten::transpose         0.85%      52.248us         1.14%      70.082us       2.920us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.29%      17.834us         0.29%      17.834us       0.743us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.33%      20.531us         1.36%      83.782us       5.585us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.26%      77.251us         1.26%      77.251us       3.219us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         4.84%     297.592us         4.84%     297.592us      19.839us       0.000us         0.00%       0.000us       0.000us            15  
+                                    aten::empty_strided         0.24%      14.660us         0.24%      14.660us       4.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                 cudaDeviceGetAttribute         0.03%       1.929us         0.03%       1.929us       0.321us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.06%       3.839us         0.06%       3.839us       1.280us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        60.70%       3.731ms        60.70%       3.731ms       3.731ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 6.147ms
+Self CUDA time total: 4.426ms
+
+
+impl                     wl                  p50(ms)  ok
+torch_flash_ma           cuda_attn_L128_bfloat16     1.21  True
+torch_flash_ma           cuda_attn_L256_bfloat16     1.27  True
+torch_flash_ma           cuda_attn_L320_bfloat16     1.29  True
+torch_flash_ma           cuda_attn_L384_bfloat16     1.32  True
+torch_flash_ma           cuda_attn_L448_bfloat16     1.47  True
+torch_flash_ma           cuda_attn_L512_bfloat16     1.49  True
+
+
+

Artifacts:

+attention.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html new file mode 100644 index 0000000000000000000000000000000000000000..4817c7014f4b0c071581df7216c8b47278369487 --- /dev/null +++ b/flash_attn/impls/hf_kernels_flash_attn.html @@ -0,0 +1,4088 @@ + + + + + + hf_kernels_flash_attn + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

HF Kernels - Flash Attention

+

HuggingFace Kernels Flash Attention Benchmark

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 5.88s + | + +Raw +GitHub +🤗 HF +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the flash attention kernel
+hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn")
+
+
+def hf_flash_attention(query, key, value):
+    """HuggingFace Kernels Flash Attention"""
+    return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0]
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.ATTENTION,
+    impl_name="hf_kernels_flash_attn",
+    impl_tags={"family": "hf-kernels", "backend": "flash-attn", "compile": "none"},
+    impl_func=hf_flash_attention,
+)
+
+ +
+
+
+
+
Running attention benchmark on cuda with 6 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                  hf_kernels_flash_attn         3.54%     153.223us        41.10%       1.781ms       1.781ms       0.000us         0.00%       3.710ms       3.710ms             1  
+                               _flash_attn_9e27194::fwd         1.64%      71.013us        37.57%       1.628ms     542.522us       2.765ms       100.00%       3.710ms       1.237ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       2.766ms       100.05%       2.766ms       2.766ms             1  
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       2.765ms       100.00%       2.765ms     921.626us             3  
+                                Activity Buffer Request        32.85%       1.423ms        32.85%       1.423ms       1.423ms     945.530us        34.20%     945.530us     945.530us             1  
+                                 cudaDeviceGetAttribute         0.11%       4.920us         0.11%       4.920us       0.328us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.37%      16.201us         1.19%      51.582us      17.194us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.82%      35.381us         0.82%      35.381us      11.794us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.55%      23.891us         0.55%      23.891us       2.655us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.27%      11.501us         0.27%      11.501us       3.834us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.96%      41.661us         0.96%      41.661us      13.887us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        58.90%       2.552ms        58.90%       2.552ms       2.552ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 4.332ms
+Self CUDA time total: 2.765ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                  hf_kernels_flash_attn         1.95%      87.173us        36.43%       1.628ms       1.628ms       0.000us         0.00%       3.993ms       3.993ms             1  
+                               _flash_attn_9e27194::fwd         1.10%      49.286us        34.48%       1.541ms     513.554us       2.982ms       100.00%       3.993ms       1.331ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       2.984ms       100.06%       2.984ms       2.984ms             1  
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       2.982ms       100.00%       2.982ms     993.983us             3  
+                                Activity Buffer Request        31.65%       1.414ms        31.65%       1.414ms       1.414ms       1.011ms        33.92%       1.011ms       1.011ms             1  
+                                 cudaDeviceGetAttribute         0.09%       3.827us         0.09%       3.827us       0.255us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.16%       7.330us         0.51%      22.831us       7.610us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.35%      15.501us         0.35%      15.501us       5.167us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.46%      20.669us         0.46%      20.669us       2.297us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.08%       3.520us         0.08%       3.520us       1.173us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.59%      26.211us         0.59%      26.211us       8.737us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        63.57%       2.841ms        63.57%       2.841ms       2.841ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 4.469ms
+Self CUDA time total: 2.982ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                  hf_kernels_flash_attn         2.39%     107.943us        36.87%       1.664ms       1.664ms       0.000us         0.00%       4.011ms       4.011ms             1  
+                               _flash_attn_9e27194::fwd         1.08%      48.663us        34.47%       1.556ms     518.528us       2.994ms       100.00%       4.011ms       1.337ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       2.996ms       100.05%       2.996ms       2.996ms             1  
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       2.994ms       100.00%       2.994ms     998.054us             3  
+                                Activity Buffer Request        31.64%       1.428ms        31.64%       1.428ms       1.428ms       1.017ms        33.96%       1.017ms       1.017ms             1  
+                                 cudaDeviceGetAttribute         0.09%       4.050us         0.09%       4.050us       0.270us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.16%       7.029us         0.54%      24.521us       8.174us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.39%      17.492us         0.39%      17.492us       5.831us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.46%      20.589us         0.46%      20.589us       2.288us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.08%       3.660us         0.08%       3.660us       1.220us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.59%      26.452us         0.59%      26.452us       8.817us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        63.13%       2.849ms        63.13%       2.849ms       2.849ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 4.512ms
+Self CUDA time total: 2.994ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                  hf_kernels_flash_attn         2.37%     113.154us        39.04%       1.864ms       1.864ms       0.000us         0.00%       4.086ms       4.086ms             1  
+                               _flash_attn_9e27194::fwd         1.02%      48.863us        36.67%       1.751ms     583.543us       3.059ms       100.00%       4.086ms       1.362ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.060ms       100.05%       3.060ms       3.060ms             1  
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.059ms       100.00%       3.059ms       1.020ms             3  
+                                Activity Buffer Request        29.92%       1.429ms        29.92%       1.429ms       1.429ms       1.027ms        33.57%       1.027ms       1.027ms             1  
+                                 cudaDeviceGetAttribute         0.08%       3.821us         0.08%       3.821us       0.255us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.16%       7.819us         0.54%      25.920us       8.640us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.38%      18.101us         0.38%      18.101us       6.034us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.44%      21.109us         0.44%      21.109us       2.345us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.08%       3.840us         0.08%       3.840us       1.280us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         4.58%     218.538us         4.58%     218.538us      72.846us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        60.96%       2.910ms        60.96%       2.910ms       2.910ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 4.774ms
+Self CUDA time total: 3.059ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                  hf_kernels_flash_attn         2.11%     109.115us        34.87%       1.804ms       1.804ms       0.000us         0.00%       4.702ms       4.702ms             1  
+                               _flash_attn_9e27194::fwd         0.94%      48.879us        32.76%       1.695ms     565.076us       3.518ms       100.00%       4.702ms       1.567ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.519ms       100.04%       3.519ms       3.519ms             1  
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.518ms       100.00%       3.518ms       1.173ms             3  
+                                Activity Buffer Request        27.57%       1.427ms        27.57%       1.427ms       1.427ms       1.184ms        33.66%       1.184ms       1.184ms             1  
+                                 cudaDeviceGetAttribute         0.07%       3.810us         0.07%       3.810us       0.254us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.14%       7.040us         0.48%      25.061us       8.354us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.35%      18.021us         0.35%      18.021us       6.007us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.40%      20.762us         0.40%      20.762us       2.307us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.07%       3.731us         0.07%       3.731us       1.244us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.21%     166.285us         3.21%     166.285us      55.428us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        65.13%       3.370ms        65.13%       3.370ms       3.370ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 5.175ms
+Self CUDA time total: 3.518ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                  hf_kernels_flash_attn         2.00%     105.404us        33.86%       1.781ms       1.781ms       0.000us         0.00%       4.846ms       4.846ms             1  
+                               _flash_attn_9e27194::fwd         0.97%      50.822us        31.86%       1.675ms     558.446us       3.623ms       100.00%       4.846ms       1.615ms             3  
+                                  hf_kernels_flash_attn         0.00%       0.000us         0.00%       0.000us       0.000us       3.624ms       100.04%       3.624ms       3.624ms             1  
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...         0.00%       0.000us         0.00%       0.000us       0.000us       3.623ms       100.00%       3.623ms       1.208ms             3  
+                                Activity Buffer Request        26.72%       1.405ms        26.72%       1.405ms       1.405ms       1.223ms        33.77%       1.223ms       1.223ms             1  
+                                 cudaDeviceGetAttribute         0.08%       4.369us         0.08%       4.369us       0.291us       0.000us         0.00%       0.000us       0.000us            15  
+                                       aten::empty_like         0.15%       7.679us         0.48%      25.141us       8.380us       0.000us         0.00%       0.000us       0.000us             3  
+                                    aten::empty_strided         0.33%      17.462us         0.33%      17.462us       5.821us       0.000us         0.00%       0.000us       0.000us             3  
+                                            aten::empty         0.40%      21.081us         0.40%      21.081us       2.342us       0.000us         0.00%       0.000us       0.000us             9  
+                                   cudaFuncSetAttribute         0.07%       3.770us         0.07%       3.770us       1.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.13%     164.746us         3.13%     164.746us      54.915us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        66.14%       3.478ms        66.14%       3.478ms       3.478ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 5.259ms
+Self CUDA time total: 3.623ms
+
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_flash_attn    cuda_attn_L128_bfloat16     0.95  True
+hf_kernels_flash_attn    cuda_attn_L256_bfloat16     0.99  True
+hf_kernels_flash_attn    cuda_attn_L320_bfloat16     1.04  True
+hf_kernels_flash_attn    cuda_attn_L384_bfloat16     1.06  True
+hf_kernels_flash_attn    cuda_attn_L448_bfloat16     1.21  True
+hf_kernels_flash_attn    cuda_attn_L512_bfloat16     1.21  True
+
+
+Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s] +Fetching 20 files: 10%|█ | 2/20 [00:01<00:16, 1.12it/s] +Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 11.15it/s] +
+
+

Artifacts:

+attention.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html new file mode 100644 index 0000000000000000000000000000000000000000..5b8a9bc38528f0c5161a3d259e80bf519be70a90 --- /dev/null +++ b/flash_attn/impls/hf_kernels_flash_attn3.html @@ -0,0 +1,4081 @@ + + + + + + hf_kernels_flash_attn3 + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

HF Kernels - Flash Attention 3

+

HuggingFace Kernels Flash Attention 3 Benchmark

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 6.50s + | + +Raw +GitHub +🤗 HF +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the flash attention 3 kernel
+hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
+
+
+def hf_flash_attention3(query, key, value):
+    return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.ATTENTION,
+    impl_name="hf_kernels_flash_attn3",
+    impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
+    impl_func=hf_flash_attention3,
+)
+
+ +
+
+
+
+
Running attention benchmark on cuda with 6 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                 hf_kernels_flash_attn3         3.80%     163.585us        44.55%       1.916ms       1.916ms       0.000us         0.00%       3.598ms       3.598ms             1  
+                                          FlashAttnFunc         3.38%     145.315us        40.75%       1.753ms     584.213us       0.000us         0.00%       3.598ms       1.199ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.86%      80.133us        37.37%       1.607ms     535.775us       2.702ms       100.00%       3.598ms       1.199ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.704ms       100.05%       2.704ms       2.704ms             1  
+void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.702ms       100.00%       2.702ms     900.800us             3  
+                                Activity Buffer Request        33.08%       1.423ms        33.08%       1.423ms       1.423ms     895.776us        33.15%     895.776us     895.776us             1  
+                                            aten::empty         1.02%      43.812us         1.02%      43.812us       7.302us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.30%      13.081us         0.30%      13.081us       4.360us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.10%      47.211us         1.10%      47.211us      15.737us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        55.45%       2.385ms        55.45%       2.385ms       2.385ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 4.301ms
+Self CUDA time total: 2.702ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                 hf_kernels_flash_attn3         2.35%     101.013us        40.06%       1.725ms       1.725ms       0.000us         0.00%       3.751ms       3.751ms             1  
+                                          FlashAttnFunc         2.16%      92.983us        37.71%       1.624ms     541.352us       0.000us         0.00%       3.751ms       1.250ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.19%      51.175us        35.55%       1.531ms     510.358us       2.802ms       100.00%       3.751ms       1.250ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.803ms       100.06%       2.803ms       2.803ms             1  
+void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.802ms       100.00%       2.802ms     933.921us             3  
+                                Activity Buffer Request        32.90%       1.417ms        32.90%       1.417ms       1.417ms     949.686us        33.90%     949.686us     949.686us             1  
+                                            aten::empty         0.63%      27.091us         0.63%      27.091us       4.515us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       5.239us         0.12%       5.239us       1.746us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.72%      30.870us         0.72%      30.870us      10.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        59.94%       2.581ms        59.94%       2.581ms       2.581ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 4.306ms
+Self CUDA time total: 2.802ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                 hf_kernels_flash_attn3         2.33%     100.994us        40.09%       1.739ms       1.739ms       0.000us         0.00%       3.778ms       3.778ms             1  
+                                          FlashAttnFunc         2.19%      94.944us        37.76%       1.638ms     545.852us       0.000us         0.00%       3.778ms       1.259ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.20%      52.112us        35.57%       1.543ms     514.204us       2.819ms       100.00%       3.778ms       1.259ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.820ms       100.05%       2.820ms       2.820ms             1  
+void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.819ms       100.00%       2.819ms     939.550us             3  
+                                Activity Buffer Request        32.79%       1.422ms        32.79%       1.422ms       1.422ms     959.198us        34.03%     959.198us     959.198us             1  
+                                            aten::empty         0.60%      26.051us         0.60%      26.051us       4.342us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       5.409us         0.12%       5.409us       1.803us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.85%      36.931us         0.85%      36.931us      12.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        59.91%       2.599ms        59.91%       2.599ms       2.599ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 4.337ms
+Self CUDA time total: 2.819ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                 hf_kernels_flash_attn3         2.88%     135.094us        43.08%       2.020ms       2.020ms       0.000us         0.00%       3.874ms       3.874ms             1  
+                                          FlashAttnFunc         2.10%      98.504us        40.20%       1.885ms     628.185us       0.000us         0.00%       3.874ms       1.291ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.10%      51.632us        38.10%       1.786ms     595.350us       2.895ms       100.00%       3.874ms       1.291ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       2.897ms       100.06%       2.897ms       2.897ms             1  
+void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.895ms       100.00%       2.895ms     965.011us             3  
+                                Activity Buffer Request        30.58%       1.434ms        30.58%       1.434ms       1.434ms     979.229us        33.82%     979.229us     979.229us             1  
+                                            aten::empty         0.58%      27.080us         0.58%      27.080us       4.513us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.11%       5.380us         0.11%       5.380us       1.793us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         5.72%     268.289us         5.72%     268.289us      89.430us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        56.92%       2.668ms        56.92%       2.668ms       2.668ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 4.688ms
+Self CUDA time total: 2.895ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                 hf_kernels_flash_attn3         2.52%     128.963us        37.26%       1.903ms       1.903ms       0.000us         0.00%       4.575ms       4.575ms             1  
+                                          FlashAttnFunc         1.87%      95.425us        34.74%       1.774ms     591.441us       0.000us         0.00%       4.575ms       1.525ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.01%      51.593us        32.87%       1.679ms     559.632us       3.427ms       100.00%       4.575ms       1.525ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       3.429ms       100.05%       3.429ms       3.429ms             1  
+void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.427ms       100.00%       3.427ms       1.142ms             3  
+                                Activity Buffer Request        27.82%       1.421ms        27.82%       1.421ms       1.421ms       1.148ms        33.49%       1.148ms       1.148ms             1  
+                                            aten::empty         0.55%      28.251us         0.55%      28.251us       4.709us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.10%       5.249us         0.10%       5.249us       1.750us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.38%     172.866us         3.38%     172.866us      57.622us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        62.74%       3.205ms        62.74%       3.205ms       3.205ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 5.108ms
+Self CUDA time total: 3.427ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                 hf_kernels_flash_attn3         2.37%     119.165us        36.69%       1.842ms       1.842ms       0.000us         0.00%       4.545ms       4.545ms             1  
+                                          FlashAttnFunc         1.86%      93.463us        34.32%       1.723ms     574.423us       0.000us         0.00%       4.545ms       1.515ms             3  
+                        _flash_attn3_48fe103_dirty::fwd         1.01%      50.561us        32.46%       1.630ms     543.268us       3.398ms       100.00%       4.545ms       1.515ms             3  
+                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us       3.400ms       100.05%       3.400ms       3.400ms             1  
+void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.398ms       100.00%       3.398ms       1.133ms             3  
+                                Activity Buffer Request        27.47%       1.379ms        27.47%       1.379ms       1.379ms       1.147ms        33.76%       1.147ms       1.147ms             1  
+                                            aten::empty         0.56%      28.202us         0.56%      28.202us       4.700us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.10%       5.090us         0.10%       5.090us       1.697us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.32%     166.515us         3.32%     166.515us      55.505us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        63.31%       3.179ms        63.31%       3.179ms       3.179ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 5.022ms
+Self CUDA time total: 3.398ms
+
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     0.93  True
+hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     0.96  True
+hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     1.01  True
+hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     1.01  True
+hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     1.18  True
+hf_kernels_flash_attn3   cuda_attn_L512_bfloat16     1.17  True
+
+
+
▶ UV Install Logs
+ +
+
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] +Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.06it/s] +Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.12it/s]
+
+

Artifacts:

+attention.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/flash_attn/impls/index.html b/flash_attn/impls/index.html new file mode 100644 index 0000000000000000000000000000000000000000..59a4fb994db1d910bbc3e0e4a28c04e81908a615 --- /dev/null +++ b/flash_attn/impls/index.html @@ -0,0 +1,93 @@ + + + + + + Index of /flash_attn/impls + + + +
+ ← back +
+

Index of /flash_attn/impls

+ + + \ No newline at end of file diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html new file mode 100644 index 0000000000000000000000000000000000000000..9802f3f658c65e57e8526f2af8160462ea71be6e --- /dev/null +++ b/flash_attn/impls/mem_efficient_attention.html @@ -0,0 +1,4175 @@ + + + + + + mem_efficient_attention + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

Memory Efficient Attention Implementation

+

Memory Efficient SDPA Benchmark

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 35.14s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_mem_eff(q, k, v):
+    qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+    with torch.nn.attention.sdpa_kernel(
+        torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION
+    ):
+        o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+    return o.transpose(1, 2).contiguous()
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.ATTENTION,
+    impl_name="torch_mem_eff",
+    impl_tags={"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"},
+    impl_func=torch_mem_eff,
+)
+
+ +
+
+
+
+
Running attention benchmark on cuda with 6 workloads.
+
+======================================================================
+PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                          torch_mem_eff         4.95%     352.351us        32.76%       2.334ms       2.334ms       0.000us         0.00%       5.540ms       5.540ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.523ms       100.61%       5.523ms       5.523ms             1  
+                     aten::scaled_dot_product_attention         0.42%      30.002us         2.65%     188.407us      62.802us       0.000us         0.00%       4.866ms       1.622ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.34%      24.112us         2.22%     158.405us      52.802us       0.000us         0.00%       4.866ms       1.622ms             3  
+                     aten::_efficient_attention_forward         0.50%      35.512us         1.50%     106.553us      35.518us       4.866ms        88.65%       4.866ms       1.622ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       4.866ms        88.65%       4.866ms       1.622ms             3  
+                                       aten::contiguous         0.17%      12.230us        24.19%       1.723ms     191.466us       0.000us         0.00%     673.885us      74.876us             9  
+                                            aten::clone         0.48%      34.032us        24.02%       1.711ms     190.107us       0.000us         0.00%     673.885us      74.876us             9  
+                                            aten::copy_         1.04%      73.980us        22.51%       1.603ms     178.136us     623.037us        11.35%     673.885us      74.876us             9  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     623.037us        11.35%     623.037us      69.226us             9  
+                                Activity Buffer Request        20.23%       1.441ms        20.23%       1.441ms       1.441ms      50.848us         0.93%      50.848us      50.848us             1  
+                                        aten::transpose         1.03%      73.058us         1.37%      97.392us       4.058us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.34%      24.334us         0.34%      24.334us       1.014us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.28%      19.590us         1.03%      73.701us       8.189us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         1.26%      89.621us         1.26%      89.621us       4.268us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         1.58%     112.598us         1.58%     112.598us       9.383us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.04%       3.160us         0.04%       3.160us       1.053us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.12%       8.400us         0.12%       8.400us       2.800us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        67.24%       4.789ms        67.24%       4.789ms       4.789ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 7.123ms
+Self CUDA time total: 5.489ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                          torch_mem_eff         3.15%     231.099us        27.84%       2.044ms       2.044ms       0.000us         0.00%       5.902ms       5.902ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       5.856ms       100.14%       5.856ms       5.856ms             1  
+                     aten::scaled_dot_product_attention         0.26%      19.041us         1.91%     140.484us      46.828us       0.000us         0.00%       5.210ms       1.737ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.25%      18.340us         1.65%     121.443us      40.481us       0.000us         0.00%       5.210ms       1.737ms             3  
+                     aten::_efficient_attention_forward         0.40%      29.263us         1.10%      80.783us      26.928us       5.210ms        89.09%       5.210ms       1.737ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.210ms        89.09%       5.210ms       1.737ms             3  
+                                       aten::contiguous         0.10%       7.239us        22.19%       1.629ms     181.023us       0.000us         0.00%     692.607us      76.956us             9  
+                                            aten::clone         0.29%      21.632us        22.09%       1.622ms     180.219us       0.000us         0.00%     692.607us      76.956us             9  
+                                            aten::copy_         0.87%      63.554us        21.13%       1.551ms     172.359us     638.271us        10.91%     692.607us      76.956us             9  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     638.271us        10.91%     638.271us      70.919us             9  
+                                Activity Buffer Request        19.39%       1.423ms        19.39%       1.423ms       1.423ms      54.336us         0.93%      54.336us      54.336us             1  
+                                        aten::transpose         0.66%      48.509us         0.89%      65.581us       2.733us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.23%      17.072us         0.23%      17.072us       0.711us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.16%      11.700us         0.67%      49.102us       5.456us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.83%      61.232us         0.83%      61.232us       2.916us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         1.18%      86.372us         1.18%      86.372us       7.198us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.340us         0.03%       2.340us       0.780us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.05%       3.500us         0.05%       3.500us       1.167us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        72.16%       5.297ms        72.16%       5.297ms       5.297ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 7.341ms
+Self CUDA time total: 5.848ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                          torch_mem_eff         2.94%     229.483us        29.69%       2.318ms       2.318ms       0.000us         0.00%       6.099ms       6.099ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.049ms       100.14%       6.049ms       6.049ms             1  
+                     aten::scaled_dot_product_attention         0.23%      17.971us         1.79%     139.464us      46.488us       0.000us         0.00%       5.384ms       1.795ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.23%      18.090us         1.56%     121.493us      40.498us       0.000us         0.00%       5.384ms       1.795ms             3  
+                     aten::_efficient_attention_forward         0.36%      27.830us         1.04%      80.963us      26.988us       5.384ms        89.13%       5.384ms       1.795ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.384ms        89.13%       5.384ms       1.795ms             3  
+                                       aten::contiguous         0.09%       7.278us        24.41%       1.906ms     211.734us       0.000us         0.00%     714.652us      79.406us             9  
+                                            aten::clone         0.28%      21.781us        24.31%       1.898ms     210.925us       0.000us         0.00%     714.652us      79.406us             9  
+                                            aten::copy_         0.80%      62.662us        23.36%       1.824ms     202.683us     656.540us        10.87%     714.652us      79.406us             9  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     656.540us        10.87%     656.540us      72.949us             9  
+                                Activity Buffer Request        21.74%       1.697ms        21.74%       1.697ms       1.697ms      58.112us         0.96%      58.112us      58.112us             1  
+                                        aten::transpose         0.63%      48.810us         0.84%      65.850us       2.744us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.22%      17.040us         0.22%      17.040us       0.710us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.14%      11.161us         0.67%      52.392us       5.821us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.87%      67.583us         0.87%      67.583us       3.218us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         1.09%      85.261us         1.09%      85.261us       7.105us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.451us         0.03%       2.451us       0.817us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.290us         0.04%       3.290us       1.097us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        70.31%       5.490ms        70.31%       5.490ms       5.490ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 7.808ms
+Self CUDA time total: 6.041ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                          torch_mem_eff         2.96%     232.645us        28.95%       2.277ms       2.277ms       0.000us         0.00%       6.207ms       6.207ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.157ms       100.14%       6.157ms       6.157ms             1  
+                     aten::scaled_dot_product_attention         0.23%      18.052us         1.76%     138.596us      46.199us       0.000us         0.00%       5.492ms       1.831ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.23%      17.731us         1.53%     120.544us      40.181us       0.000us         0.00%       5.492ms       1.831ms             3  
+                     aten::_efficient_attention_forward         0.35%      27.329us         1.02%      80.113us      26.704us       5.492ms        89.32%       5.492ms       1.831ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.492ms        89.32%       5.492ms       1.831ms             3  
+                                       aten::contiguous         0.09%       7.269us        23.67%       1.862ms     206.848us       0.000us         0.00%     714.624us      79.403us             9  
+                                            aten::clone         0.28%      21.997us        23.58%       1.854ms     206.041us       0.000us         0.00%     714.624us      79.403us             9  
+                                            aten::copy_         0.89%      69.616us        22.61%       1.779ms     197.614us     656.513us        10.68%     714.624us      79.403us             9  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     656.513us        10.68%     656.513us      72.946us             9  
+                                Activity Buffer Request        17.99%       1.415ms        17.99%       1.415ms       1.415ms      58.111us         0.95%      58.111us      58.111us             1  
+                                        aten::transpose         0.63%      49.422us         0.84%      66.332us       2.764us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.22%      16.910us         0.22%      16.910us       0.705us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.15%      11.593us         0.68%      53.843us       5.983us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.87%      68.381us         0.87%      68.381us       3.256us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         4.00%     314.941us         4.00%     314.941us      26.245us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.380us         0.03%       2.380us       0.793us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.242us         0.04%       3.242us       1.081us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        71.05%       5.588ms        71.05%       5.588ms       5.588ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 7.865ms
+Self CUDA time total: 6.149ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                          torch_mem_eff         2.91%     232.917us        28.19%       2.257ms       2.257ms       0.000us         0.00%       6.364ms       6.364ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.313ms       100.13%       6.313ms       6.313ms             1  
+                     aten::scaled_dot_product_attention         0.22%      17.912us         1.77%     142.075us      47.358us       0.000us         0.00%       5.641ms       1.880ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.23%      18.730us         1.55%     124.163us      41.388us       0.000us         0.00%       5.641ms       1.880ms             3  
+                     aten::_efficient_attention_forward         0.36%      29.090us         1.02%      81.873us      27.291us       5.641ms        89.47%       5.641ms       1.880ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.641ms        89.47%       5.641ms       1.880ms             3  
+                                       aten::contiguous         0.09%       7.221us        22.98%       1.840ms     204.428us       0.000us         0.00%     723.455us      80.384us             9  
+                                            aten::clone         0.27%      21.690us        22.89%       1.833ms     203.626us       0.000us         0.00%     723.455us      80.384us             9  
+                                            aten::copy_         0.78%      62.812us        21.99%       1.761ms     195.631us     663.839us        10.53%     723.455us      80.384us             9  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     663.839us        10.53%     663.839us      73.760us             9  
+                                Activity Buffer Request        18.37%       1.471ms        18.37%       1.471ms       1.471ms      59.616us         0.95%      59.616us      59.616us             1  
+                                        aten::transpose         0.60%      48.283us         0.82%      65.922us       2.747us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.22%      17.639us         0.22%      17.639us       0.735us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.15%      11.816us         0.63%      50.264us       5.585us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.80%      63.840us         0.80%      63.840us       3.040us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         3.11%     249.257us         3.11%     249.257us      20.771us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.260us         0.03%       2.260us       0.753us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.100us         0.04%       3.100us       1.033us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        71.81%       5.750ms        71.81%       5.750ms       5.750ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 8.007ms
+Self CUDA time total: 6.304ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                          torch_mem_eff         3.10%     262.407us        28.45%       2.407ms       2.407ms       0.000us         0.00%       6.700ms       6.700ms             1  
+                                          torch_mem_eff         0.00%       0.000us         0.00%       0.000us       0.000us       6.648ms       100.13%       6.648ms       6.648ms             1  
+                     aten::scaled_dot_product_attention         0.22%      18.361us         1.72%     145.216us      48.405us       0.000us         0.00%       5.968ms       1.989ms             3  
+          aten::_scaled_dot_product_efficient_attention         0.22%      18.717us         1.50%     126.855us      42.285us       0.000us         0.00%       5.968ms       1.989ms             3  
+                     aten::_efficient_attention_forward         0.34%      29.081us         1.00%      84.393us      28.131us       5.968ms        89.89%       5.968ms       1.989ms             3  
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...         0.00%       0.000us         0.00%       0.000us       0.000us       5.968ms        89.89%       5.968ms       1.989ms             3  
+                                       aten::contiguous         0.09%       7.641us        23.04%       1.949ms     216.566us       0.000us         0.00%     731.964us      81.329us             9  
+                                            aten::clone         0.29%      24.377us        22.95%       1.941ms     215.717us       0.000us         0.00%     731.964us      81.329us             9  
+                                            aten::copy_         0.80%      68.015us        22.01%       1.862ms     206.906us     670.941us        10.11%     731.964us      81.329us             9  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     670.941us        10.11%     670.941us      74.549us             9  
+                                Activity Buffer Request        17.04%       1.441ms        17.04%       1.441ms       1.441ms      61.023us         0.92%      61.023us      61.023us             1  
+                                        aten::transpose         0.67%      56.417us         0.87%      73.607us       3.067us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.20%      17.190us         0.20%      17.190us       0.716us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::empty_like         0.14%      12.051us         0.65%      54.922us       6.102us       0.000us         0.00%       0.000us       0.000us             9  
+                                            aten::empty         0.83%      69.821us         0.83%      69.821us       3.325us       0.000us         0.00%       0.000us       0.000us            21  
+                                       cudaLaunchKernel         4.44%     375.855us         4.44%     375.855us      31.321us       0.000us         0.00%       0.000us       0.000us            12  
+                                  cudaStreamIsCapturing         0.03%       2.230us         0.03%       2.230us       0.743us       0.000us         0.00%       0.000us       0.000us             3  
+                                   cudaFuncSetAttribute         0.04%       3.250us         0.04%       3.250us       1.083us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        71.55%       6.053ms        71.55%       6.053ms       6.053ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 8.459ms
+Self CUDA time total: 6.639ms
+
+
+impl                     wl                  p50(ms)  ok
+torch_mem_eff            cuda_attn_L128_bfloat16     1.86  True
+torch_mem_eff            cuda_attn_L256_bfloat16     1.99  True
+torch_mem_eff            cuda_attn_L320_bfloat16     2.02  True
+torch_mem_eff            cuda_attn_L384_bfloat16     2.04  True
+torch_mem_eff            cuda_attn_L448_bfloat16     2.06  True
+torch_mem_eff            cuda_attn_L512_bfloat16     2.22  True
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+attention.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html new file mode 100644 index 0000000000000000000000000000000000000000..f3b4585cfd217bc6f3bfc855913c711a4d339cc0 --- /dev/null +++ b/flash_attn/impls/sage_attention.html @@ -0,0 +1,3949 @@ + + + + + + sage_attention + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

SageAttention Implementation

+

SageAttention Benchmark (INT8 Quantized)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 4.32s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the sage attention kernel
+hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention")
+
+
+def sage_attention(query, key, value):
+    """SageAttention with INT8 Q/K quantization and FP16 P/V"""
+    return hf_kernels_sage_attn.fwd(query, key, value, is_causal=False)[0]
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.ATTENTION,
+    impl_name="sage_int8_fp16",
+    impl_tags={"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"},
+    impl_func=sage_attention,
+)
+
+ +
+
+
+
+
Running attention benchmark on cuda with 6 workloads.
+impl                     wl                  p50(ms)  ok
+sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
+  Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
+  Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
+  Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
+  Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
+  Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
+  Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+
+
+Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] +Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 12.32it/s] +Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 16.93it/s] +
+
+

Artifacts:

+attention.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html new file mode 100644 index 0000000000000000000000000000000000000000..b12c5ba9380949bde34bbedbe6ea5dddcebc46fe --- /dev/null +++ b/flash_attn/impls/xformers.html @@ -0,0 +1,4088 @@ + + + + + + xformers + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

xFormers Memory Efficient Attention

+

xFormers Benchmark

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 5.56s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "xformers",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+import xformers.ops as xops
+
+
+def xformers_attention(q, k, v):
+    """xFormers memory efficient attention"""
+    # xFormers expects [batch, seq_len, heads, head_dim]
+    return xops.memory_efficient_attention(q, k, v)
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.ATTENTION,
+    impl_name="xformers_meff",
+    impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
+    impl_func=xformers_attention,
+)
+
+ +
+
+
+
+
Running attention benchmark on cuda with 6 workloads.
+
+======================================================================
+PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                          xformers_meff        10.99%     493.828us        51.93%       2.334ms       2.334ms       0.000us         0.00%       3.600ms       3.600ms             1  
+                             xformers_flash3::flash_fwd         4.32%     194.118us        40.08%       1.801ms     600.437us       0.000us         0.00%       3.600ms       1.200ms             3  
+                                      flash_attn_3::fwd         1.81%      81.292us        35.76%       1.607ms     535.731us       2.714ms       100.00%       3.600ms       1.200ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.716ms       100.05%       2.716ms       2.716ms             1  
+void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.714ms       100.00%       2.714ms     904.730us             3  
+                                Activity Buffer Request        31.96%       1.436ms        31.96%       1.436ms       1.436ms     885.349us        32.62%     885.349us     885.349us             1  
+                                            aten::empty         0.86%      38.850us         0.86%      38.850us       6.475us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.25%      11.022us         0.25%      11.022us       3.674us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.88%      39.751us         0.88%      39.751us      13.250us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.26%      11.630us         0.87%      38.970us       6.495us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.61%      27.340us         0.61%      27.340us       4.557us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        48.07%       2.160ms        48.07%       2.160ms       2.160ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 4.494ms
+Self CUDA time total: 2.714ms
+
+
+
+======================================================================
+PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                          xformers_meff         7.45%     327.551us        47.96%       2.108ms       2.108ms       0.000us         0.00%       3.684ms       3.684ms             1  
+                             xformers_flash3::flash_fwd         3.56%     156.647us        39.91%       1.754ms     584.750us       0.000us         0.00%       3.684ms       1.228ms             3  
+                                      flash_attn_3::fwd         1.31%      57.602us        36.35%       1.598ms     532.534us       2.754ms       100.00%       3.684ms       1.228ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.755ms       100.06%       2.755ms       2.755ms             1  
+void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.754ms       100.00%       2.754ms     917.895us             3  
+                                Activity Buffer Request        33.31%       1.464ms        33.31%       1.464ms       1.464ms     930.812us        33.80%     930.812us     930.812us             1  
+                                            aten::empty         0.76%      33.251us         0.76%      33.251us       5.542us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.14%       6.040us         0.14%       6.040us       2.013us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.83%      36.590us         0.83%      36.590us      12.197us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.23%      10.130us         0.60%      26.441us       4.407us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.37%      16.311us         0.37%      16.311us       2.719us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        52.04%       2.287ms        52.04%       2.287ms       2.287ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 4.395ms
+Self CUDA time total: 2.754ms
+
+
+
+======================================================================
+PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                          xformers_meff         6.93%     309.631us        45.92%       2.051ms       2.051ms       0.000us         0.00%       3.806ms       3.806ms             1  
+                             xformers_flash3::flash_fwd         3.88%     173.206us        38.45%       1.717ms     572.356us       0.000us         0.00%       3.806ms       1.269ms             3  
+                                      flash_attn_3::fwd         1.30%      58.031us        34.57%       1.544ms     514.621us       2.838ms       100.00%       3.806ms       1.269ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.840ms       100.06%       2.840ms       2.840ms             1  
+void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.838ms       100.00%       2.838ms     945.948us             3  
+                                Activity Buffer Request        31.70%       1.416ms        31.70%       1.416ms       1.416ms     968.572us        34.13%     968.572us     968.572us             1  
+                                            aten::empty         0.70%      31.373us         0.70%      31.373us       5.229us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       5.510us         0.12%       5.510us       1.837us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.74%      33.081us         0.74%      33.081us      11.027us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.19%       8.679us         0.54%      24.060us       4.010us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.34%      15.381us         0.34%      15.381us       2.564us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        54.08%       2.416ms        54.08%       2.416ms       2.416ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 4.466ms
+Self CUDA time total: 2.838ms
+
+
+
+======================================================================
+PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                          xformers_meff         6.70%     313.562us        47.60%       2.227ms       2.227ms       0.000us         0.00%       3.863ms       3.863ms             1  
+                             xformers_flash3::flash_fwd         3.24%     151.796us        40.34%       1.888ms     629.212us       0.000us         0.00%       3.863ms       1.288ms             3  
+                                      flash_attn_3::fwd         1.25%      58.574us        37.10%       1.736ms     578.613us       2.888ms       100.00%       3.863ms       1.288ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       2.890ms       100.06%       2.890ms       2.890ms             1  
+void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       2.888ms       100.00%       2.888ms     962.743us             3  
+                                Activity Buffer Request        30.65%       1.434ms        30.65%       1.434ms       1.434ms     974.434us        33.74%     974.434us     974.434us             1  
+                                            aten::empty         0.64%      30.051us         0.64%      30.051us       5.008us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       5.730us         0.12%       5.730us       1.910us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         4.43%     207.206us         4.43%     207.206us      69.069us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.22%      10.139us         0.56%      26.119us       4.353us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.34%      15.980us         0.34%      15.980us       2.663us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        52.40%       2.452ms        52.40%       2.452ms       2.452ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 4.679ms
+Self CUDA time total: 2.888ms
+
+
+
+======================================================================
+PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                          xformers_meff         6.05%     310.689us        42.88%       2.201ms       2.201ms       0.000us         0.00%       4.489ms       4.489ms             1  
+                             xformers_flash3::flash_fwd         2.93%     150.475us        36.35%       1.866ms     622.001us       0.000us         0.00%       4.489ms       1.496ms             3  
+                                      flash_attn_3::fwd         1.04%      53.593us        33.42%       1.716ms     571.843us       3.365ms       100.00%       4.489ms       1.496ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       3.367ms       100.05%       3.367ms       3.367ms             1  
+void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.365ms       100.00%       3.365ms       1.122ms             3  
+                                Activity Buffer Request        28.02%       1.439ms        28.02%       1.439ms       1.439ms       1.123ms        33.38%       1.123ms       1.123ms             1  
+                                            aten::empty         0.59%      30.191us         0.59%      30.191us       5.032us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.12%       6.030us         0.12%       6.030us       2.010us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.65%     187.166us         3.65%     187.166us      62.389us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.18%       9.272us         0.47%      24.322us       4.054us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.29%      15.050us         0.29%      15.050us       2.508us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        57.12%       2.932ms        57.12%       2.932ms       2.932ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 5.133ms
+Self CUDA time total: 3.365ms
+
+
+
+======================================================================
+PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                          xformers_meff         6.40%     331.462us        43.16%       2.236ms       2.236ms       0.000us         0.00%       4.557ms       4.557ms             1  
+                             xformers_flash3::flash_fwd         2.99%     154.686us        36.26%       1.879ms     626.255us       0.000us         0.00%       4.557ms       1.519ms             3  
+                                      flash_attn_3::fwd         1.13%      58.511us        33.27%       1.724ms     574.693us       3.413ms       100.00%       4.557ms       1.519ms             3  
+                                          xformers_meff         0.00%       0.000us         0.00%       0.000us       0.000us       3.415ms       100.05%       3.415ms       3.415ms             1  
+void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us       3.413ms       100.00%       3.413ms       1.138ms             3  
+                                Activity Buffer Request        27.70%       1.435ms        27.70%       1.435ms       1.435ms       1.144ms        33.52%       1.144ms       1.144ms             1  
+                                            aten::empty         0.61%      31.572us         0.61%      31.572us       5.262us       0.000us         0.00%       0.000us       0.000us             6  
+                                   cudaFuncSetAttribute         0.11%       5.890us         0.11%       5.890us       1.963us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         3.72%     192.906us         3.72%     192.906us      64.302us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::reshape         0.18%       9.270us         0.50%      26.000us       4.333us       0.000us         0.00%       0.000us       0.000us             6  
+                                             aten::view         0.32%      16.730us         0.32%      16.730us       2.788us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        56.84%       2.946ms        56.84%       2.946ms       2.946ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 5.182ms
+Self CUDA time total: 3.413ms
+
+
+impl                     wl                  p50(ms)  ok
+xformers_meff            cuda_attn_L128_bfloat16     0.98  True
+xformers_meff            cuda_attn_L256_bfloat16     1.02  True
+xformers_meff            cuda_attn_L320_bfloat16     1.07  True
+xformers_meff            cuda_attn_L384_bfloat16     1.08  True
+xformers_meff            cuda_attn_L448_bfloat16     1.24  True
+xformers_meff            cuda_attn_L512_bfloat16     1.23  True
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+attention.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/flash_attn/index.html b/flash_attn/index.html new file mode 100644 index 0000000000000000000000000000000000000000..eea7df846d9f2d44c6c6e03a5ac30d00cecd90cf --- /dev/null +++ b/flash_attn/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /flash_attn + + + +
+ ← back +
+

Index of /flash_attn

+ + + \ No newline at end of file diff --git a/flash_attn/results/artifacts/combine/latency.svg b/flash_attn/results/artifacts/combine/latency.svg new file mode 100644 index 0000000000000000000000000000000000000000..2d31b4481d7f215abd56492ac08378eb5fcc9988 --- /dev/null +++ b/flash_attn/results/artifacts/combine/latency.svg @@ -0,0 +1,355 @@ + + + + + + + 2025-10-29T00:37:33.622731 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_attn_L128_bfloat16 + + + + + + + + + + + + + cuda_attn_L256_bfloat16 + + + + + + + + + + + + + cuda_attn_L320_bfloat16 + + + + + + + + + + + + + cuda_attn_L384_bfloat16 + + + + + + + + + + + + + cuda_attn_L448_bfloat16 + + + + + + + + + + + + + cuda_attn_L512_bfloat16 + + + + Workload + + + + + + + + + + + + + + + + + 1.0 + + + + + + + + + + + + + 1.2 + + + + + + + + + + + + + 1.4 + + + + + + + + + + + + + 1.6 + + + + + + + + + + + + + 1.8 + + + + + + + + + + + + + 2.0 + + + + + + + + + + + + + 2.2 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + torch_flash_ma + + + + + + + + + torch_mem_eff + + + + + + + + + xformers_meff + + + + + + + + + hf_kernels_flash_attn + + + + + + + + + hf_kernels_flash_attn3 + + + + + + + + + + \ No newline at end of file diff --git a/flash_attn/results/cells/combine.py b/flash_attn/results/cells/combine.py new file mode 100644 index 0000000000000000000000000000000000000000..5b3cb6ed6cf078138bf247e29ddf57bb5c9e7f82 --- /dev/null +++ b/flash_attn/results/cells/combine.py @@ -0,0 +1,30 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# "matplotlib", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +from kernels_benchmark_tools.core.visuals import generate_combined_results + +# Map display names to uvnote environment variables +cache_env_map = { + "Flash (PyTorch SDPA)": "UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK", + "MemEff (PyTorch SDPA)": "UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK", + "xFormers": "UVNOTE_FILE_XFORMERS_BENCHMARK", + "HF Kernels Flash Attn": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK", + "HF Kernels Flash Attn3": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK", + "SageAttention": "UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK", +} + +# Generate combined results with visualization +generate_combined_results( + cache_env_map=cache_env_map, + output_filename="attention.jsonl", + svg_filename="latency.svg" +) \ No newline at end of file diff --git a/flash_attn/results/combined_results.html b/flash_attn/results/combined_results.html new file mode 100644 index 0000000000000000000000000000000000000000..3d799c214609ad2665c954956251cb91819359f8 --- /dev/null +++ b/flash_attn/results/combined_results.html @@ -0,0 +1,4774 @@ + + + + + + Flash Attention Benchmark - Combined Results + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

Flash Attention Benchmarks - Aggregated Results

+

This document combines benchmark results from multiple Flash Attention implementations.

+

Combined Summary and Visualization

+
+ + + + + + + 2025-10-29T00:37:33.622731 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_attn_L128_bfloat16 + + + + + + + + + + + + + cuda_attn_L256_bfloat16 + + + + + + + + + + + + + cuda_attn_L320_bfloat16 + + + + + + + + + + + + + cuda_attn_L384_bfloat16 + + + + + + + + + + + + + cuda_attn_L448_bfloat16 + + + + + + + + + + + + + cuda_attn_L512_bfloat16 + + + + Workload + + + + + + + + + + + + + + + + + 1.0 + + + + + + + + + + + + + 1.2 + + + + + + + + + + + + + 1.4 + + + + + + + + + + + + + 1.6 + + + + + + + + + + + + + 1.8 + + + + + + + + + + + + + 2.0 + + + + + + + + + + + + + 2.2 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + torch_flash_ma + + + + + + + + + torch_mem_eff + + + + + + + + + xformers_meff + + + + + + + + + hf_kernels_flash_attn + + + + + + + + + hf_kernels_flash_attn3 + + + + + + + + + + +
+ +
+
+ +▶ code +▼ output + ▶ uv-logs + | +Cell: combine | 4.30s + | + +Raw +
+ +
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ Flash (PyTorch SDPA)          : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04
+✓ MemEff (PyTorch SDPA)         : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/a23b7ad9cfb9e9968ec4a8f126174dc4a3ab5e6999c65a44570f93656598bd2f
+✓ xFormers                      : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58
+✓ HF Kernels Flash Attn         : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849
+✓ HF Kernels Flash Attn3        : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20
+✓ SageAttention                 : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/f6be24aff45575cad8d1df490ac5fe9ec944103fb255665c71719ca2d7efea4e
+
+  ✓ Found Flash (PyTorch SDPA)
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04/attention.jsonl
+  ✓ Found MemEff (PyTorch SDPA)
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/a23b7ad9cfb9e9968ec4a8f126174dc4a3ab5e6999c65a44570f93656598bd2f/attention.jsonl
+  ✓ Found xFormers
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58/attention.jsonl
+  ✓ Found HF Kernels Flash Attn
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849/attention.jsonl
+  ✓ Found HF Kernels Flash Attn3
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20/attention.jsonl
+  ✓ Found SageAttention
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/f6be24aff45575cad8d1df490ac5fe9ec944103fb255665c71719ca2d7efea4e/attention.jsonl
+
+======================================================================
+Summary: 6 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_flash_attn    cuda_attn_L128_bfloat16     0.95  True
+hf_kernels_flash_attn    cuda_attn_L256_bfloat16     0.99  True
+hf_kernels_flash_attn    cuda_attn_L320_bfloat16     1.04  True
+hf_kernels_flash_attn    cuda_attn_L384_bfloat16     1.06  True
+hf_kernels_flash_attn    cuda_attn_L448_bfloat16     1.21  True
+hf_kernels_flash_attn    cuda_attn_L512_bfloat16     1.21  True
+hf_kernels_flash_attn3   cuda_attn_L128_bfloat16     0.93  True
+hf_kernels_flash_attn3   cuda_attn_L256_bfloat16     0.96  True
+hf_kernels_flash_attn3   cuda_attn_L320_bfloat16     1.01  True
+hf_kernels_flash_attn3   cuda_attn_L384_bfloat16     1.01  True
+hf_kernels_flash_attn3   cuda_attn_L448_bfloat16     1.18  True
+hf_kernels_flash_attn3   cuda_attn_L512_bfloat16     1.17  True
+sage_int8_fp16           cuda_attn_L128_bfloat16    FAIL  False
+  Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16           cuda_attn_L256_bfloat16    FAIL  False
+  Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16           cuda_attn_L320_bfloat16    FAIL  False
+  Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16           cuda_attn_L384_bfloat16    FAIL  False
+  Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16           cuda_attn_L448_bfloat16    FAIL  False
+  Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16           cuda_attn_L512_bfloat16    FAIL  False
+  Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+torch_flash_ma           cuda_attn_L128_bfloat16     1.21  True
+torch_flash_ma           cuda_attn_L256_bfloat16     1.27  True
+torch_flash_ma           cuda_attn_L320_bfloat16     1.29  True
+torch_flash_ma           cuda_attn_L384_bfloat16     1.32  True
+torch_flash_ma           cuda_attn_L448_bfloat16     1.47  True
+torch_flash_ma           cuda_attn_L512_bfloat16     1.49  True
+torch_mem_eff            cuda_attn_L128_bfloat16     1.86  True
+torch_mem_eff            cuda_attn_L256_bfloat16     1.99  True
+torch_mem_eff            cuda_attn_L320_bfloat16     2.02  True
+torch_mem_eff            cuda_attn_L384_bfloat16     2.04  True
+torch_mem_eff            cuda_attn_L448_bfloat16     2.06  True
+torch_mem_eff            cuda_attn_L512_bfloat16     2.22  True
+xformers_meff            cuda_attn_L128_bfloat16     0.98  True
+xformers_meff            cuda_attn_L256_bfloat16     1.02  True
+xformers_meff            cuda_attn_L320_bfloat16     1.07  True
+xformers_meff            cuda_attn_L384_bfloat16     1.08  True
+xformers_meff            cuda_attn_L448_bfloat16     1.24  True
+xformers_meff            cuda_attn_L512_bfloat16     1.23  True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 36 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 6
+
+Implementations included:
+  ✓ Flash (PyTorch SDPA)
+  ✓ MemEff (PyTorch SDPA)
+  ✓ xFormers
+  ✓ HF Kernels Flash Attn
+  ✓ HF Kernels Flash Attn3
+  ✓ SageAttention
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+latency.svg +
+ + + + + + + 2025-10-29T00:37:33.622731 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_attn_L128_bfloat16 + + + + + + + + + + + + + cuda_attn_L256_bfloat16 + + + + + + + + + + + + + cuda_attn_L320_bfloat16 + + + + + + + + + + + + + cuda_attn_L384_bfloat16 + + + + + + + + + + + + + cuda_attn_L448_bfloat16 + + + + + + + + + + + + + cuda_attn_L512_bfloat16 + + + + Workload + + + + + + + + + + + + + + + + + 1.0 + + + + + + + + + + + + + 1.2 + + + + + + + + + + + + + 1.4 + + + + + + + + + + + + + 1.6 + + + + + + + + + + + + + 1.8 + + + + + + + + + + + + + 2.0 + + + + + + + + + + + + + 2.2 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + torch_flash_ma + + + + + + + + + torch_mem_eff + + + + + + + + + xformers_meff + + + + + + + + + hf_kernels_flash_attn + + + + + + + + + hf_kernels_flash_attn3 + + + + + + + + + + +
+
+
+
+
+ + + \ No newline at end of file diff --git a/flash_attn/results/index.html b/flash_attn/results/index.html new file mode 100644 index 0000000000000000000000000000000000000000..b87b6002f4b781572dbb50f91850e50ee98130ab --- /dev/null +++ b/flash_attn/results/index.html @@ -0,0 +1,88 @@ + + + + + + Index of /flash_attn/results + + + +
+ ← back +
+

Index of /flash_attn/results

+ + + \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 0000000000000000000000000000000000000000..1061b4b3222caa3480fdd412bcf6f18bb97b54f9 --- /dev/null +++ b/index.html @@ -0,0 +1,4029 @@ + + + + + + index + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

All Benchmarks Aggregated Report

+

Layer Norm

+
+ + +
+ + + + + + + + + + + + + + + + + + +
ImplementationDescription
HF Kernels Layer NormHuggingFace kernels implementation
PyTorch Layer NormPyTorch native implementation
+

Rotary Position Embeddings

+
+ + +
+ + + + + + + + + + + + + + + + + + +
ImplementationDescription
HF Kernels RotaryHuggingFace kernels implementation
PyTorch RotaryPyTorch native implementation
+

Flash Attention

+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ImplementationDescription
Flash AttentionFlash Attention implementation
HF Kernels Flash AttentionHuggingFace kernels Flash Attention
HF Kernels Flash Attention 3HuggingFace kernels Flash Attention 3
Memory Efficient AttentionMemory efficient attention implementation
Sage AttentionSage attention implementation
xFormersxFormers attention implementation
+

Causal Conv1D

+
+ + +
+ + + + + + + + + + + + + + + + + + +
ImplementationDescription
HF Kernels Causal Conv1DHuggingFace kernels implementation
PyTorch Causal Conv1DPyTorch native implementation
+

Activation

+
+ + +
+ + + + + + + + + + + + + + + + + + +
ImplementationDescription
HF Kernels SwiGLUHuggingFace kernels SwiGLU implementation
PyTorch SwiGLUPyTorch native SwiGLU implementation
+

ReLU

+
+ + +
+ + + + + + + + + + + + + + + + + + +
ImplementationDescription
HF Kernels ReLUHuggingFace kernels ReLU implementation
PyTorch ReLUPyTorch native ReLU implementation
+
+ + + \ No newline at end of file diff --git a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..42766b88dac4a0e2fb70b1966497dfa03856e571 --- /dev/null +++ b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl @@ -0,0 +1,4 @@ +{"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8265980000032869, "p50": 0.8294890000115629, "p90": 0.8318879999933415, "mean": 0.8305783999958294, "iqr": 0.0024899999857552757, "raw_times": [0.8318879999933415, 0.8294890000115629, 0.8293980000075862, 0.8355189999633694, 0.8265980000032869], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8372490000283506, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6484859999650325, "p50": 1.6553460000068299, "p90": 1.6562569999791776, "mean": 1.654196599986335, "iqr": 0.004349999983332964, "raw_times": [1.6589869999847906, 1.6484859999650325, 1.6553460000068299, 1.6519069999958447, 1.6562569999791776], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6548570000054497, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6374860000496483, "p50": 1.6479959999742277, "p90": 1.650296000036633, "mean": 1.6462442000261035, "iqr": 0.007159000006140559, "raw_times": [1.6479959999742277, 1.6374860000496483, 1.6523060000395162, 1.6431370000304923, 1.650296000036633], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.658577000000605, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null} +{"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.2406110000238186, "p50": 3.2579909999981282, "p90": 3.259831999969265, "mean": 3.2558895999954984, "iqr": 0.00626999997166422, "raw_times": [3.259831999969265, 3.2579909999981282, 3.2674519999886797, 3.2535619999976007, 3.2406110000238186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2579709999822626, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null} diff --git a/layer_norm/impls/cells/benchmark.py b/layer_norm/impls/cells/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..d871d1b25fedf8b294c567e9ac582decb62f3cde --- /dev/null +++ b/layer_norm/impls/cells/benchmark.py @@ -0,0 +1,49 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels", +# "kernels-benchmark-tools", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +import torch +import sys +from kernels_benchmark_tools import KernelTypeEnum, run_benchmark +from kernels import get_kernel + +# Load the layer norm kernel +layer_norm_kernel = get_kernel("kernels-community/layer-norm") + + +def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5): + B, S, D = x.shape + # The kernel expects [N, D] input; support beta (bias) if provided. + out = layer_norm_kernel.dropout_add_ln_fwd( + input=x.view(-1, D), + gamma=weight, + beta=bias, + rowscale=None, + colscale=None, + x0_subset=None, + z_subset=None, + dropout_p=0.0, + epsilon=eps, + rowscale_const=1.0, + z_numrows=S, + gen=None, + residual_in_fp32=False, + is_rms_norm=False, + )[0].view(B, S, D) + return out + + +run_benchmark( + kernel_type=KernelTypeEnum.LAYER_NORM, + impl_name="hf_kernels_layer_norm", + impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, + impl_func=hf_kernels_layer_norm, +) \ No newline at end of file diff --git a/layer_norm/impls/cells/nv.py b/layer_norm/impls/cells/nv.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5 --- /dev/null +++ b/layer_norm/impls/cells/nv.py @@ -0,0 +1,2 @@ +import subprocess +print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout) \ No newline at end of file diff --git a/layer_norm/impls/hf_kernels_layer_norm.html b/layer_norm/impls/hf_kernels_layer_norm.html new file mode 100644 index 0000000000000000000000000000000000000000..b0e606786c9541a6e20ae9d4a9aef137dc63aaa8 --- /dev/null +++ b/layer_norm/impls/hf_kernels_layer_norm.html @@ -0,0 +1,4052 @@ + + + + + + hf_kernels_layer_norm + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

HF Kernels LayerNorm Implementation

+

Based on kernels-community layer-norm kernel.

+

LayerNorm Benchmark (HF Kernels)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 6.10s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the layer norm kernel
+layer_norm_kernel = get_kernel("kernels-community/layer-norm")
+
+
+def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
+    B, S, D = x.shape
+    # The kernel expects [N, D] input; support beta (bias) if provided.
+    out = layer_norm_kernel.dropout_add_ln_fwd(
+        input=x.view(-1, D),
+        gamma=weight,
+        beta=bias,
+        rowscale=None,
+        colscale=None,
+        x0_subset=None,
+        z_subset=None,
+        dropout_p=0.0,
+        epsilon=eps,
+        rowscale_const=1.0,
+        z_numrows=S,
+        gen=None,
+        residual_in_fp32=False,
+        is_rms_norm=False,
+    )[0].view(B, S, D)
+    return out
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.LAYER_NORM,
+    impl_name="hf_kernels_layer_norm",
+    impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
+    impl_func=hf_kernels_layer_norm,
+)
+
+ +
+
+
+
+
Running layer_norm benchmark on cuda with 4 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                  hf_kernels_layer_norm         4.63%     185.406us        46.16%       1.847ms       1.847ms       0.000us         0.00%       3.120ms       3.120ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         1.69%      67.562us        40.98%       1.640ms     546.562us       2.384ms       100.00%       3.120ms       1.040ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.385ms       100.06%       2.385ms       2.385ms             1  
+void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       2.384ms       100.00%       2.384ms     794.642us             3  
+                                Activity Buffer Request        36.92%       1.477ms        36.92%       1.477ms       1.477ms     735.676us        30.86%     735.676us     735.676us             1  
+                                             aten::view         0.54%      21.751us         0.54%      21.751us       3.625us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         1.11%      44.581us         1.11%      44.581us       4.953us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.23%       9.360us         0.23%       9.360us       3.120us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         1.03%      41.042us         1.03%      41.042us      13.681us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        53.84%       2.154ms        53.84%       2.154ms       2.154ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 4.001ms
+Self CUDA time total: 2.384ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                  hf_kernels_layer_norm         2.29%     145.447us        26.95%       1.711ms       1.711ms       0.000us         0.00%       6.386ms       6.386ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         0.75%      47.652us        24.47%       1.553ms     517.784us       4.812ms       100.00%       6.386ms       2.129ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.814ms       100.03%       4.814ms       4.814ms             1  
+void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       4.812ms       100.00%       4.812ms       1.604ms             3  
+                                Activity Buffer Request        22.77%       1.446ms        22.77%       1.446ms       1.446ms       1.574ms        32.71%       1.574ms       1.574ms             1  
+                                             aten::view         0.19%      11.759us         0.19%      11.759us       1.960us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         0.46%      29.151us         0.46%      29.151us       3.239us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.08%       4.860us         0.08%       4.860us       1.620us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.41%      26.131us         0.41%      26.131us       8.710us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        73.05%       4.638ms        73.05%       4.638ms       4.638ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 6.348ms
+Self CUDA time total: 4.812ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                  hf_kernels_layer_norm         2.00%     126.827us        27.00%       1.712ms       1.712ms       0.000us         0.00%       6.353ms       6.353ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         0.76%      48.491us        24.80%       1.572ms     524.088us       4.792ms       100.00%       6.353ms       2.118ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.793ms       100.03%       4.793ms       4.793ms             1  
+void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       4.792ms       100.00%       4.792ms       1.597ms             3  
+                                Activity Buffer Request        23.05%       1.462ms        23.05%       1.462ms       1.462ms       1.561ms        32.58%       1.561ms       1.561ms             1  
+                                             aten::view         0.20%      12.869us         0.20%      12.869us       2.145us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         0.48%      30.222us         0.48%      30.222us       3.358us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.08%       5.090us         0.08%       5.090us       1.697us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         0.42%      26.901us         0.42%      26.901us       8.967us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        73.00%       4.628ms        73.00%       4.628ms       4.628ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 6.340ms
+Self CUDA time total: 4.792ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                  hf_kernels_layer_norm         1.24%     144.853us        19.15%       2.240ms       2.240ms       0.000us         0.00%      12.815ms      12.815ms             1  
+                _layer_norm_f8ec252::dropout_add_ln_fwd         0.39%      45.741us        17.80%       2.083ms     694.211us       9.628ms       100.00%      12.815ms       4.272ms             3  
+                                  hf_kernels_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.629ms       100.01%       9.629ms       9.629ms             1  
+void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr...         0.00%       0.000us         0.00%       0.000us       0.000us       9.628ms       100.00%       9.628ms       3.209ms             3  
+                                Activity Buffer Request        14.62%       1.710ms        14.62%       1.710ms       1.710ms       3.188ms        33.11%       3.188ms       3.188ms             1  
+                                             aten::view         0.11%      12.972us         0.11%      12.972us       2.162us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::empty         0.26%      30.501us         0.26%      30.501us       3.389us       0.000us         0.00%       0.000us       0.000us             9  
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla...         0.04%       5.220us         0.04%       5.220us       1.740us       0.000us         0.00%       0.000us       0.000us             3  
+                                       cudaLaunchKernel         2.49%     291.291us         2.49%     291.291us      97.097us       0.000us         0.00%       0.000us       0.000us             3  
+                                  cudaDeviceSynchronize        80.85%       9.456ms        80.85%       9.456ms       9.456ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 11.697ms
+Self CUDA time total: 9.628ms
+
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_layer_norm    LN_B16_S2048_D4096     0.83  True
+hf_kernels_layer_norm    LN_B16_S2048_D8192     1.66  True
+hf_kernels_layer_norm    LN_B16_S4096_D4096     1.65  True
+hf_kernels_layer_norm    LN_B16_S4096_D8192     3.26  True
+
+
+
▶ UV Install Logs
+ +
+
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] +Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 6.81it/s] +Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.12it/s] +Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.56it/s]
+
+

Artifacts:

+layer_norm.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/layer_norm/impls/index.html b/layer_norm/impls/index.html new file mode 100644 index 0000000000000000000000000000000000000000..51ba6dd6789d67e2ffa1e3f02dea720dbda17216 --- /dev/null +++ b/layer_norm/impls/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /layer_norm/impls + + + +
+ ← back +
+

Index of /layer_norm/impls

+ + + \ No newline at end of file diff --git a/layer_norm/impls/torch_layer_norm.html b/layer_norm/impls/torch_layer_norm.html new file mode 100644 index 0000000000000000000000000000000000000000..b27efb4b5a46f85ef083153354b6c1b716511ffb --- /dev/null +++ b/layer_norm/impls/torch_layer_norm.html @@ -0,0 +1,4073 @@ + + + + + + torch_layer_norm + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

Torch LayerNorm Implementation

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.23s + | + +Raw +GitHub +
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Wed Oct 29 00:36:39 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   33C    P0            128W /  350W |       0MiB /  46068MiB |    100%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

LayerNorm Benchmark (PyTorch)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 7.38s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
+    return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.LAYER_NORM,
+    impl_name="torch_layer_norm",
+    impl_tags={"family": "torch", "op": "layer_norm"},
+    impl_func=torch_layer_norm,
+)
+
+ +
+
+
+
+
Running layer_norm benchmark on cuda with 4 workloads.
+
+======================================================================
+PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                       torch_layer_norm         3.94%     153.226us        45.99%       1.787ms       1.787ms       0.000us         0.00%       3.036ms       3.036ms             1  
+                                       aten::layer_norm         0.41%      15.819us        42.05%       1.634ms     544.665us       0.000us         0.00%       3.036ms       1.012ms             3  
+                                aten::native_layer_norm         2.10%      81.554us        41.64%       1.618ms     539.392us       2.323ms       100.00%       3.036ms       1.012ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       2.325ms       100.06%       2.325ms       2.325ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       2.323ms       100.00%       2.323ms     774.498us             3  
+                                Activity Buffer Request        36.88%       1.433ms        36.88%       1.433ms       1.433ms     712.322us        30.66%     712.322us     712.322us             1  
+                                            aten::empty         1.28%      49.611us         1.28%      49.611us       5.512us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         1.19%      46.322us         1.19%      46.322us      15.441us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.19%       7.380us         0.19%       7.380us       1.230us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        54.01%       2.099ms        54.01%       2.099ms       2.099ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 3.886ms
+Self CUDA time total: 2.323ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                       torch_layer_norm         1.13%      72.543us        25.40%       1.627ms       1.627ms       0.000us         0.00%       6.533ms       6.533ms             1  
+                                       aten::layer_norm         0.14%       8.900us        24.27%       1.554ms     518.074us       0.000us         0.00%       6.533ms       2.178ms             3  
+                                aten::native_layer_norm         0.84%      53.651us        24.13%       1.545ms     515.108us       4.915ms       100.00%       6.533ms       2.178ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.917ms       100.03%       4.917ms       4.917ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.915ms       100.00%       4.915ms       1.638ms             3  
+                                Activity Buffer Request        22.32%       1.430ms        22.32%       1.430ms       1.430ms       1.618ms        32.92%       1.618ms       1.618ms             1  
+                                            aten::empty         0.44%      28.460us         0.44%      28.460us       3.162us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         0.46%      29.343us         0.46%      29.343us       9.781us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.07%       4.330us         0.07%       4.330us       0.722us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        74.60%       4.777ms        74.60%       4.777ms       4.777ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 6.403ms
+Self CUDA time total: 4.915ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                       torch_layer_norm         1.16%      72.353us        26.06%       1.624ms       1.624ms       0.000us         0.00%       6.259ms       6.259ms             1  
+                                       aten::layer_norm         0.14%       8.650us        24.90%       1.551ms     517.051us       0.000us         0.00%       6.259ms       2.086ms             3  
+                                aten::native_layer_norm         0.85%      52.692us        24.76%       1.543ms     514.168us       4.742ms       100.00%       6.259ms       2.086ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       4.744ms       100.03%       4.744ms       4.744ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       4.742ms       100.00%       4.742ms       1.581ms             3  
+                                Activity Buffer Request        22.91%       1.427ms        22.91%       1.427ms       1.427ms       1.517ms        31.99%       1.517ms       1.517ms             1  
+                                            aten::empty         0.47%      29.452us         0.47%      29.452us       3.272us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         0.47%      29.331us         0.47%      29.331us       9.777us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.06%       4.009us         0.06%       4.009us       0.668us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        73.94%       4.606ms        73.94%       4.606ms       4.606ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 6.229ms
+Self CUDA time total: 4.742ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                       torch_layer_norm         0.67%      74.863us        13.13%       1.463ms       1.463ms       0.000us         0.00%      13.036ms      13.036ms             1  
+                                       aten::layer_norm         0.09%       9.640us        12.46%       1.388ms     462.622us       0.000us         0.00%      13.036ms       4.345ms             3  
+                                aten::native_layer_norm         0.46%      51.640us        12.37%       1.378ms     459.409us       9.812ms       100.00%      13.036ms       4.345ms             3  
+                                       torch_layer_norm         0.00%       0.000us         0.00%       0.000us       0.000us       9.814ms       100.01%       9.814ms       9.814ms             1  
+void at::native::(anonymous namespace)::vectorized_l...         0.00%       0.000us         0.00%       0.000us       0.000us       9.812ms       100.00%       9.812ms       3.271ms             3  
+                                Activity Buffer Request         9.60%       1.069ms         9.60%       1.069ms       1.069ms       3.224ms        32.85%       3.224ms       3.224ms             1  
+                                            aten::empty         0.26%      29.363us         0.26%      29.363us       3.263us       0.000us         0.00%       0.000us       0.000us             9  
+                                       cudaLaunchKernel         2.01%     223.547us         2.01%     223.547us      74.516us       0.000us         0.00%       0.000us       0.000us             3  
+                                             aten::view         0.04%       4.180us         0.04%       4.180us       0.697us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        86.87%       9.675ms        86.87%       9.675ms       9.675ms       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 11.138ms
+Self CUDA time total: 9.812ms
+
+
+impl                     wl                  p50(ms)  ok
+torch_layer_norm         LN_B16_S2048_D4096     0.82  True
+torch_layer_norm         LN_B16_S2048_D8192     1.68  True
+torch_layer_norm         LN_B16_S4096_D4096     1.61  True
+torch_layer_norm         LN_B16_S4096_D8192     3.32  True
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+layer_norm.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/layer_norm/index.html b/layer_norm/index.html new file mode 100644 index 0000000000000000000000000000000000000000..12f60968be235270e079aa5c48545ec9a928579b --- /dev/null +++ b/layer_norm/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /layer_norm + + + +
+ ← back +
+

Index of /layer_norm

+ + + \ No newline at end of file diff --git a/layer_norm/results/artifacts/combine/latency.svg b/layer_norm/results/artifacts/combine/latency.svg new file mode 100644 index 0000000000000000000000000000000000000000..2b2c749f1fb0d4e53be110d2207865dbdced18be --- /dev/null +++ b/layer_norm/results/artifacts/combine/latency.svg @@ -0,0 +1,230 @@ + + + + + + + 2025-10-29T00:37:29.280510 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + LN_B16_S2048_D4096 + + + + + + + + + + + + + LN_B16_S2048_D8192 + + + + + + + + + + + + + LN_B16_S4096_D4096 + + + + + + + + + + + + + LN_B16_S4096_D8192 + + + + Workload + + + + + + + + + + + + + + + + + 1.0 + + + + + + + + + + + + + 1.5 + + + + + + + + + + + + + 2.0 + + + + + + + + + + + + + 2.5 + + + + + + + + + + + + + 3.0 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + torch_layer_norm + + + + + + + + + hf_kernels_layer_norm + + + + + + + + + + \ No newline at end of file diff --git a/layer_norm/results/cells/combine.py b/layer_norm/results/cells/combine.py new file mode 100644 index 0000000000000000000000000000000000000000..a6dbd0a965ba54848e36671a564ac6122b6790b8 --- /dev/null +++ b/layer_norm/results/cells/combine.py @@ -0,0 +1,26 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# "matplotlib", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +from kernels_benchmark_tools.core.visuals import generate_combined_results + +# Map display names to uvnote environment variables +cache_env_map = { + "PyTorch LayerNorm": "UVNOTE_FILE_TORCH_LAYER_NORM_BENCHMARK", + "HF Kernels LayerNorm": "UVNOTE_FILE_HF_KERNELS_LAYER_NORM_BENCHMARK", +} + +# Generate combined results with visualization +generate_combined_results( + cache_env_map=cache_env_map, + output_filename="layer_norm.jsonl", + svg_filename="latency.svg" +) \ No newline at end of file diff --git a/layer_norm/results/combined_results.html b/layer_norm/results/combined_results.html new file mode 100644 index 0000000000000000000000000000000000000000..a979f564165585cc88a008eb1fbc5bfd5aa6bef5 --- /dev/null +++ b/layer_norm/results/combined_results.html @@ -0,0 +1,4466 @@ + + + + + + LayerNorm Benchmark - Combined Results + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

LayerNorm Benchmarks - Aggregated Results

+

This document combines benchmark results from multiple LayerNorm implementations.

+

Combined Summary and Visualization

+
+ + + + + + + 2025-10-29T00:37:29.280510 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + LN_B16_S2048_D4096 + + + + + + + + + + + + + LN_B16_S2048_D8192 + + + + + + + + + + + + + LN_B16_S4096_D4096 + + + + + + + + + + + + + LN_B16_S4096_D8192 + + + + Workload + + + + + + + + + + + + + + + + + 1.0 + + + + + + + + + + + + + 1.5 + + + + + + + + + + + + + 2.0 + + + + + + + + + + + + + 2.5 + + + + + + + + + + + + + 3.0 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + torch_layer_norm + + + + + + + + + hf_kernels_layer_norm + + + + + + + + + + +
+ +
+
+ +▶ code +▼ output + ▶ uv-logs + | +Cell: combine | 4.26s + | + +Raw +
+ +
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ PyTorch LayerNorm             : /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/4403c31e9bef6e648597b4fcc9cfdc402678aaa4f90636b74325f12d334214a3
+✓ HF Kernels LayerNorm          : /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/bd278151199f29b397d85857b87922edaa39a62623fb28e0465de47d6a3bac74
+
+  ✓ Found PyTorch LayerNorm
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/4403c31e9bef6e648597b4fcc9cfdc402678aaa4f90636b74325f12d334214a3/layer_norm.jsonl
+  ✓ Found HF Kernels LayerNorm
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/bd278151199f29b397d85857b87922edaa39a62623fb28e0465de47d6a3bac74/layer_norm.jsonl
+
+======================================================================
+Summary: 2 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_layer_norm    LN_B16_S2048_D4096     0.83  True
+hf_kernels_layer_norm    LN_B16_S2048_D8192     1.66  True
+hf_kernels_layer_norm    LN_B16_S4096_D4096     1.65  True
+hf_kernels_layer_norm    LN_B16_S4096_D8192     3.26  True
+torch_layer_norm         LN_B16_S2048_D4096     0.82  True
+torch_layer_norm         LN_B16_S2048_D8192     1.68  True
+torch_layer_norm         LN_B16_S4096_D4096     1.61  True
+torch_layer_norm         LN_B16_S4096_D8192     3.32  True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 8 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 2
+
+Implementations included:
+  ✓ PyTorch LayerNorm
+  ✓ HF Kernels LayerNorm
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+latency.svg +
+ + + + + + + 2025-10-29T00:37:29.280510 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + LN_B16_S2048_D4096 + + + + + + + + + + + + + LN_B16_S2048_D8192 + + + + + + + + + + + + + LN_B16_S4096_D4096 + + + + + + + + + + + + + LN_B16_S4096_D8192 + + + + Workload + + + + + + + + + + + + + + + + + 1.0 + + + + + + + + + + + + + 1.5 + + + + + + + + + + + + + 2.0 + + + + + + + + + + + + + 2.5 + + + + + + + + + + + + + 3.0 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + torch_layer_norm + + + + + + + + + hf_kernels_layer_norm + + + + + + + + + + +
+
+
+
+
+ + + \ No newline at end of file diff --git a/layer_norm/results/index.html b/layer_norm/results/index.html new file mode 100644 index 0000000000000000000000000000000000000000..5b6bcefdc3dcaa949d66002abc2672c3de221470 --- /dev/null +++ b/layer_norm/results/index.html @@ -0,0 +1,88 @@ + + + + + + Index of /layer_norm/results + + + +
+ ← back +
+

Index of /layer_norm/results

+ + + \ No newline at end of file diff --git a/rotary/impls/artifacts/benchmark/rotary.jsonl b/rotary/impls/artifacts/benchmark/rotary.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3a21dbdf17991e5611014fcf9c02138c37ed1901 --- /dev/null +++ b/rotary/impls/artifacts/benchmark/rotary.jsonl @@ -0,0 +1,24 @@ +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07538300002352116, "p50": 0.07777199999736695, "p90": 0.07795200002647107, "mean": 0.07717860000866494, "iqr": 0.0014790000477660215, "raw_times": [0.07777199999736695, 0.07647299997870505, 0.07795200002647107, 0.07831300001726049, 0.07538300002352116], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0837029999729566, "peak_bytes": 1720320, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.00153350830078125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09504299998752685, "p50": 0.09633299998768052, "p90": 0.09746300003143915, "mean": 0.0966769999877215, "iqr": 0.0013000000649299182, "raw_times": [0.09504299998752685, 0.09633299998768052, 0.09838299996545175, 0.09616299996650923, 0.09746300003143915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09918300003164404, "peak_bytes": 3440640, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.00154876708984375, "mse_q": 1.5854835510253906e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0929430000269349, "p50": 0.09560399996644264, "p90": 0.09620299999824056, "mean": 0.09600920000139013, "iqr": 0.0026899999738816405, "raw_times": [0.09620299999824056, 0.09560399996644264, 0.10178299999097362, 0.09351300002435892, 0.0929430000269349], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10062299998025992, "peak_bytes": 6832128, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09350300001642609, "p50": 0.09415400000989393, "p90": 0.09585299994796515, "mean": 0.09842139999136634, "iqr": 0.001959999963219161, "raw_times": [0.09350300001642609, 0.09585299994796515, 0.09415400000989393, 0.11470399999780057, 0.09389299998474598], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09742299999970783, "peak_bytes": 13664256, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.0015411376953125, "mse_q": 1.5854835510253906e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09248300000308518, "p50": 0.09347299999262759, "p90": 0.09500300001263895, "mean": 0.09405499998820233, "iqr": 0.0018000000636675395, "raw_times": [0.09248300000308518, 0.09500300001263895, 0.0961129999836885, 0.09347299999262759, 0.09320299994897141], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09855400003289105, "peak_bytes": 6881280, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09233299999777955, "p50": 0.09477300000071409, "p90": 0.09477400004698211, "mean": 0.09424540002100912, "iqr": 0.0021910000214120373, "raw_times": [0.09233299999777955, 0.09477400004698211, 0.09477300000071409, 0.09676400003399976, 0.09258300002557007], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09677399998508918, "peak_bytes": 13762560, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09216400002287628, "p50": 0.09306300000844203, "p90": 0.09349300000849325, "mean": 0.09324520001428027, "iqr": 0.0005399999736255268, "raw_times": [0.09216400002287628, 0.09306300000844203, 0.09455299999672206, 0.09349300000849325, 0.09295300003486773], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10914400002093316, "peak_bytes": 27328512, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.00153350830078125, "mse_q": 1.5854835510253906e-05, "mse_k": 1.5854835510253906e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09248299994624176, "p50": 0.09334300000318763, "p90": 0.09355399998867142, "mean": 0.0935691999870869, "iqr": 0.00066100000140068, "raw_times": [0.09355399998867142, 0.09557300001006297, 0.09334300000318763, 0.09248299994624176, 0.09289299998727074], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09584299999687573, "peak_bytes": 54657024, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.00154876708984375, "mse_q": 1.621246337890625e-05, "mse_k": 1.621246337890625e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09247299999515235, "p50": 0.09385300000985808, "p90": 0.09445400002050519, "mean": 0.09405140001490508, "iqr": 0.001121000025250396, "raw_times": [0.09247299999515235, 0.09445400002050519, 0.0933329999952548, 0.09385300000985808, 0.09614400005375501], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09844400000247333, "peak_bytes": 27525120, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09372300002041811, "p50": 0.094173000036335, "p90": 0.09575299998232367, "mean": 0.09506720000445057, "iqr": 0.0020299999619055598, "raw_times": [0.09796399996275795, 0.09575299998232367, 0.094173000036335, 0.09372300002041811, 0.09372300002041811], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09865399999853253, "peak_bytes": 55050240, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09140299999899071, "p50": 0.092913999992561, "p90": 0.09422299996231231, "mean": 0.09330119999049202, "iqr": 0.0015199999552351073, "raw_times": [0.09140299999899071, 0.09526299999151888, 0.092913999992561, 0.09422299996231231, 0.09270300000707721], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09514300001001175, "peak_bytes": 109314048, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09479400000600435, "p50": 0.09623299996519563, "p90": 0.09679300001153024, "mean": 0.09610519999796452, "iqr": 0.000919999990856013, "raw_times": [0.09587300002067423, 0.09679300001153024, 0.09479400000600435, 0.09623299996519563, 0.09683299998641814], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09740300004068558, "peak_bytes": 218628096, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09216300003345168, "p50": 0.09397300004820863, "p90": 0.09462299999540846, "mean": 0.09381320001011773, "iqr": 0.0016889999869817984, "raw_times": [0.09293400000842666, 0.09537299996509319, 0.09397300004820863, 0.09216300003345168, 0.09462299999540846], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10023300001194002, "peak_bytes": 68698112, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0913630000241028, "p50": 0.0930929999753971, "p90": 0.09448299999803567, "mean": 0.09361499999158696, "iqr": 0.0023500000452258973, "raw_times": [0.0913630000241028, 0.09700300000758943, 0.09213299995280977, 0.09448299999803567, 0.0930929999753971], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09703300003138793, "peak_bytes": 6848512, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.5974044799804688e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0902330000371876, "p50": 0.09208300002683245, "p90": 0.0927039999965018, "mean": 0.0920254000220666, "iqr": 0.0007599999776175537, "raw_times": [0.0902330000371876, 0.09194400001888425, 0.09208300002683245, 0.09316300003092692, 0.0927039999965018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09501400000999638, "peak_bytes": 13647872, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09339300004285178, "p50": 0.09388300003365657, "p90": 0.09438299997555077, "mean": 0.09392300001991316, "iqr": 0.0009499999578110874, "raw_times": [0.09388300003365657, 0.09452300002976699, 0.09438299997555077, 0.09339300004285178, 0.09343300001773969], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09746399996402033, "peak_bytes": 27295744, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.0015411376953125, "mse_q": 1.621246337890625e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09369299999661962, "p50": 0.09495300002981821, "p90": 0.09641299999429975, "mean": 0.09557120000636132, "iqr": 0.001839999981712026, "raw_times": [0.09457300001258773, 0.09495300002981821, 0.09641299999429975, 0.09369299999661962, 0.0982239999984813], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09584299999687573, "peak_bytes": 13697024, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09207300001889962, "p50": 0.09441299999934927, "p90": 0.09493300001395255, "mean": 0.09826719999637135, "iqr": 0.0009000000318337698, "raw_times": [0.09207300001889962, 0.11588399996753651, 0.09441299999934927, 0.09493300001395255, 0.09403299998211878], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09803300002886317, "peak_bytes": 27394048, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.00153350830078125, "mse_q": 1.5854835510253906e-05, "mse_k": 1.5974044799804688e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09320300000581483, "p50": 0.09509299997034759, "p90": 0.0968430000511944, "mean": 0.0957752000090295, "iqr": 0.0027100000465907215, "raw_times": [0.0968430000511944, 0.09413300000460367, 0.09509299997034759, 0.09960400001318703, 0.09320300000581483], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09855399997604763, "peak_bytes": 54591488, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0926630000321893, "p50": 0.09438299997555077, "p90": 0.09443299995837151, "mean": 0.09837319998950989, "iqr": 0.0016799999684735667, "raw_times": [0.09275299998989794, 0.09438299997555077, 0.09443299995837151, 0.0926630000321893, 0.1176339999915399], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09754300003805838, "peak_bytes": 109182976, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09100299996589456, "p50": 0.09359299997413473, "p90": 0.09518299998489965, "mean": 0.09356119999210932, "iqr": 0.0025699999355310865, "raw_times": [0.09100299996589456, 0.09518299998489965, 0.09261300004936857, 0.09541399998624911, 0.09359299997413473], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.11789399997041983, "peak_bytes": 54788096, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.125, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09348399998998502, "p50": 0.09433299999273004, "p90": 0.09580299996514441, "mean": 0.09473540000044522, "iqr": 0.0016299999288094114, "raw_times": [0.09433299999273004, 0.09580299996514441, 0.09588400001803166, 0.09348399998998502, 0.094173000036335], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09657300000753821, "peak_bytes": 109576192, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0974529999666629, "p50": 0.09860399995886837, "p90": 0.09875400002101742, "mean": 0.09851759998582565, "iqr": 0.0008510000384376326, "raw_times": [0.09790299998257979, 0.0974529999666629, 0.09860399995886837, 0.09875400002101742, 0.0998739999999998], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10540400000991212, "peak_bytes": 218365952, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} +{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2809499999898435, "p50": 0.28135000002293964, "p90": 0.2840199999809556, "mean": 0.28239179999900443, "iqr": 0.0029809999659846653, "raw_times": [0.2809499999898435, 0.28459999998631247, 0.2840199999809556, 0.28103900001497095, 0.28135000002293964], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.28416999998626125, "peak_bytes": 436731904, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.125, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null} diff --git a/rotary/impls/cells/benchmark.py b/rotary/impls/cells/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..0e8216119f01d4dff50b7e1571fac564b8d33892 --- /dev/null +++ b/rotary/impls/cells/benchmark.py @@ -0,0 +1,47 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# "kernels", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +import torch +import sys +from kernels_benchmark_tools import KernelTypeEnum, run_benchmark +from kernels import get_kernel + +# Load the rotary kernel +rotary = get_kernel("kernels-community/rotary") + + +def hf_kernels_rotary(query, key, cos, sin, conj=False): + rotary_dim = cos.shape[-1] + + # Clone to avoid modifying inputs + q_out = query.clone() + k_out = key.clone() + + # Apply rotation to query + q1 = q_out[..., :rotary_dim] + q2 = q_out[..., rotary_dim : 2 * rotary_dim] + rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj) + + # Apply rotation to key + k1 = k_out[..., :rotary_dim] + k2 = k_out[..., rotary_dim : 2 * rotary_dim] + rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj) + + return q_out, k_out + + +run_benchmark( + kernel_type=KernelTypeEnum.ROTARY, + impl_name="hf_kernels_rotary", + impl_tags={"family": "hf-kernels", "backend": "cuda"}, + impl_func=hf_kernels_rotary, +) \ No newline at end of file diff --git a/rotary/impls/cells/nv.py b/rotary/impls/cells/nv.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5 --- /dev/null +++ b/rotary/impls/cells/nv.py @@ -0,0 +1,2 @@ +import subprocess +print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout) \ No newline at end of file diff --git a/rotary/impls/hf_kernels_rotary.html b/rotary/impls/hf_kernels_rotary.html new file mode 100644 index 0000000000000000000000000000000000000000..e458bcc5786ea404df91ec4027149e3ec0c0a5aa --- /dev/null +++ b/rotary/impls/hf_kernels_rotary.html @@ -0,0 +1,4653 @@ + + + + + + hf_kernels_rotary + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

HF Kernels - Rotary Position Embeddings

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.23s + | + +Raw +GitHub +
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Wed Oct 29 00:36:23 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   31C    P0             86W /  350W |       0MiB /  46068MiB |     22%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

Rotary Embeddings Benchmark

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 4.48s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+#     "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the rotary kernel
+rotary = get_kernel("kernels-community/rotary")
+
+
+def hf_kernels_rotary(query, key, cos, sin, conj=False):
+    rotary_dim = cos.shape[-1]
+
+    # Clone to avoid modifying inputs
+    q_out = query.clone()
+    k_out = key.clone()
+
+    # Apply rotation to query
+    q1 = q_out[..., :rotary_dim]
+    q2 = q_out[..., rotary_dim : 2 * rotary_dim]
+    rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj)
+
+    # Apply rotation to key
+    k1 = k_out[..., :rotary_dim]
+    k2 = k_out[..., rotary_dim : 2 * rotary_dim]
+    rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj)
+
+    return q_out, k_out
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.ROTARY,
+    impl_name="hf_kernels_rotary",
+    impl_tags={"family": "hf-kernels", "backend": "cuda"},
+    impl_func=hf_kernels_rotary,
+)
+
+ +
+
+
+
+
Running rotary benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     417.663us      1766.17%     417.663us     417.663us             1  
+                                      hf_kernels_rotary        11.92%     243.797us        99.67%       2.039ms       2.039ms       0.000us         0.00%      24.864us      24.864us             1  
+                          _rotary_dba7d1e::apply_rotary         2.64%      54.054us         5.06%     103.576us      17.263us      16.992us        71.85%      16.992us       2.832us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us        71.85%      16.992us       2.832us             6  
+                                            aten::clone         2.02%      41.272us        79.82%       1.633ms     272.116us       0.000us         0.00%       7.872us       1.312us             6  
+                                            aten::copy_         1.82%      37.200us        74.94%       1.533ms     255.467us       6.656us        28.15%       7.872us       1.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.656us        28.15%       6.656us       1.109us             6  
+                                Activity Buffer Request        69.47%       1.421ms        69.47%       1.421ms       1.421ms       1.216us         5.14%       1.216us       1.216us             1  
+                                    aten::empty_strided         2.87%      58.622us         2.87%      58.622us       9.770us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         3.65%      74.674us         3.65%      74.674us      12.446us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.25%      46.121us         2.87%      58.631us       4.886us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.61%      12.510us         0.61%      12.510us       1.042us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.42%      49.522us         2.42%      49.522us       8.254us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.33%       6.691us         0.33%       6.691us       6.691us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.045ms
+Self CUDA time total: 23.648us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     362.684us      1535.76%     362.684us     362.684us             1  
+                                      hf_kernels_rotary         9.63%     184.044us        99.76%       1.906ms       1.906ms       0.000us         0.00%      24.736us      24.736us             1  
+                          _rotary_dba7d1e::apply_rotary         2.64%      50.383us         5.03%      96.065us      16.011us      16.864us        71.41%      16.864us       2.811us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.864us        71.41%      16.864us       2.811us             6  
+                                            aten::clone         1.50%      28.618us        82.74%       1.581ms     263.486us       0.000us         0.00%       7.872us       1.312us             6  
+                                            aten::copy_         1.95%      37.192us        79.54%       1.520ms     253.297us       6.752us        28.59%       7.872us       1.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.752us        28.59%       6.752us       1.125us             6  
+                                Activity Buffer Request        74.55%       1.424ms        74.55%       1.424ms       1.424ms       1.120us         4.74%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.70%      32.513us         1.70%      32.513us       5.419us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         3.05%      58.263us         3.05%      58.263us       9.710us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.86%      35.461us         2.36%      45.051us       3.754us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.50%       9.590us         0.50%       9.590us       0.799us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.39%      45.682us         2.39%      45.682us       7.614us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.600us         0.24%       4.600us       4.600us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.911ms
+Self CUDA time total: 23.616us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     352.889us      1406.66%     352.889us     352.889us             1  
+                                      hf_kernels_rotary         9.52%     180.074us        99.73%       1.887ms       1.887ms       0.000us         0.00%      26.399us      26.399us             1  
+                          _rotary_dba7d1e::apply_rotary         2.26%      42.841us         4.55%      86.004us      14.334us      17.248us        68.75%      17.248us       2.875us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.248us        68.75%      17.248us       2.875us             6  
+                                            aten::clone         1.50%      28.330us        83.30%       1.576ms     262.706us       0.000us         0.00%       9.151us       1.525us             6  
+                                            aten::copy_         1.91%      36.070us        80.06%       1.515ms     252.487us       7.839us        31.25%       9.151us       1.525us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.839us        31.25%       7.839us       1.307us             6  
+                                Activity Buffer Request        75.19%       1.423ms        75.19%       1.423ms       1.423ms       1.312us         5.23%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.74%      32.981us         1.74%      32.981us       5.497us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.97%      56.174us         2.97%      56.174us       9.362us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.86%      35.224us         2.36%      44.742us       3.729us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.50%       9.518us         0.50%       9.518us       0.793us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.28%      43.163us         2.28%      43.163us       7.194us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       5.081us         0.27%       5.081us       5.081us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.892ms
+Self CUDA time total: 25.087us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     353.892us      1375.46%     353.892us     353.892us             1  
+                                      hf_kernels_rotary         8.61%     178.135us        99.77%       2.063ms       2.063ms       0.000us         0.00%      27.041us      27.041us             1  
+                          _rotary_dba7d1e::apply_rotary         2.02%      41.741us         4.14%      85.532us      14.255us      17.985us        69.90%      17.985us       2.997us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.985us        69.90%      17.985us       2.997us             6  
+                                            aten::clone         1.32%      27.361us        84.83%       1.754ms     292.410us       0.000us         0.00%       9.056us       1.509us             6  
+                                            aten::copy_         1.77%      36.582us        81.87%       1.693ms     282.183us       7.744us        30.10%       9.056us       1.509us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        30.10%       7.744us       1.291us             6  
+                                Activity Buffer Request        68.36%       1.414ms        68.36%       1.414ms       1.414ms       1.312us         5.10%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.64%      34.001us         1.64%      34.001us       5.667us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.73%     242.678us        11.73%     242.678us      40.446us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.70%      35.202us         2.18%      45.153us       3.763us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.48%       9.951us         0.48%       9.951us       0.829us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.12%      43.791us         2.12%      43.791us       7.299us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.830us         0.23%       4.830us       4.830us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.068ms
+Self CUDA time total: 25.729us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     351.422us      1397.30%     351.422us     351.422us             1  
+                                      hf_kernels_rotary         8.84%     180.886us        99.76%       2.041ms       2.041ms       0.000us         0.00%      26.462us      26.462us             1  
+                          _rotary_dba7d1e::apply_rotary         2.10%      42.971us         4.17%      85.245us      14.208us      17.214us        68.45%      17.214us       2.869us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.214us        68.45%      17.214us       2.869us             6  
+                                            aten::clone         1.43%      29.360us        84.55%       1.730ms     288.328us       0.000us         0.00%       9.248us       1.541us             6  
+                                            aten::copy_         1.75%      35.821us        81.51%       1.668ms     277.955us       7.936us        31.55%       9.248us       1.541us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        31.55%       7.936us       1.323us             6  
+                                Activity Buffer Request        69.89%       1.430ms        69.89%       1.430ms       1.430ms       1.312us         5.22%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.61%      32.881us         1.61%      32.881us       5.480us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.87%     201.958us         9.87%     201.958us      33.660us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.76%      36.050us         2.20%      45.010us       3.751us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.44%       8.960us         0.44%       8.960us       0.747us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.07%      42.274us         2.07%      42.274us       7.046us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.920us         0.24%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.046ms
+Self CUDA time total: 25.150us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.166us      1351.10%     347.166us     347.166us             1  
+                                      hf_kernels_rotary        21.36%     176.235us        99.42%     820.279us     820.279us       0.000us         0.00%      27.039us      27.039us             1  
+                          _rotary_dba7d1e::apply_rotary         5.20%      42.901us        10.31%      85.044us      14.174us      17.951us        69.86%      17.951us       2.992us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.951us        69.86%      17.951us       2.992us             6  
+                                            aten::clone         2.62%      21.601us        62.49%     515.608us      85.935us       0.000us         0.00%       9.088us       1.515us             6  
+                                            aten::copy_         4.36%      35.950us        55.96%     461.697us      76.950us       7.744us        30.14%       9.088us       1.515us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        30.14%       7.744us       1.291us             6  
+                                Activity Buffer Request        27.88%     230.028us        27.88%     230.028us     230.028us       1.344us         5.23%       1.344us       1.344us             1  
+                                    aten::empty_strided         3.92%      32.310us         3.92%      32.310us       5.385us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.72%     195.719us        23.72%     195.719us      32.620us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.18%      34.481us         5.26%      43.392us       3.616us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.08%       8.911us         1.08%       8.911us       0.743us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.11%      42.143us         5.11%      42.143us       7.024us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.58%       4.821us         0.58%       4.821us       4.821us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 825.100us
+Self CUDA time total: 25.695us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.595us      1078.61%     348.595us     348.595us             1  
+                                      hf_kernels_rotary        21.56%     162.014us        99.35%     746.516us     746.516us       0.000us         0.00%      34.111us      34.111us             1  
+                          _rotary_dba7d1e::apply_rotary         5.56%      41.814us        11.41%      85.705us      14.284us      21.792us        67.43%      21.792us       3.632us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.792us        67.43%      21.792us       3.632us             6  
+                                            aten::clone         2.84%      21.362us        60.59%     455.236us      75.873us       0.000us         0.00%      12.319us       2.053us             6  
+                                            aten::copy_         5.05%      37.942us        53.37%     401.033us      66.839us      10.527us        32.57%      12.319us       2.053us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.527us        32.57%      10.527us       1.755us             6  
+                                Activity Buffer Request        22.09%     165.945us        22.09%     165.945us     165.945us       1.792us         5.54%       1.792us       1.792us             1  
+                                    aten::empty_strided         4.37%      32.841us         4.37%      32.841us       5.474us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        26.24%     197.146us        26.24%     197.146us      32.858us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.61%      34.610us         5.80%      43.561us       3.630us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.19%       8.951us         1.19%       8.951us       0.746us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.84%      43.891us         5.84%      43.891us       7.315us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.65%       4.870us         0.65%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 751.386us
+Self CUDA time total: 32.319us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     353.018us       687.35%     353.018us     353.018us             1  
+                                      hf_kernels_rotary        20.18%     167.279us        99.43%     824.358us     824.358us       0.000us         0.00%      54.175us      54.175us             1  
+                          _rotary_dba7d1e::apply_rotary         5.18%      42.971us        10.43%      86.461us      14.410us      34.432us        67.04%      34.432us       5.739us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.432us        67.04%      34.432us       5.739us             6  
+                                            aten::clone         2.72%      22.563us        63.67%     527.908us      87.985us       0.000us         0.00%      19.743us       3.290us             6  
+                                            aten::copy_         4.40%      36.441us        57.12%     473.605us      78.934us      16.927us        32.96%      19.743us       3.290us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.927us        32.96%      16.927us       2.821us             6  
+                                Activity Buffer Request        29.36%     243.449us        29.36%     243.449us     243.449us       2.816us         5.48%       2.816us       2.816us             1  
+                                    aten::empty_strided         3.83%      31.740us         3.83%      31.740us       5.290us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.37%     193.715us        23.37%     193.715us      32.286us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.09%      33.928us         5.15%      42.710us       3.559us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.06%       8.782us         1.06%       8.782us       0.732us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.25%      43.490us         5.25%      43.490us       7.248us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.57%       4.720us         0.57%       4.720us       4.720us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 829.078us
+Self CUDA time total: 51.359us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     380.387us      1176.94%     380.387us     380.387us             1  
+                                      hf_kernels_rotary         9.88%     201.876us        99.77%       2.039ms       2.039ms       0.000us         0.00%      34.144us      34.144us             1  
+                          _rotary_dba7d1e::apply_rotary         2.25%      45.971us         4.47%      91.374us      15.229us      21.760us        67.33%      21.760us       3.627us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.760us        67.33%      21.760us       3.627us             6  
+                                            aten::clone         1.35%      27.641us        83.24%       1.701ms     283.513us       0.000us         0.00%      12.384us       2.064us             6  
+                                            aten::copy_         1.82%      37.221us        80.29%       1.641ms     273.476us      10.560us        32.67%      12.384us       2.064us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us        32.67%      10.560us       1.760us             6  
+                                Activity Buffer Request        69.28%       1.416ms        69.28%       1.416ms       1.416ms       1.824us         5.64%       1.824us       1.824us             1  
+                                    aten::empty_strided         1.59%      32.582us         1.59%      32.582us       5.430us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.19%     187.866us         9.19%     187.866us      31.311us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.75%      35.720us         2.18%      44.611us       3.718us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.44%       8.891us         0.44%       8.891us       0.741us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.22%      45.403us         2.22%      45.403us       7.567us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.671us         0.23%       4.671us       4.671us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.044ms
+Self CUDA time total: 32.320us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     358.145us       697.76%     358.145us     358.145us             1  
+                                      hf_kernels_rotary         9.30%     187.776us        99.78%       2.015ms       2.015ms       0.000us         0.00%      54.208us      54.208us             1  
+                          _rotary_dba7d1e::apply_rotary         2.06%      41.530us         4.25%      85.754us      14.292us      34.401us        67.02%      34.401us       5.734us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.401us        67.02%      34.401us       5.734us             6  
+                                            aten::clone         1.47%      29.652us        84.14%       1.699ms     283.188us       0.000us         0.00%      19.807us       3.301us             6  
+                                            aten::copy_         1.88%      38.042us        81.10%       1.638ms     272.963us      16.927us        32.98%      19.807us       3.301us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.927us        32.98%      16.927us       2.821us             6  
+                                Activity Buffer Request        70.14%       1.416ms        70.14%       1.416ms       1.416ms       2.880us         5.61%       2.880us       2.880us             1  
+                                    aten::empty_strided         1.57%      31.700us         1.57%      31.700us       5.283us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.08%     183.316us         9.08%     183.316us      30.553us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.65%      33.410us         2.09%      42.241us       3.520us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.44%       8.831us         0.44%       8.831us       0.736us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.19%      44.224us         2.19%      44.224us       7.371us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.22%       4.480us         0.22%       4.480us       4.480us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.019ms
+Self CUDA time total: 51.328us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     361.565us       334.59%     361.565us     361.565us             1  
+                                      hf_kernels_rotary         8.80%     177.873us        99.76%       2.017ms       2.017ms       0.000us         0.00%     126.174us     126.174us             1  
+                                            aten::clone         1.36%      27.530us        84.48%       1.708ms     284.721us       0.000us         0.00%      69.727us      11.621us             6  
+                                            aten::copy_         1.83%      37.081us        81.46%       1.647ms     274.541us      51.615us        47.76%      69.727us      11.621us             6  
+                          _rotary_dba7d1e::apply_rotary         2.15%      43.402us         4.34%      87.665us      14.611us      56.447us        52.24%      56.447us       9.408us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      56.447us        52.24%      56.447us       9.408us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.615us        47.76%      51.615us       8.603us             6  
+                                Activity Buffer Request        70.51%       1.426ms        70.51%       1.426ms       1.426ms      18.112us        16.76%      18.112us      18.112us             1  
+                                    aten::empty_strided         1.66%      33.551us         1.66%      33.551us       5.592us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.12%     184.328us         9.12%     184.328us      30.721us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.73%      34.962us         2.15%      43.472us       3.623us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.42%       8.510us         0.42%       8.510us       0.709us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.19%      44.263us         2.19%      44.263us       7.377us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.810us         0.24%       4.810us       4.810us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.022ms
+Self CUDA time total: 108.062us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     374.332us       209.83%     374.332us     374.332us             1  
+                                      hf_kernels_rotary         8.69%     176.335us        99.78%       2.024ms       2.024ms       0.000us         0.00%     202.046us     202.046us             1  
+                                            aten::clone         1.35%      27.382us        84.12%       1.707ms     284.468us       0.000us         0.00%     102.112us      17.019us             6  
+                                            aten::copy_         1.89%      38.342us        81.18%       1.647ms     274.513us      78.464us        43.98%     102.112us      17.019us             6  
+                          _rotary_dba7d1e::apply_rotary         2.26%      45.922us         4.48%      90.874us      15.146us      99.934us        56.02%      99.934us      16.656us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      99.934us        56.02%      99.934us      16.656us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      78.464us        43.98%      78.464us      13.077us             6  
+                                Activity Buffer Request        70.36%       1.428ms        70.36%       1.428ms       1.428ms      23.648us        13.26%      23.648us      23.648us             1  
+                                    aten::empty_strided         1.59%      32.350us         1.59%      32.350us       5.392us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.93%     181.117us         8.93%     181.117us      30.186us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.68%      34.110us         2.48%      50.391us       4.199us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.80%      16.281us         0.80%      16.281us       1.357us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.22%      44.952us         2.22%      44.952us       7.492us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.22%       4.521us         0.22%       4.521us       4.521us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.029ms
+Self CUDA time total: 178.398us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     350.716us      1341.48%     350.716us     350.716us             1  
+                                      hf_kernels_rotary         8.88%     178.684us        99.76%       2.007ms       2.007ms       0.000us         0.00%      27.264us      27.264us             1  
+                          _rotary_dba7d1e::apply_rotary         2.16%      43.370us         4.24%      85.224us      14.204us      19.393us        74.18%      19.393us       3.232us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.393us        74.18%      19.393us       3.232us             6  
+                                            aten::clone         1.56%      31.330us        84.58%       1.702ms     283.596us       0.000us         0.00%       7.871us       1.312us             6  
+                                            aten::copy_         1.80%      36.292us        81.38%       1.637ms     272.881us       6.751us        25.82%       7.871us       1.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.751us        25.82%       6.751us       1.125us             6  
+                                Activity Buffer Request        70.41%       1.417ms        70.41%       1.417ms       1.417ms       1.120us         4.28%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.64%      32.961us         1.64%      32.961us       5.494us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.17%     184.457us         9.17%     184.457us      30.743us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.63%      32.712us         2.06%      41.532us       3.461us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.44%       8.820us         0.44%       8.820us       0.735us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.08%      41.854us         2.08%      41.854us       6.976us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.24%       4.830us         0.24%       4.830us       4.830us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.012ms
+Self CUDA time total: 26.144us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     377.307us      1377.44%     377.307us     377.307us             1  
+                                      hf_kernels_rotary        21.29%     163.294us        99.28%     761.426us     761.426us       0.000us         0.00%      28.704us      28.704us             1  
+                          _rotary_dba7d1e::apply_rotary         5.68%      43.540us        11.49%      88.163us      14.694us      19.584us        71.50%      19.584us       3.264us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.584us        71.50%      19.584us       3.264us             6  
+                                            aten::clone         3.08%      23.620us        60.95%     467.436us      77.906us       0.000us         0.00%       9.120us       1.520us             6  
+                                            aten::copy_         5.00%      38.311us        53.59%     411.005us      68.501us       7.808us        28.50%       9.120us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us        28.50%       7.808us       1.301us             6  
+                                Activity Buffer Request        21.08%     161.645us        21.08%     161.645us     161.645us       1.312us         4.79%       1.312us       1.312us             1  
+                                    aten::empty_strided         4.28%      32.811us         4.28%      32.811us       5.468us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        27.52%     211.049us        27.52%     211.049us      35.175us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.33%      33.234us         5.55%      42.533us       3.544us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.21%       9.299us         1.21%       9.299us       0.775us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.82%      44.623us         5.82%      44.623us       7.437us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.72%       5.550us         0.72%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 766.976us
+Self CUDA time total: 27.392us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.153us      1234.28%     349.153us     349.153us             1  
+                                      hf_kernels_rotary        19.50%     158.266us        99.38%     806.788us     806.788us       0.000us         0.00%      29.600us      29.600us             1  
+                          _rotary_dba7d1e::apply_rotary         5.36%      43.530us        10.78%      87.514us      14.586us      20.544us        72.62%      20.544us       3.424us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.544us        72.62%      20.544us       3.424us             6  
+                                            aten::clone         2.63%      21.380us        63.75%     517.547us      86.258us       0.000us         0.00%       9.056us       1.509us             6  
+                                            aten::copy_         4.60%      37.352us        57.23%     464.607us      77.434us       7.744us        27.38%       9.056us       1.509us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        27.38%       7.744us       1.291us             6  
+                                Activity Buffer Request        29.79%     241.838us        29.79%     241.838us     241.838us       1.312us         4.64%       1.312us       1.312us             1  
+                                    aten::empty_strided         3.89%      31.560us         3.89%      31.560us       5.260us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.84%     185.417us        22.84%     185.417us      30.903us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.24%      34.459us         5.35%      43.461us       3.622us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.11%       9.002us         1.11%       9.002us       0.750us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.42%      43.984us         5.42%      43.984us       7.331us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.62%       5.020us         0.62%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 811.808us
+Self CUDA time total: 28.288us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.027us       976.29%     348.027us     348.027us             1  
+                                      hf_kernels_rotary        20.53%     156.455us        99.34%     757.166us     757.166us       0.000us         0.00%      37.440us      37.440us             1  
+                          _rotary_dba7d1e::apply_rotary         5.63%      42.881us        11.27%      85.894us      14.316us      25.184us        70.65%      25.184us       4.197us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.184us        70.65%      25.184us       4.197us             6  
+                                            aten::clone         3.00%      22.853us        61.65%     469.877us      78.313us       0.000us         0.00%      12.256us       2.043us             6  
+                                            aten::copy_         4.74%      36.121us        54.50%     415.394us      69.232us      10.464us        29.35%      12.256us       2.043us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.464us        29.35%      10.464us       1.744us             6  
+                                Activity Buffer Request        25.88%     197.217us        25.88%     197.217us     197.217us       1.792us         5.03%       1.792us       1.792us             1  
+                                    aten::empty_strided         4.15%      31.630us         4.15%      31.630us       5.272us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.89%     182.056us        23.89%     182.056us      30.343us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.53%      34.528us         5.90%      44.940us       3.745us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.37%      10.412us         1.37%      10.412us       0.868us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.64%      43.013us         5.64%      43.013us       7.169us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.66%       5.020us         0.66%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 762.186us
+Self CUDA time total: 35.648us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     346.012us      1220.37%     346.012us     346.012us             1  
+                                      hf_kernels_rotary        19.32%     159.865us        99.40%     822.269us     822.269us       0.000us         0.00%      29.665us      29.665us             1  
+                          _rotary_dba7d1e::apply_rotary         5.23%      43.230us        10.32%      85.383us      14.231us      20.577us        72.57%      20.577us       3.429us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.577us        72.57%      20.577us       3.429us             6  
+                                            aten::clone         2.67%      22.091us        64.52%     533.759us      88.960us       0.000us         0.00%       9.088us       1.515us             6  
+                                            aten::copy_         4.35%      36.002us        57.93%     479.208us      79.868us       7.776us        27.43%       9.088us       1.515us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        27.43%       7.776us       1.296us             6  
+                                Activity Buffer Request        31.47%     260.369us        31.47%     260.369us     260.369us       1.312us         4.63%       1.312us       1.312us             1  
+                                    aten::empty_strided         3.92%      32.460us         3.92%      32.460us       5.410us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.10%     182.837us        22.10%     182.837us      30.473us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.12%      34.091us         5.23%      43.262us       3.605us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.11%       9.171us         1.11%       9.171us       0.764us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.10%      42.153us         5.10%      42.153us       7.026us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.60%       4.990us         0.60%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 827.259us
+Self CUDA time total: 28.353us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     379.611us      1063.04%     379.611us     379.611us             1  
+                                      hf_kernels_rotary        17.54%     182.966us        99.53%       1.038ms       1.038ms       0.000us         0.00%      37.470us      37.470us             1  
+                          _rotary_dba7d1e::apply_rotary         4.31%      44.959us         8.52%      88.913us      14.819us      25.247us        70.70%      25.247us       4.208us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.247us        70.70%      25.247us       4.208us             6  
+                                            aten::clone         2.14%      22.291us        69.13%     721.275us     120.212us       0.000us         0.00%      12.223us       2.037us             6  
+                                            aten::copy_         3.58%      37.312us        63.91%     666.784us     111.131us      10.463us        29.30%      12.223us       2.037us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.463us        29.30%      10.463us       1.744us             6  
+                                Activity Buffer Request        42.63%     444.746us        42.63%     444.746us     444.746us       1.760us         4.93%       1.760us       1.760us             1  
+                                    aten::empty_strided         3.09%      32.200us         3.09%      32.200us       5.367us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.71%     184.726us        17.71%     184.726us      30.788us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.45%      36.000us         4.33%      45.221us       3.768us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.88%       9.221us         0.88%       9.221us       0.768us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.21%      43.954us         4.21%      43.954us       7.326us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.47%       4.940us         0.47%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.043ms
+Self CUDA time total: 35.710us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     350.330us       621.69%     350.330us     350.330us             1  
+                                      hf_kernels_rotary        20.69%     166.654us        99.40%     800.657us     800.657us       0.000us         0.00%      59.231us      59.231us             1  
+                          _rotary_dba7d1e::apply_rotary         5.43%      43.738us        10.71%      86.292us      14.382us      39.327us        69.79%      39.327us       6.554us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.327us        69.79%      39.327us       6.554us             6  
+                                            aten::clone         2.60%      20.920us        62.50%     503.467us      83.911us       0.000us         0.00%      19.904us       3.317us             6  
+                                            aten::copy_         4.42%      35.631us        55.79%     449.427us      74.904us      17.024us        30.21%      19.904us       3.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us        30.21%      17.024us       2.837us             6  
+                                Activity Buffer Request        28.71%     231.299us        28.71%     231.299us     231.299us       2.880us         5.11%       2.880us       2.880us             1  
+                                    aten::empty_strided         4.11%      33.120us         4.11%      33.120us       5.520us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.66%     182.497us        22.66%     182.497us      30.416us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.34%      34.964us         5.49%      44.244us       3.687us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.15%       9.280us         1.15%       9.280us       0.773us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.28%      42.554us         5.28%      42.554us       7.092us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.60%       4.850us         0.60%       4.850us       4.850us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 805.507us
+Self CUDA time total: 56.351us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     363.291us       308.26%     363.291us     363.291us             1  
+                                      hf_kernels_rotary        19.60%     166.384us        99.43%     844.179us     844.179us       0.000us         0.00%     134.846us     134.846us             1  
+                                            aten::clone         2.55%      21.670us        64.54%     547.969us      91.328us       0.000us         0.00%      70.143us      11.691us             6  
+                                            aten::copy_         4.54%      38.561us        58.31%     495.019us      82.503us      53.151us        45.10%      70.143us      11.691us             6  
+                          _rotary_dba7d1e::apply_rotary         4.97%      42.172us        10.27%      87.155us      14.526us      64.703us        54.90%      64.703us      10.784us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      64.703us        54.90%      64.703us      10.784us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.151us        45.10%      53.151us       8.859us             6  
+                                Activity Buffer Request        32.22%     273.530us        32.22%     273.530us     273.530us      16.992us        14.42%      16.992us      16.992us             1  
+                                    aten::empty_strided         3.68%      31.280us         3.68%      31.280us       5.213us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.55%     182.928us        21.55%     182.928us      30.488us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.96%      33.580us         5.03%      42.671us       3.556us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.07%       9.091us         1.07%       9.091us       0.758us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.30%      44.983us         5.30%      44.983us       7.497us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.57%       4.820us         0.57%       4.820us       4.820us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 848.999us
+Self CUDA time total: 117.854us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     370.462us       657.41%     370.462us     370.462us             1  
+                                      hf_kernels_rotary         9.39%     189.846us        99.77%       2.018ms       2.018ms       0.000us         0.00%      59.200us      59.200us             1  
+                          _rotary_dba7d1e::apply_rotary         2.15%      43.502us         4.33%      87.525us      14.588us      39.360us        69.85%      39.360us       6.560us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.360us        69.85%      39.360us       6.560us             6  
+                                            aten::clone         1.41%      28.463us        83.80%       1.695ms     282.475us       0.000us         0.00%      19.840us       3.307us             6  
+                                            aten::copy_         1.87%      37.890us        80.77%       1.634ms     272.251us      16.992us        30.15%      19.840us       3.307us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us        30.15%      16.992us       2.832us             6  
+                                Activity Buffer Request        69.77%       1.411ms        69.77%       1.411ms       1.411ms       2.848us         5.05%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.63%      32.881us         1.63%      32.881us       5.480us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.13%     184.676us         9.13%     184.676us      30.779us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.76%      35.550us         2.25%      45.480us       3.790us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.49%       9.930us         0.49%       9.930us       0.827us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.18%      44.023us         2.18%      44.023us       7.337us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.690us         0.23%       4.690us       4.690us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.022ms
+Self CUDA time total: 56.352us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     359.680us       306.26%     359.680us     359.680us             1  
+                                      hf_kernels_rotary         9.06%     182.622us        99.75%       2.011ms       2.011ms       0.000us         0.00%     134.753us     134.753us             1  
+                                            aten::clone         1.36%      27.350us        84.30%       1.700ms     283.278us       0.000us         0.00%      70.114us      11.686us             6  
+                                            aten::copy_         1.85%      37.232us        81.34%       1.640ms     273.341us      52.802us        44.96%      70.114us      11.686us             6  
+                          _rotary_dba7d1e::apply_rotary         2.09%      42.192us         4.26%      85.926us      14.321us      64.639us        55.04%      64.639us      10.773us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      64.639us        55.04%      64.639us      10.773us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.802us        44.96%      52.802us       8.800us             6  
+                                Activity Buffer Request        70.45%       1.420ms        70.45%       1.420ms       1.420ms      17.312us        14.74%      17.312us      17.312us             1  
+                                    aten::empty_strided         1.60%      32.271us         1.60%      32.271us       5.379us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.05%     182.507us         9.05%     182.507us      30.418us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.67%      33.712us         2.12%      42.832us       3.569us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.45%       9.120us         0.45%       9.120us       0.760us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.17%      43.734us         2.17%      43.734us       7.289us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.25%       5.130us         0.25%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.016ms
+Self CUDA time total: 117.441us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     363.604us       186.68%     363.604us     363.604us             1  
+                                      hf_kernels_rotary        18.95%     159.454us        99.42%     836.628us     836.628us       0.000us         0.00%     218.425us     218.425us             1  
+                          _rotary_dba7d1e::apply_rotary         5.11%      42.982us        10.01%      84.264us      14.044us     114.460us        58.76%     114.460us      19.077us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     114.460us        58.76%     114.460us      19.077us             6  
+                                            aten::clone         2.64%      22.190us        65.28%     549.368us      91.561us       0.000us         0.00%     103.965us      17.328us             6  
+                                            aten::copy_         4.30%      36.168us        58.92%     495.836us      82.639us      80.318us        41.24%     103.965us      17.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      80.318us        41.24%      80.318us      13.386us             6  
+                                Activity Buffer Request        32.31%     271.900us        32.31%     271.900us     271.900us      23.647us        12.14%      23.647us      23.647us             1  
+                                    aten::empty_strided         3.72%      31.342us         3.72%      31.342us       5.224us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.31%     187.768us        22.31%     187.768us      31.295us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.01%      33.772us         5.17%      43.542us       3.628us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.16%       9.770us         1.16%       9.770us       0.814us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.91%      41.282us         4.91%      41.282us       6.880us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.58%       4.880us         0.58%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 841.508us
+Self CUDA time total: 194.778us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                      hf_kernels_rotary        13.69%     161.817us        65.35%     772.637us     772.637us       0.000us         0.00%     853.016us     853.016us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     794.618us       101.00%     794.618us     794.618us             1  
+                                            aten::clone         1.91%      22.540us        40.85%     482.956us      80.493us       0.000us         0.00%     580.923us      96.820us             6  
+                                            aten::copy_         3.05%      36.119us        36.34%     429.636us      71.606us     514.652us        65.42%     580.923us      96.820us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     514.652us        65.42%     514.652us      85.775us             6  
+                          _rotary_dba7d1e::apply_rotary         3.53%      41.772us         7.15%      84.524us      14.087us     272.093us        34.58%     272.093us      45.349us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     272.093us        34.58%     272.093us      45.349us             6  
+                                Activity Buffer Request        17.75%     209.918us        17.75%     209.918us     209.918us      66.271us         8.42%      66.271us      66.271us             1  
+                                    aten::empty_strided         2.60%      30.780us         2.60%      30.780us       5.130us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.53%     183.599us        15.53%     183.599us      30.600us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.92%      34.511us         3.67%      43.340us       3.612us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.75%       8.829us         0.75%       8.829us       0.736us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         3.62%      42.752us         3.62%      42.752us       7.125us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        34.65%     409.744us        34.65%     409.744us     409.744us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.182ms
+Self CUDA time total: 786.745us
+
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.10  False
+hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.10  False
+hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  False
+hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.10  False
+hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S2048_H8_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B1_S512_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S512_H32_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B1_S512_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S512_H8_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B2_S128_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S128_H32_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B2_S128_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S128_H8_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B2_S2048_H32_D128_R64     0.28  False
+hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.10  False
+hf_kernels_rotary        cuda_B2_S2048_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S2048_H8_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B2_S512_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S512_H32_D64_R32     0.10  False
+hf_kernels_rotary        cuda_B2_S512_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  False
+
+
+
▶ UV Install Logs
+ +
+
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14.09it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14.09it/s]
+
+

Artifacts:

+rotary.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/rotary/impls/index.html b/rotary/impls/index.html new file mode 100644 index 0000000000000000000000000000000000000000..02ebba5c766f3ff734bd85f035b2321b75e143f7 --- /dev/null +++ b/rotary/impls/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /rotary/impls + + + +
+ ← back +
+

Index of /rotary/impls

+ + + \ No newline at end of file diff --git a/rotary/impls/torch_rotary.html b/rotary/impls/torch_rotary.html new file mode 100644 index 0000000000000000000000000000000000000000..aa154efd76b2499f7b4f91ee8db4a21e33418431 --- /dev/null +++ b/rotary/impls/torch_rotary.html @@ -0,0 +1,4756 @@ + + + + + + torch_rotary + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

PyTorch Native - Rotary Position Embeddings

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.23s + | + +Raw +GitHub +
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Wed Oct 29 00:36:23 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   31C    P0             86W /  350W |       0MiB /  46068MiB |     22%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

Rotary Embeddings Benchmark (PyTorch Native)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 7.56s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def apply_rotary_torch(x1, x2, cos, sin, conj=False):
+    """Reference rotary implementation."""
+    if not conj:
+        out1 = x1 * cos - x2 * sin
+        out2 = x1 * sin + x2 * cos
+    else:
+        out1 = x1 * cos + x2 * sin
+        out2 = -x1 * sin + x2 * cos
+    return out1, out2
+
+
+def torch_rotary(query, key, cos, sin, conj=False):
+    rotary_dim = cos.shape[-1]
+
+    # Clone inputs to avoid modifying them
+    q_out = query.clone()
+    k_out = key.clone()
+
+    # Apply rotation to query
+    q1 = q_out[..., :rotary_dim]
+    q2 = q_out[..., rotary_dim : 2 * rotary_dim]
+    q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
+    q_out[..., :rotary_dim] = q_out_1
+    q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2
+
+    # Apply rotation to key
+    k1 = k_out[..., :rotary_dim]
+    k2 = k_out[..., rotary_dim : 2 * rotary_dim]
+    k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
+    k_out[..., :rotary_dim] = k_out_1
+    k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2
+
+    return q_out, k_out
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.ROTARY,
+    impl_name="torch_eager",
+    impl_tags={"family": "pytorch", "backend": "eager"},
+    impl_func=torch_rotary,
+)
+
+ +
+
+
+
+
Running rotary benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.078ms      1207.69%       1.078ms       1.078ms             1  
+                                            torch_eager        14.52%     400.522us        99.68%       2.750ms       2.750ms       0.000us         0.00%      90.462us      90.462us             1  
+                                              aten::mul         6.17%     170.271us        10.64%     293.512us      12.230us      46.944us        52.60%      46.944us       1.956us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.944us        52.60%      46.944us       1.956us            24  
+                                            aten::copy_         4.22%     116.515us        62.04%       1.711ms      95.079us      28.991us        32.48%      30.207us       1.678us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.335us        25.03%      22.335us       1.861us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.311us        14.91%      13.311us       1.109us            12  
+                                            aten::clone         1.56%      42.898us        60.57%       1.671ms     278.496us       0.000us         0.00%       7.872us       1.312us             6  
+                                              aten::sub         1.56%      43.002us         2.52%      69.413us      11.569us       6.688us         7.49%       6.688us       1.115us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.656us         7.46%       6.656us       1.109us             6  
+                                              aten::add         1.26%      34.801us         2.08%      57.392us       9.565us       6.623us         7.42%       6.623us       1.104us             6  
+                                Activity Buffer Request        52.61%       1.451ms        52.61%       1.451ms       1.451ms       1.216us         1.36%       1.216us       1.216us             1  
+                                    aten::empty_strided         2.18%      60.243us         2.18%      60.243us      10.040us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.73%      75.213us         2.73%      75.213us      12.535us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.16%      87.293us         4.15%     114.414us       4.767us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.98%      27.121us         0.98%      27.121us       1.130us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.73%     240.716us         8.73%     240.716us       5.015us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.32%       8.731us         0.32%       8.731us       8.731us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.759ms
+Self CUDA time total: 89.246us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     965.083us      1066.40%     965.083us     965.083us             1  
+                                            torch_eager        12.32%     311.423us        99.78%       2.522ms       2.522ms       0.000us         0.00%      91.619us      91.619us             1  
+                                              aten::mul         6.03%     152.369us        10.74%     271.352us      11.306us      47.843us        52.87%      47.843us       1.993us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.843us        52.87%      47.843us       1.993us            24  
+                                            aten::copy_         4.27%     107.805us        66.47%       1.680ms      93.342us      29.280us        32.35%      30.400us       1.689us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.528us        24.89%      22.528us       1.877us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.376us        14.78%      13.376us       1.115us            12  
+                                            aten::clone         0.93%      23.570us        63.24%       1.599ms     266.432us       0.000us         0.00%       7.872us       1.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.752us         7.46%       6.752us       1.125us             6  
+                                              aten::sub         1.46%      36.933us         2.41%      60.984us      10.164us       6.720us         7.43%       6.720us       1.120us             6  
+                                              aten::add         1.19%      30.203us         2.05%      51.743us       8.624us       6.656us         7.35%       6.656us       1.109us             6  
+                                Activity Buffer Request        57.43%       1.452ms        57.43%       1.452ms       1.452ms       1.120us         1.24%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.24%      31.422us         1.24%      31.422us       5.237us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.19%      55.410us         2.19%      55.410us       9.235us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.81%      71.135us         3.62%      91.386us       3.808us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.80%      20.251us         0.80%      20.251us       0.844us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.10%     229.986us         9.10%     229.986us       4.791us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.22%       5.600us         0.22%       5.600us       5.600us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.528ms
+Self CUDA time total: 90.499us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     958.300us      1018.96%     958.300us     958.300us             1  
+                                            torch_eager        12.46%     312.732us        99.79%       2.504ms       2.504ms       0.000us         0.00%      95.391us      95.391us             1  
+                                              aten::mul         5.95%     149.403us        10.59%     265.726us      11.072us      48.799us        51.89%      48.799us       2.033us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.799us        51.89%      48.799us       2.033us            24  
+                                            aten::copy_         4.14%     103.773us        66.47%       1.668ms      92.665us      30.815us        32.77%      32.159us       1.787us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.976us        24.43%      22.976us       1.915us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.433us        15.35%      14.433us       1.203us            12  
+                                            aten::clone         0.91%      22.712us        63.16%       1.585ms     264.144us       0.000us         0.00%       9.183us       1.530us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.839us         8.34%       7.839us       1.306us             6  
+                                              aten::sub         1.38%      34.722us         2.34%      58.713us       9.786us       7.233us         7.69%       7.233us       1.206us             6  
+                                              aten::add         1.22%      30.569us         2.11%      52.831us       8.805us       7.200us         7.66%       7.200us       1.200us             6  
+                                Activity Buffer Request        57.45%       1.442ms        57.45%       1.442ms       1.442ms       1.344us         1.43%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.24%      31.042us         1.24%      31.042us       5.174us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.19%      55.002us         2.19%      55.002us       9.167us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.81%      70.525us         3.67%      92.215us       3.842us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.86%      21.690us         0.86%      21.690us       0.904us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.17%     230.176us         9.17%     230.176us       4.795us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.21%       5.390us         0.21%       5.390us       5.390us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.509ms
+Self CUDA time total: 94.047us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     950.973us       939.57%     950.973us     950.973us             1  
+                                            torch_eager        12.66%     301.065us        99.78%       2.372ms       2.372ms       0.000us         0.00%     102.526us     102.526us             1  
+                                              aten::mul         6.27%     149.075us        11.17%     265.677us      11.070us      52.831us        52.20%      52.831us       2.201us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.831us        52.20%      52.831us       2.201us            24  
+                                            aten::copy_         4.35%     103.343us        65.02%       1.546ms      85.882us      32.383us        31.99%      33.695us       1.872us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.639us        24.34%      24.639us       2.053us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.000us        15.81%      16.000us       1.333us            12  
+                                            aten::clone         0.92%      21.771us        61.57%       1.464ms     243.982us       0.000us         0.00%       9.056us       1.509us             6  
+                                              aten::add         1.30%      30.988us         2.24%      53.211us       8.868us       8.001us         7.91%       8.001us       1.333us             6  
+                                              aten::sub         1.50%      35.711us         2.60%      61.701us      10.284us       7.999us         7.90%       7.999us       1.333us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.65%       7.744us       1.291us             6  
+                                Activity Buffer Request        45.51%       1.082ms        45.51%       1.082ms       1.082ms       1.312us         1.30%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.34%      31.870us         1.34%      31.870us       5.312us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        12.33%     293.180us        12.33%     293.180us      48.863us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.97%      70.623us         3.83%      91.173us       3.799us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.86%      20.550us         0.86%      20.550us       0.856us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.76%     232.079us         9.76%     232.079us       4.835us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.22%       5.310us         0.22%       5.310us       5.310us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.378ms
+Self CUDA time total: 101.214us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     979.418us      1041.05%     979.418us     979.418us             1  
+                                            torch_eager        12.27%     347.559us        99.79%       2.828ms       2.828ms       0.000us         0.00%      95.392us      95.392us             1  
+                                              aten::mul         5.36%     151.975us         9.52%     269.888us      11.245us      49.087us        52.18%      49.087us       2.045us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.087us        52.18%      49.087us       2.045us            24  
+                                            aten::copy_         3.87%     109.552us        68.68%       1.946ms     108.124us      30.817us        32.76%      32.129us       1.785us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.945us        24.39%      22.945us       1.912us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        15.07%      14.176us       1.181us            12  
+                                            aten::clone         0.99%      27.952us        65.99%       1.870ms     311.676us       0.000us         0.00%       9.184us       1.531us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us         8.37%       7.872us       1.312us             6  
+                                              aten::add         1.09%      30.843us         1.89%      53.454us       8.909us       7.104us         7.55%       7.104us       1.184us             6  
+                                              aten::sub         1.23%      34.731us         2.14%      60.592us      10.099us       7.072us         7.52%       7.072us       1.179us             6  
+                                Activity Buffer Request        50.62%       1.434ms        50.62%       1.434ms       1.434ms       1.312us         1.39%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.13%      31.881us         1.13%      31.881us       5.314us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.86%     336.003us        11.86%     336.003us      56.000us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.47%      69.892us         3.18%      90.023us       3.751us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.71%      20.131us         0.71%      20.131us       0.839us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.21%     232.618us         8.21%     232.618us       4.846us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.21%       6.050us         0.21%       6.050us       6.050us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.834ms
+Self CUDA time total: 94.080us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     948.701us       937.33%     948.701us     948.701us             1  
+                                            torch_eager        11.33%     313.022us        99.82%       2.758ms       2.758ms       0.000us         0.00%     102.525us     102.525us             1  
+                                              aten::mul         5.41%     149.533us         9.55%     263.868us      10.995us      52.638us        52.01%      52.638us       2.193us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.638us        52.01%      52.638us       2.193us            24  
+                                            aten::copy_         3.84%     106.183us        69.66%       1.925ms     106.940us      32.512us        32.12%      33.824us       1.879us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.736us        24.44%      24.736us       2.061us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.063us        15.87%      16.063us       1.339us            12  
+                                            aten::clone         0.85%      23.503us        66.72%       1.844ms     307.294us       0.000us         0.00%       9.088us       1.515us             6  
+                                              aten::add         1.12%      31.031us         1.92%      53.131us       8.855us       8.064us         7.97%       8.064us       1.344us             6  
+                                              aten::sub         1.28%      35.373us         2.17%      59.842us       9.974us       7.999us         7.90%       7.999us       1.333us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us         7.68%       7.776us       1.296us             6  
+                                Activity Buffer Request        51.85%       1.433ms        51.85%       1.433ms       1.433ms       1.312us         1.30%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.12%      30.890us         1.12%      30.890us       5.148us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.57%     319.641us        11.57%     319.641us      53.273us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.48%      68.444us         3.23%      89.144us       3.714us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.75%      20.700us         0.75%      20.700us       0.863us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.22%     227.175us         8.22%     227.175us       4.733us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.010us         0.18%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.763ms
+Self CUDA time total: 101.213us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     959.141us       795.45%     959.141us     959.141us             1  
+                                            torch_eager        11.32%     313.154us        99.78%       2.759ms       2.759ms       0.000us         0.00%     122.369us     122.369us             1  
+                                              aten::mul         5.48%     151.445us         9.68%     267.778us      11.157us      61.986us        51.41%      61.986us       2.583us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.986us        51.41%      61.986us       2.583us            24  
+                                            aten::copy_         3.75%     103.760us        69.32%       1.917ms     106.485us      39.329us        32.62%      41.120us       2.284us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.898us        23.97%      28.898us       2.408us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.263us        15.98%      19.263us       1.605us            12  
+                                            aten::clone         0.88%      24.203us        66.44%       1.837ms     306.209us       0.000us         0.00%      12.222us       2.037us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us         8.65%      10.431us       1.739us             6  
+                                              aten::add         1.09%      30.212us         1.88%      52.093us       8.682us       9.695us         8.04%       9.695us       1.616us             6  
+                                              aten::sub         1.36%      37.662us         2.24%      62.032us      10.339us       9.568us         7.94%       9.568us       1.595us             6  
+                                Activity Buffer Request        52.07%       1.440ms        52.07%       1.440ms       1.440ms       1.791us         1.49%       1.791us       1.791us             1  
+                                    aten::empty_strided         1.11%      30.761us         1.11%      30.761us       5.127us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.08%     306.470us        11.08%     306.470us      51.078us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.59%      71.623us         3.35%      92.502us       3.854us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.76%      20.879us         0.76%      20.879us       0.870us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.29%     229.176us         8.29%     229.176us       4.774us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.22%       5.960us         0.22%       5.960us       5.960us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.765ms
+Self CUDA time total: 120.578us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     936.986us       546.18%     936.986us     936.986us             1  
+                                            torch_eager        19.74%     302.858us        99.67%       1.529ms       1.529ms       0.000us         0.00%     174.370us     174.370us             1  
+                                              aten::mul         9.62%     147.674us        16.98%     260.548us      10.856us      89.250us        52.02%      89.250us       3.719us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.250us        52.02%      89.250us       3.719us            24  
+                                            aten::copy_         6.71%     102.945us        46.27%     710.024us      39.446us      57.601us        33.58%      60.418us       3.357us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.543us        23.63%      40.543us       3.379us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.702us        14.40%      24.702us       2.059us            12  
+                                            aten::clone         1.46%      22.434us        41.15%     631.323us     105.220us       0.000us         0.00%      19.875us       3.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.058us         9.94%      17.058us       2.843us             6  
+                                              aten::sub         2.30%      35.263us         3.87%      59.363us       9.894us      12.352us         7.20%      12.352us       2.059us             6  
+                                              aten::add         1.99%      30.582us         3.46%      53.142us       8.857us      12.350us         7.20%      12.350us       2.058us             6  
+                                Activity Buffer Request        16.56%     254.079us        16.56%     254.079us     254.079us       2.817us         1.64%       2.817us       2.817us             1  
+                                    aten::empty_strided         1.94%      29.811us         1.94%      29.811us       4.968us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.86%     289.319us        18.86%     289.319us      48.220us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.62%      70.853us         5.94%      91.142us       3.798us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.32%      20.289us         1.32%      20.289us       0.845us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.55%     223.215us        14.55%     223.215us       4.650us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.33%       5.040us         0.33%       5.040us       5.040us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.534ms
+Self CUDA time total: 171.553us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     965.148us       800.00%     965.148us     965.148us             1  
+                                            torch_eager        19.51%     299.410us        99.63%       1.529ms       1.529ms       0.000us         0.00%     122.467us     122.467us             1  
+                                              aten::mul         9.83%     150.825us        17.48%     268.249us      11.177us      62.048us        51.43%      62.048us       2.585us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.048us        51.43%      62.048us       2.585us            24  
+                                            aten::copy_         7.55%     115.928us        45.67%     700.806us      38.934us      39.490us        32.73%      41.314us       2.295us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.961us        24.01%      28.961us       2.413us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.105us        15.84%      19.105us       1.592us            12  
+                                            aten::clone         1.36%      20.940us        39.52%     606.529us     101.088us       0.000us         0.00%      12.353us       2.059us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.529us         8.73%      10.529us       1.755us             6  
+                                              aten::add         2.06%      31.661us         3.57%      54.801us       9.133us       9.568us         7.93%       9.568us       1.595us             6  
+                                              aten::sub         2.41%      36.983us         4.07%      62.503us      10.417us       9.537us         7.91%       9.537us       1.589us             6  
+                                Activity Buffer Request        16.28%     249.768us        16.28%     249.768us     249.768us       1.824us         1.51%       1.824us       1.824us             1  
+                                    aten::empty_strided         1.98%      30.440us         1.98%      30.440us       5.073us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.54%     269.148us        17.54%     269.148us      44.858us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.63%      71.053us         5.99%      91.854us       3.827us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.36%      20.801us         1.36%      20.801us       0.867us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.12%     232.046us        15.12%     232.046us       4.834us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.37%       5.660us         0.37%       5.660us       5.660us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.535ms
+Self CUDA time total: 120.643us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     953.270us       555.27%     953.270us     953.270us             1  
+                                            torch_eager        11.16%     301.267us        99.78%       2.693ms       2.693ms       0.000us         0.00%     174.555us     174.555us             1  
+                                              aten::mul         5.45%     147.123us         9.90%     267.330us      11.139us      88.990us        51.84%      88.990us       3.708us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      88.990us        51.84%      88.990us       3.708us            24  
+                                            aten::copy_         3.86%     104.254us        69.09%       1.865ms     103.603us      57.726us        33.62%      60.605us       3.367us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.575us        23.63%      40.575us       3.381us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.960us        14.54%      24.960us       2.080us            12  
+                                            aten::clone         0.88%      23.712us        66.26%       1.789ms     298.097us       0.000us         0.00%      20.030us       3.338us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.151us         9.99%      17.151us       2.858us             6  
+                                              aten::add         1.11%      29.833us         1.90%      51.253us       8.542us      12.512us         7.29%      12.512us       2.085us             6  
+                                              aten::sub         1.37%      36.961us         2.28%      61.643us      10.274us      12.448us         7.25%      12.448us       2.075us             6  
+                                Activity Buffer Request        53.11%       1.433ms        53.11%       1.433ms       1.433ms       2.879us         1.68%       2.879us       2.879us             1  
+                                    aten::empty_strided         1.15%      30.972us         1.15%      30.972us       5.162us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.79%     264.150us         9.79%     264.150us      44.025us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.62%      70.779us         3.41%      92.149us       3.840us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.79%      21.370us         0.79%      21.370us       0.890us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.50%     229.301us         8.50%     229.301us       4.777us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.22%       6.011us         0.22%       6.011us       6.011us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.699ms
+Self CUDA time total: 171.676us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     943.705us       333.64%     943.705us     943.705us             1  
+                                            torch_eager        19.68%     292.650us        99.63%       1.482ms       1.482ms       0.000us         0.00%     301.376us     301.376us             1  
+                                              aten::mul         9.80%     145.836us        17.38%     258.449us      10.769us     132.447us        46.83%     132.447us       5.519us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     132.447us        46.83%     132.447us       5.519us            24  
+                                            aten::copy_         7.01%     104.213us        45.19%     672.153us      37.342us     109.183us        38.60%     127.711us       7.095us            18  
+                                            aten::clone         1.46%      21.712us        39.66%     589.861us      98.310us       0.000us         0.00%      70.593us      11.766us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.118us        20.19%      57.118us       4.760us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.065us        18.41%      52.065us       8.678us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.218us        14.57%      41.218us       3.435us            12  
+                                              aten::sub         2.40%      35.641us         4.03%      59.963us       9.994us      20.704us         7.32%      20.704us       3.451us             6  
+                                              aten::add         2.14%      31.871us         3.63%      53.951us       8.992us      20.514us         7.25%      20.514us       3.419us             6  
+                                Activity Buffer Request        16.79%     249.768us        16.79%     249.768us     249.768us      18.528us         6.55%      18.528us      18.528us             1  
+                                    aten::empty_strided         2.08%      30.950us         2.08%      30.950us       5.158us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        17.02%     253.139us        17.02%     253.139us      42.190us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.80%      71.455us         6.19%      92.023us       3.834us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.38%      20.568us         1.38%      20.568us       0.857us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.06%     224.048us        15.06%     224.048us       4.668us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.37%       5.550us         0.37%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.487ms
+Self CUDA time total: 282.848us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.008ms       178.28%       1.008ms       1.008ms             1  
+                                            torch_eager        20.19%     309.543us        99.64%       1.528ms       1.528ms       0.000us         0.00%     589.177us     589.177us             1  
+                                            aten::copy_         6.92%     106.132us        42.74%     655.343us      36.408us     274.429us        48.53%     298.108us      16.562us            18  
+                                              aten::mul        10.35%     158.718us        18.57%     284.772us      11.866us     225.374us        39.85%     225.374us       9.391us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     225.374us        39.85%     225.374us       9.391us            24  
+                                            aten::clone         1.37%      21.073us        36.86%     565.269us      94.211us       0.000us         0.00%     207.356us      34.559us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     183.677us        32.48%     183.677us      30.613us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.752us        16.05%      90.752us       7.563us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.695us        11.62%      65.695us       5.475us            12  
+                                              aten::sub         2.38%      36.444us         4.07%      62.445us      10.407us      33.376us         5.90%      33.376us       5.563us             6  
+                                              aten::add         2.04%      31.281us         3.53%      54.151us       9.025us      32.319us         5.72%      32.319us       5.387us             6  
+                                Activity Buffer Request        15.09%     231.317us        15.09%     231.317us     231.317us      23.679us         4.19%      23.679us      23.679us             1  
+                                    aten::empty_strided         1.92%      29.470us         1.92%      29.470us       4.912us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.26%     249.288us        16.26%     249.288us      41.548us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         5.79%      88.836us         7.24%     111.045us       4.627us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.45%      22.209us         1.45%      22.209us       0.925us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.88%     243.531us        15.88%     243.531us       5.074us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.560us         0.36%       5.560us       5.560us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.533ms
+Self CUDA time total: 565.498us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     978.269us      1057.44%     978.269us     978.269us             1  
+                                            torch_eager        11.40%     307.808us        99.80%       2.694ms       2.694ms       0.000us         0.00%      93.633us      93.633us             1  
+                                              aten::mul         5.67%     153.175us        10.04%     271.038us      11.293us      49.695us        53.72%      49.695us       2.071us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.695us        53.72%      49.695us       2.071us            24  
+                                            aten::copy_         3.91%     105.435us        68.36%       1.845ms     102.519us      29.377us        31.75%      30.497us       1.694us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.625us        24.46%      22.625us       1.885us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.441us        14.53%      13.441us       1.120us            12  
+                                            aten::clone         0.87%      23.532us        65.28%       1.762ms     293.710us       0.000us         0.00%       7.872us       1.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.752us         7.30%       6.752us       1.125us             6  
+                                              aten::sub         1.66%      44.684us         2.62%      70.744us      11.791us       6.721us         7.26%       6.721us       1.120us             6  
+                                              aten::add         1.10%      29.730us         1.92%      51.961us       8.660us       6.720us         7.26%       6.720us       1.120us             6  
+                                Activity Buffer Request        52.82%       1.426ms        52.82%       1.426ms       1.426ms       1.120us         1.21%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.12%      30.311us         1.12%      30.311us       5.052us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.14%     246.769us         9.14%     246.769us      41.128us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.68%      72.284us         3.46%      93.345us       3.889us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.78%      21.061us         0.78%      21.061us       0.878us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.64%     233.365us         8.64%     233.365us       4.862us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.520us         0.20%       5.520us       5.520us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.700ms
+Self CUDA time total: 92.513us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     938.603us       972.86%     938.603us     938.603us             1  
+                                            torch_eager        19.93%     287.519us        99.56%       1.436ms       1.436ms       0.000us         0.00%      97.823us      97.823us             1  
+                                              aten::mul        10.40%     150.056us        18.25%     263.188us      10.966us      51.362us        53.24%      51.362us       2.140us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.362us        53.24%      51.362us       2.140us            24  
+                                            aten::copy_         7.16%     103.273us        43.48%     627.121us      34.840us      30.911us        32.04%      32.255us       1.792us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.008us        23.85%      23.008us       1.917us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.206us        14.72%      14.206us       1.184us            12  
+                                            aten::clone         1.50%      21.587us        37.74%     544.337us      90.723us       0.000us         0.00%       9.247us       1.541us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.903us         8.19%       7.903us       1.317us             6  
+                                              aten::sub         2.45%      35.381us         4.12%      59.382us       9.897us       7.103us         7.36%       7.103us       1.184us             6  
+                                              aten::add         2.21%      31.862us         3.86%      55.642us       9.274us       7.103us         7.36%       7.103us       1.184us             6  
+                                Activity Buffer Request        14.93%     215.407us        14.93%     215.407us     215.407us       1.344us         1.39%       1.344us       1.344us             1  
+                                    aten::empty_strided         2.08%      30.053us         2.08%      30.053us       5.009us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.77%     241.899us        16.77%     241.899us      40.317us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.91%      70.826us         6.34%      91.477us       3.812us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.43%      20.651us         1.43%      20.651us       0.860us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.77%     227.455us        15.77%     227.455us       4.739us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.44%       6.400us         0.44%       6.400us       6.400us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.442ms
+Self CUDA time total: 96.479us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     982.735us       947.89%     982.735us     982.735us             1  
+                                            torch_eager        20.01%     296.499us        99.62%       1.476ms       1.476ms       0.000us         0.00%     105.019us     105.019us             1  
+                                              aten::mul        11.54%     171.043us        19.63%     290.918us      12.122us      55.326us        53.36%      55.326us       2.305us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.326us        53.36%      55.326us       2.305us            24  
+                                            aten::copy_         7.11%     105.421us        42.19%     625.268us      34.737us      32.415us        31.27%      33.758us       1.875us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.672us        23.80%      24.672us       2.056us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.935us        15.37%      15.935us       1.328us            12  
+                                            aten::clone         1.46%      21.629us        36.49%     540.678us      90.113us       0.000us         0.00%       9.086us       1.514us             6  
+                                              aten::sub         2.54%      37.603us         4.22%      62.603us      10.434us       7.968us         7.69%       7.968us       1.328us             6  
+                                              aten::add         2.14%      31.750us         3.67%      54.341us       9.057us       7.967us         7.68%       7.967us       1.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.743us         7.47%       7.743us       1.291us             6  
+                                Activity Buffer Request        14.41%     213.507us        14.41%     213.507us     213.507us       1.343us         1.30%       1.343us       1.343us             1  
+                                    aten::empty_strided         2.05%      30.383us         2.05%      30.383us       5.064us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.24%     240.608us        16.24%     240.608us      40.101us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.91%      72.718us         6.38%      94.560us       3.940us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.47%      21.842us         1.47%      21.842us       0.910us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.74%     233.198us        15.74%     233.198us       4.858us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.681us         0.38%       5.681us       5.681us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.482ms
+Self CUDA time total: 103.676us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     979.903us       792.35%     979.903us     979.903us             1  
+                                            torch_eager        11.44%     307.736us        99.80%       2.685ms       2.685ms       0.000us         0.00%     125.495us     125.495us             1  
+                                              aten::mul         5.76%     155.021us        10.44%     280.767us      11.699us      65.018us        52.57%      65.018us       2.709us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.018us        52.57%      65.018us       2.709us            24  
+                                            aten::copy_         4.05%     108.834us        68.12%       1.833ms     101.807us      39.389us        31.85%      41.213us       2.290us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.830us        23.31%      28.830us       2.403us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.264us        15.58%      19.264us       1.605us            12  
+                                            aten::clone         0.88%      23.603us        65.09%       1.751ms     291.863us       0.000us         0.00%      12.383us       2.064us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.559us         8.54%      10.559us       1.760us             6  
+                                              aten::sub         1.31%      35.349us         2.21%      59.490us       9.915us       9.633us         7.79%       9.633us       1.606us             6  
+                                              aten::add         1.16%      31.200us         1.98%      53.350us       8.892us       9.631us         7.79%       9.631us       1.605us             6  
+                                Activity Buffer Request        52.85%       1.422ms        52.85%       1.422ms       1.422ms       1.824us         1.47%       1.824us       1.824us             1  
+                                    aten::empty_strided         1.16%      31.331us         1.16%      31.331us       5.222us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.81%     236.968us         8.81%     236.968us      39.495us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.73%      73.381us         3.57%      95.963us       3.998us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.84%      22.582us         0.84%      22.582us       0.941us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.81%     236.979us         8.81%     236.979us       4.937us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.460us         0.20%       5.460us       5.460us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.690ms
+Self CUDA time total: 123.671us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     945.998us       913.29%     945.998us     945.998us             1  
+                                            torch_eager        20.62%     293.766us        99.60%       1.419ms       1.419ms       0.000us         0.00%     104.893us     104.893us             1  
+                                              aten::mul        10.57%     150.564us        18.69%     266.299us      11.096us      55.198us        53.29%      55.198us       2.300us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.198us        53.29%      55.198us       2.300us            24  
+                                            aten::copy_         7.32%     104.233us        42.25%     601.777us      33.432us      32.416us        31.30%      33.728us       1.874us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.640us        23.79%      24.640us       2.053us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.967us        15.41%      15.967us       1.331us            12  
+                                            aten::clone         1.47%      20.971us        36.37%     518.086us      86.348us       0.000us         0.00%       9.088us       1.515us             6  
+                                              aten::sub         2.48%      35.340us         4.16%      59.262us       9.877us       8.000us         7.72%       8.000us       1.333us             6  
+                                              aten::add         2.24%      31.871us         3.82%      54.371us       9.062us       7.967us         7.69%       7.967us       1.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us         7.51%       7.776us       1.296us             6  
+                                Activity Buffer Request        13.80%     196.526us        13.80%     196.526us     196.526us       1.312us         1.27%       1.312us       1.312us             1  
+                                    aten::empty_strided         2.11%      29.991us         2.11%      29.991us       4.999us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.46%     234.477us        16.46%     234.477us      39.079us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.98%      70.892us         6.48%      92.342us       3.848us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.51%      21.450us         1.51%      21.450us       0.894us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.06%     228.698us        16.06%     228.698us       4.765us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.40%       5.670us         0.40%       5.670us       5.670us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.424ms
+Self CUDA time total: 103.581us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     967.032us       780.87%     967.032us     967.032us             1  
+                                            torch_eager        20.22%     292.458us        99.59%       1.441ms       1.441ms       0.000us         0.00%     125.633us     125.633us             1  
+                                              aten::mul        10.50%     151.918us        18.64%     269.672us      11.236us      65.056us        52.53%      65.056us       2.711us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.056us        52.53%      65.056us       2.711us            24  
+                                            aten::copy_         8.04%     116.307us        42.64%     616.762us      34.265us      39.457us        31.86%      41.250us       2.292us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.928us        23.36%      28.928us       2.411us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.327us        15.61%      19.327us       1.611us            12  
+                                            aten::clone         1.53%      22.111us        35.99%     520.557us      86.759us       0.000us         0.00%      12.322us       2.054us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.529us         8.50%      10.529us       1.755us             6  
+                                              aten::add         2.26%      32.642us         3.84%      55.612us       9.269us       9.696us         7.83%       9.696us       1.616us             6  
+                                              aten::sub         2.60%      37.653us         4.38%      63.343us      10.557us       9.631us         7.78%       9.631us       1.605us             6  
+                                Activity Buffer Request        13.64%     197.336us        13.64%     197.336us     197.336us       1.793us         1.45%       1.793us       1.793us             1  
+                                    aten::empty_strided         2.06%      29.750us         2.06%      29.750us       4.958us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.31%     235.989us        16.31%     235.989us      39.331us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.84%      69.978us         6.29%      90.981us       3.791us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.45%      21.003us         1.45%      21.003us       0.875us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.14%     233.544us        16.14%     233.544us       4.866us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.41%       5.890us         0.41%       5.890us       5.890us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.447ms
+Self CUDA time total: 123.840us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     959.509us       542.22%     959.509us     959.509us             1  
+                                            torch_eager        11.19%     299.303us        99.80%       2.668ms       2.668ms       0.000us         0.00%     179.839us     179.839us             1  
+                                              aten::mul         5.70%     152.426us        10.09%     269.786us      11.241us      94.591us        53.45%      94.591us       3.941us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.591us        53.45%      94.591us       3.941us            24  
+                                            aten::copy_         4.03%     107.815us        68.77%       1.839ms     102.157us      57.793us        32.66%      60.672us       3.371us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.674us        22.98%      40.674us       3.389us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.576us        13.89%      24.576us       2.048us            12  
+                                            aten::clone         0.89%      23.682us        65.69%       1.756ms     292.710us       0.000us         0.00%      19.998us       3.333us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.119us         9.67%      17.119us       2.853us             6  
+                                              aten::add         1.15%      30.841us         2.01%      53.673us       8.945us      12.288us         6.94%      12.288us       2.048us             6  
+                                              aten::sub         1.30%      34.671us         2.23%      59.581us       9.930us      12.288us         6.94%      12.288us       2.048us             6  
+                                Activity Buffer Request        53.49%       1.430ms        53.49%       1.430ms       1.430ms       2.879us         1.63%       2.879us       2.879us             1  
+                                    aten::empty_strided         1.18%      31.429us         1.18%      31.429us       5.238us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.73%     233.388us         8.73%     233.388us      38.898us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.65%      70.737us         3.45%      92.144us       3.839us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.80%      21.407us         0.80%      21.407us       0.892us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.70%     232.603us         8.70%     232.603us       4.846us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.340us         0.20%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.674ms
+Self CUDA time total: 176.960us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     971.862us       328.16%     971.862us     971.862us             1  
+                                            torch_eager        12.15%     329.142us        99.82%       2.705ms       2.705ms       0.000us         0.00%     313.305us     313.305us             1  
+                                              aten::mul         5.49%     148.746us         9.75%     264.179us      11.007us     144.477us        48.78%     144.477us       6.020us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     144.477us        48.78%     144.477us       6.020us            24  
+                                            aten::copy_         3.89%     105.362us        68.19%       1.848ms     102.658us     110.590us        37.34%     127.741us       7.097us            18  
+                                            aten::clone         1.05%      28.441us        65.26%       1.769ms     294.758us       0.000us         0.00%      70.398us      11.733us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.343us        19.36%      57.343us       4.779us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.247us        17.98%      53.247us       8.874us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.087us        13.87%      41.087us       3.424us            12  
+                                              aten::sub         1.30%      35.153us         2.18%      59.153us       9.859us      20.672us         6.98%      20.672us       3.445us             6  
+                                              aten::add         1.16%      31.441us         1.98%      53.651us       8.942us      20.415us         6.89%      20.415us       3.402us             6  
+                                Activity Buffer Request        53.08%       1.438ms        53.08%       1.438ms       1.438ms      17.151us         5.79%      17.151us      17.151us             1  
+                                    aten::empty_strided         1.17%      31.740us         1.17%      31.740us       5.290us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.63%     233.787us         8.63%     233.787us      38.964us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.59%      70.073us         3.35%      90.793us       3.783us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.76%      20.720us         0.76%      20.720us       0.863us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.56%     231.958us         8.56%     231.958us       4.832us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.010us         0.18%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.710ms
+Self CUDA time total: 296.154us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     945.200us       534.23%     945.200us     945.200us             1  
+                                            torch_eager        20.38%     296.401us        99.59%       1.448ms       1.448ms       0.000us         0.00%     179.808us     179.808us             1  
+                                              aten::mul        10.46%     152.181us        18.12%     263.514us      10.980us      94.525us        53.43%      94.525us       3.939us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.525us        53.43%      94.525us       3.939us            24  
+                                            aten::copy_         7.17%     104.245us        43.33%     630.244us      35.014us      57.666us        32.59%      60.547us       3.364us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.673us        22.99%      40.673us       3.389us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.736us        13.98%      24.736us       2.061us            12  
+                                            aten::clone         1.49%      21.691us        37.84%     550.400us      91.733us       0.000us         0.00%      19.874us       3.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.993us         9.60%      16.993us       2.832us             6  
+                                              aten::sub         2.47%      35.942us         4.11%      59.792us       9.965us      12.384us         7.00%      12.384us       2.064us             6  
+                                              aten::add         2.09%      30.411us         3.88%      56.491us       9.415us      12.352us         6.98%      12.352us       2.059us             6  
+                                Activity Buffer Request        15.10%     219.557us        15.10%     219.557us     219.557us       2.881us         1.63%       2.881us       2.881us             1  
+                                    aten::empty_strided         2.03%      29.530us         2.03%      29.530us       4.922us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.76%     243.759us        16.76%     243.759us      40.627us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.81%      69.906us         6.24%      90.817us       3.784us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.44%      20.911us         1.44%      20.911us       0.871us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.40%     223.946us        15.40%     223.946us       4.666us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.41%       5.961us         0.41%       5.961us       5.961us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.454ms
+Self CUDA time total: 176.927us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     966.807us       326.27%     966.807us     966.807us             1  
+                                            torch_eager        21.10%     301.699us        99.64%       1.425ms       1.425ms       0.000us         0.00%     314.141us     314.141us             1  
+                                              aten::mul        10.74%     153.603us        18.95%     270.927us      11.289us     144.864us        48.89%     144.864us       6.036us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     144.864us        48.89%     144.864us       6.036us            24  
+                                            aten::copy_         7.47%     106.842us        41.64%     595.420us      33.079us     110.942us        37.44%     128.766us       7.154us            18  
+                                            aten::clone         1.49%      21.294us        35.47%     507.209us      84.535us       0.000us         0.00%      71.614us      11.936us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.152us        19.29%      57.152us       4.763us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.790us        18.15%      53.790us       8.965us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.511us        13.67%      40.511us       3.376us            12  
+                                              aten::sub         2.47%      35.333us         4.25%      60.804us      10.134us      20.288us         6.85%      20.288us       3.381us             6  
+                                              aten::add         2.13%      30.471us         3.66%      52.363us       8.727us      20.223us         6.82%      20.223us       3.371us             6  
+                                Activity Buffer Request        13.62%     194.727us        13.62%     194.727us     194.727us      17.824us         6.02%      17.824us      17.824us             1  
+                                    aten::empty_strided         2.14%      30.600us         2.14%      30.600us       5.100us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.72%     224.758us        15.72%     224.758us      37.460us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.94%      70.633us         6.40%      91.582us       3.816us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.47%      20.949us         1.47%      20.949us       0.873us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.35%     233.780us        16.35%     233.780us       4.870us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.210us         0.36%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.430ms
+Self CUDA time total: 296.317us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     954.667us       163.51%     954.667us     954.667us             1  
+                                            torch_eager        21.08%     298.350us        99.62%       1.410ms       1.410ms       0.000us         0.00%     607.510us     607.510us             1  
+                                            aten::copy_         7.23%     102.385us        41.44%     586.482us      32.582us     268.667us        46.02%     292.315us      16.240us            18  
+                                              aten::mul        10.73%     151.847us        18.95%     268.240us      11.177us     249.820us        42.79%     249.820us      10.409us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     249.820us        42.79%     249.820us      10.409us            24  
+                                            aten::clone         1.47%      20.758us        35.72%     505.547us      84.258us       0.000us         0.00%     201.757us      33.626us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     178.109us        30.51%     178.109us      29.685us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.558us        15.51%      90.558us       7.547us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.375us        11.20%      65.375us       5.448us            12  
+                                              aten::sub         2.55%      36.094us         4.25%      60.153us      10.026us      32.800us         5.62%      32.800us       5.467us             6  
+                                              aten::add         2.18%      30.790us         3.76%      53.162us       8.860us      32.575us         5.58%      32.575us       5.429us             6  
+                                Activity Buffer Request        14.07%     199.186us        14.07%     199.186us     199.186us      23.648us         4.05%      23.648us      23.648us             1  
+                                    aten::empty_strided         2.17%      30.642us         2.17%      30.642us       5.107us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.65%     221.418us        15.65%     221.418us      36.903us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         5.01%      70.953us         6.50%      91.982us       3.833us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.49%      21.029us         1.49%      21.029us       0.876us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.99%     226.317us        15.99%     226.317us       4.715us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.410us         0.38%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.415ms
+Self CUDA time total: 583.862us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager        13.40%     294.433us        64.71%       1.422ms       1.422ms       0.000us         0.00%       1.833ms       1.833ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.806ms       102.20%       1.806ms       1.806ms             1  
+                                            aten::copy_         4.78%     104.916us        25.99%     570.899us      31.717us     790.968us        44.76%     856.920us      47.607us            18  
+                                              aten::mul         7.09%     155.744us        13.34%     293.130us      12.214us     828.278us        46.87%     828.278us      34.512us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     828.278us        46.87%     828.278us      34.512us            24  
+                                            aten::clone         0.98%      21.583us        22.19%     487.616us      81.269us       0.000us         0.00%     623.577us     103.929us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     557.625us        31.56%     557.625us      92.937us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     233.343us        13.20%     233.343us      19.445us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     147.904us         8.37%     147.904us      12.325us            12  
+                                              aten::sub         1.71%      37.532us         2.94%      64.522us      10.754us      89.216us         5.05%      89.216us      14.869us             6  
+                                Activity Buffer Request         8.13%     178.646us         8.13%     178.646us     178.646us      65.952us         3.73%      65.952us      65.952us             1  
+                                              aten::add         1.39%      30.430us         2.44%      53.591us       8.932us      58.688us         3.32%      58.688us       9.781us             6  
+                                    aten::empty_strided         1.37%      30.060us         1.37%      30.060us       5.010us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.15%     222.926us        10.15%     222.926us      37.154us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.32%      73.001us         4.25%      93.471us       3.895us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.93%      20.470us         0.93%      20.470us       0.853us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        11.47%     251.948us        11.47%     251.948us       5.249us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize        35.29%     775.316us        35.29%     775.316us     775.316us       0.000us         0.00%       0.000us       0.000us             1  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.197ms
+Self CUDA time total: 1.767ms
+
+
+impl                     wl                  p50(ms)  ok
+torch_eager              cuda_B1_S128_H32_D128_R64     0.23  True
+torch_eager              cuda_B1_S128_H32_D64_R32     0.23  True
+torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
+torch_eager              cuda_B1_S128_H8_D64_R32     0.18  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
+torch_eager              cuda_B1_S2048_H32_D64_R32     0.23  True
+torch_eager              cuda_B1_S2048_H8_D128_R64     0.23  True
+torch_eager              cuda_B1_S2048_H8_D64_R32     0.23  True
+torch_eager              cuda_B1_S512_H32_D128_R64     0.23  True
+torch_eager              cuda_B1_S512_H32_D64_R32     0.23  True
+torch_eager              cuda_B1_S512_H8_D128_R64     0.23  True
+torch_eager              cuda_B1_S512_H8_D64_R32     0.23  True
+torch_eager              cuda_B2_S128_H32_D128_R64     0.23  True
+torch_eager              cuda_B2_S128_H32_D64_R32     0.23  True
+torch_eager              cuda_B2_S128_H8_D128_R64     0.23  True
+torch_eager              cuda_B2_S128_H8_D64_R32     0.23  True
+torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
+torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
+torch_eager              cuda_B2_S2048_H8_D128_R64     0.23  True
+torch_eager              cuda_B2_S2048_H8_D64_R32     0.23  True
+torch_eager              cuda_B2_S512_H32_D128_R64     0.23  True
+torch_eager              cuda_B2_S512_H32_D64_R32     0.23  True
+torch_eager              cuda_B2_S512_H8_D128_R64     0.23  True
+torch_eager              cuda_B2_S512_H8_D64_R32     0.23  True
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+rotary.jsonl +
+
+
+
+ + + \ No newline at end of file diff --git a/rotary/index.html b/rotary/index.html new file mode 100644 index 0000000000000000000000000000000000000000..5ff503336b04c290f15ed24958b96a45568efad3 --- /dev/null +++ b/rotary/index.html @@ -0,0 +1,3879 @@ + + + + + + index + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

Rotary Position Embeddings Benchmarks

+

This directory contains benchmarks for Rotary Position Embeddings (RoPE) implementations.

+

Implementations

+ +

Results

+ +
+ + + \ No newline at end of file diff --git a/rotary/results/artifacts/combine/latency.svg b/rotary/results/artifacts/combine/latency.svg new file mode 100644 index 0000000000000000000000000000000000000000..fbdea134ec02a1374a095572b0b66bc987fe1081 --- /dev/null +++ b/rotary/results/artifacts/combine/latency.svg @@ -0,0 +1,489 @@ + + + + + + + 2025-10-29T00:37:24.930217 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_B1_S128_H8_D64_R32 + + + + + + + + + + + + + cuda_B1_S128_H8_D128_R64 + + + + + + + + + + + + + cuda_B1_S128_H32_D64_R32 + + + + + + + + + + + + + cuda_B1_S128_H32_D128_R64 + + + + + + + + + + + + + cuda_B1_S512_H8_D64_R32 + + + + + + + + + + + + + cuda_B1_S512_H8_D128_R64 + + + + + + + + + + + + + cuda_B1_S512_H32_D64_R32 + + + + + + + + + + + + + cuda_B1_S512_H32_D128_R64 + + + + + + + + + + + + + cuda_B1_S2048_H8_D64_R32 + + + + + + + + + + + + + cuda_B1_S2048_H8_D128_R64 + + + + + + + + + + + + + cuda_B1_S2048_H32_D64_R32 + + + + + + + + + + + + + cuda_B1_S2048_H32_D128_R64 + + + + + + + + + + + + + cuda_B2_S128_H8_D64_R32 + + + + + + + + + + + + + cuda_B2_S128_H8_D128_R64 + + + + + + + + + + + + + cuda_B2_S128_H32_D64_R32 + + + + + + + + + + + + + cuda_B2_S128_H32_D128_R64 + + + + + + + + + + + + + cuda_B2_S512_H8_D64_R32 + + + + + + + + + + + + + cuda_B2_S512_H8_D128_R64 + + + + + + + + + + + + + cuda_B2_S512_H32_D64_R32 + + + + + + + + + + + + + cuda_B2_S512_H32_D128_R64 + + + + + + + + + + + + + cuda_B2_S2048_H8_D64_R32 + + + + + + + + + + + + + cuda_B2_S2048_H8_D128_R64 + + + + + + + + + + + + + cuda_B2_S2048_H32_D64_R32 + + + + + + + + + + + + + cuda_B2_S2048_H32_D128_R64 + + + + Workload + + + + + + + + + + + + + + + + + 0.2 + + + + + + + + + + + + + 0.3 + + + + + + + + + + + + + 0.4 + + + + + + + + + + + + + 0.5 + + + + + + + + + + + + + 0.6 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + torch_eager + + + + + + + + + + \ No newline at end of file diff --git a/rotary/results/cells/combine.py b/rotary/results/cells/combine.py new file mode 100644 index 0000000000000000000000000000000000000000..92c70bcedd0c600f59230f141fc59b2158a3df4d --- /dev/null +++ b/rotary/results/cells/combine.py @@ -0,0 +1,26 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch==2.8.0", +# "kernels-benchmark-tools", +# "matplotlib", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "../../../../../tools", editable = true } +# /// +from kernels_benchmark_tools.core.visuals import generate_combined_results + +# Map display names to uvnote environment variables +cache_env_map = { + "HF Kernels Rotary": "UVNOTE_FILE_HF_KERNELS_ROTARY_BENCHMARK", + "PyTorch Rotary": "UVNOTE_FILE_TORCH_ROTARY_BENCHMARK", +} + +# Generate combined results with visualization +generate_combined_results( + cache_env_map=cache_env_map, + output_filename="rotary.jsonl", + svg_filename="latency.svg" +) \ No newline at end of file diff --git a/rotary/results/combined_results.html b/rotary/results/combined_results.html new file mode 100644 index 0000000000000000000000000000000000000000..488032a1b5a5ed4e0a06a61b22547b7136e3e053 --- /dev/null +++ b/rotary/results/combined_results.html @@ -0,0 +1,5024 @@ + + + + + + Rotary Position Embeddings Benchmark - Combined Results + + + + + + + +
+
+ + ← back + +
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

Rotary Position Embeddings Benchmarks - Aggregated Results

+

This document combines benchmark results from multiple Rotary Position Embeddings implementations.

+

Combined Summary and Visualization

+
+ + + + + + + 2025-10-29T00:37:24.930217 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_B1_S128_H8_D64_R32 + + + + + + + + + + + + + cuda_B1_S128_H8_D128_R64 + + + + + + + + + + + + + cuda_B1_S128_H32_D64_R32 + + + + + + + + + + + + + cuda_B1_S128_H32_D128_R64 + + + + + + + + + + + + + cuda_B1_S512_H8_D64_R32 + + + + + + + + + + + + + cuda_B1_S512_H8_D128_R64 + + + + + + + + + + + + + cuda_B1_S512_H32_D64_R32 + + + + + + + + + + + + + cuda_B1_S512_H32_D128_R64 + + + + + + + + + + + + + cuda_B1_S2048_H8_D64_R32 + + + + + + + + + + + + + cuda_B1_S2048_H8_D128_R64 + + + + + + + + + + + + + cuda_B1_S2048_H32_D64_R32 + + + + + + + + + + + + + cuda_B1_S2048_H32_D128_R64 + + + + + + + + + + + + + cuda_B2_S128_H8_D64_R32 + + + + + + + + + + + + + cuda_B2_S128_H8_D128_R64 + + + + + + + + + + + + + cuda_B2_S128_H32_D64_R32 + + + + + + + + + + + + + cuda_B2_S128_H32_D128_R64 + + + + + + + + + + + + + cuda_B2_S512_H8_D64_R32 + + + + + + + + + + + + + cuda_B2_S512_H8_D128_R64 + + + + + + + + + + + + + cuda_B2_S512_H32_D64_R32 + + + + + + + + + + + + + cuda_B2_S512_H32_D128_R64 + + + + + + + + + + + + + cuda_B2_S2048_H8_D64_R32 + + + + + + + + + + + + + cuda_B2_S2048_H8_D128_R64 + + + + + + + + + + + + + cuda_B2_S2048_H32_D64_R32 + + + + + + + + + + + + + cuda_B2_S2048_H32_D128_R64 + + + + Workload + + + + + + + + + + + + + + + + + 0.2 + + + + + + + + + + + + + 0.3 + + + + + + + + + + + + + 0.4 + + + + + + + + + + + + + 0.5 + + + + + + + + + + + + + 0.6 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + torch_eager + + + + + + + + + + +
+ +
+
+ +▶ code +▼ output + ▶ uv-logs + | +Cell: combine | 4.37s + | + +Raw +
+ +
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ HF Kernels Rotary             : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/49ec9501b131c967277abe3cccb638422565260339bb30f5ea386b0076f2183e
+✓ PyTorch Rotary                : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5
+
+  ✓ Found HF Kernels Rotary
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/49ec9501b131c967277abe3cccb638422565260339bb30f5ea386b0076f2183e/rotary.jsonl
+  ✓ Found PyTorch Rotary
+     Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5/rotary.jsonl
+
+======================================================================
+Summary: 2 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl                     wl                  p50(ms)  ok
+hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.10  False
+hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.10  False
+hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  False
+hf_kernels_rotary        cuda_B1_S2048_H32_D128_R64     0.10  False
+hf_kernels_rotary        cuda_B1_S2048_H32_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B1_S2048_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S2048_H8_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B1_S512_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S512_H32_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B1_S512_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S512_H8_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B2_S128_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S128_H32_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B2_S128_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S128_H8_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B2_S2048_H32_D128_R64     0.28  False
+hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.10  False
+hf_kernels_rotary        cuda_B2_S2048_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S2048_H8_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B2_S512_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S512_H32_D64_R32     0.10  False
+hf_kernels_rotary        cuda_B2_S512_H8_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  False
+torch_eager              cuda_B1_S128_H32_D128_R64     0.23  True
+torch_eager              cuda_B1_S128_H32_D64_R32     0.23  True
+torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
+torch_eager              cuda_B1_S128_H8_D64_R32     0.18  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
+torch_eager              cuda_B1_S2048_H32_D64_R32     0.23  True
+torch_eager              cuda_B1_S2048_H8_D128_R64     0.23  True
+torch_eager              cuda_B1_S2048_H8_D64_R32     0.23  True
+torch_eager              cuda_B1_S512_H32_D128_R64     0.23  True
+torch_eager              cuda_B1_S512_H32_D64_R32     0.23  True
+torch_eager              cuda_B1_S512_H8_D128_R64     0.23  True
+torch_eager              cuda_B1_S512_H8_D64_R32     0.23  True
+torch_eager              cuda_B2_S128_H32_D128_R64     0.23  True
+torch_eager              cuda_B2_S128_H32_D64_R32     0.23  True
+torch_eager              cuda_B2_S128_H8_D128_R64     0.23  True
+torch_eager              cuda_B2_S128_H8_D64_R32     0.23  True
+torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
+torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
+torch_eager              cuda_B2_S2048_H8_D128_R64     0.23  True
+torch_eager              cuda_B2_S2048_H8_D64_R32     0.23  True
+torch_eager              cuda_B2_S512_H32_D128_R64     0.23  True
+torch_eager              cuda_B2_S512_H32_D64_R32     0.23  True
+torch_eager              cuda_B2_S512_H8_D128_R64     0.23  True
+torch_eager              cuda_B2_S512_H8_D64_R32     0.23  True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 48 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 2
+
+Implementations included:
+  ✓ HF Kernels Rotary
+  ✓ PyTorch Rotary
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+latency.svg +
+ + + + + + + 2025-10-29T00:37:24.930217 + image/svg+xml + + + Matplotlib v3.10.7, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cuda_B1_S128_H8_D64_R32 + + + + + + + + + + + + + cuda_B1_S128_H8_D128_R64 + + + + + + + + + + + + + cuda_B1_S128_H32_D64_R32 + + + + + + + + + + + + + cuda_B1_S128_H32_D128_R64 + + + + + + + + + + + + + cuda_B1_S512_H8_D64_R32 + + + + + + + + + + + + + cuda_B1_S512_H8_D128_R64 + + + + + + + + + + + + + cuda_B1_S512_H32_D64_R32 + + + + + + + + + + + + + cuda_B1_S512_H32_D128_R64 + + + + + + + + + + + + + cuda_B1_S2048_H8_D64_R32 + + + + + + + + + + + + + cuda_B1_S2048_H8_D128_R64 + + + + + + + + + + + + + cuda_B1_S2048_H32_D64_R32 + + + + + + + + + + + + + cuda_B1_S2048_H32_D128_R64 + + + + + + + + + + + + + cuda_B2_S128_H8_D64_R32 + + + + + + + + + + + + + cuda_B2_S128_H8_D128_R64 + + + + + + + + + + + + + cuda_B2_S128_H32_D64_R32 + + + + + + + + + + + + + cuda_B2_S128_H32_D128_R64 + + + + + + + + + + + + + cuda_B2_S512_H8_D64_R32 + + + + + + + + + + + + + cuda_B2_S512_H8_D128_R64 + + + + + + + + + + + + + cuda_B2_S512_H32_D64_R32 + + + + + + + + + + + + + cuda_B2_S512_H32_D128_R64 + + + + + + + + + + + + + cuda_B2_S2048_H8_D64_R32 + + + + + + + + + + + + + cuda_B2_S2048_H8_D128_R64 + + + + + + + + + + + + + cuda_B2_S2048_H32_D64_R32 + + + + + + + + + + + + + cuda_B2_S2048_H32_D128_R64 + + + + Workload + + + + + + + + + + + + + + + + + 0.2 + + + + + + + + + + + + + 0.3 + + + + + + + + + + + + + 0.4 + + + + + + + + + + + + + 0.5 + + + + + + + + + + + + + 0.6 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + torch_eager + + + + + + + + + + +
+
+
+
+
+ + + \ No newline at end of file diff --git a/rotary/results/index.html b/rotary/results/index.html new file mode 100644 index 0000000000000000000000000000000000000000..329ab80bbf228ddf5f35f9df4da9dcb65d51731e --- /dev/null +++ b/rotary/results/index.html @@ -0,0 +1,88 @@ + + + + + + Index of /rotary/results + + + +
+ ← back +
+

Index of /rotary/results

+ + + \ No newline at end of file