diff --git a/activation/impls/artifacts/benchmark/activation.jsonl b/activation/impls/artifacts/benchmark/activation.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..cc4da7fcd25da93cbac6f187979dc308423e4df7
--- /dev/null
+++ b/activation/impls/artifacts/benchmark/activation.jsonl
@@ -0,0 +1,9 @@
+{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.022950000015953265, "p50": 0.023951000002853107, "p90": 0.0245499999778076, "mean": 0.02414040001212925, "iqr": 0.0010899999551838846, "raw_times": [0.02579100004140855, 0.0245499999778076, 0.023951000002853107, 0.022950000015953265, 0.023460000022623717], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031180999997104664, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02659000000448941, "p50": 0.03026100000624865, "p90": 0.03163099995617813, "mean": 0.03016299999671901, "iqr": 0.001709999935428641, "raw_times": [0.02659000000448941, 0.03026100000624865, 0.02992100002074949, 0.03163099995617813, 0.032411999995929364], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03256100001181039, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02795999995441889, "p50": 0.0293610000312583, "p90": 0.02937200002861573, "mean": 0.029306999988421012, "iqr": 9.100006082007894e-05, "raw_times": [0.02795999995441889, 0.03056099996001649, 0.0293610000312583, 0.02928099996779565, 0.02937200002861573], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03265100002636245, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02837199997429707, "p50": 0.029151000035199104, "p90": 0.0292910000325719, "mean": 0.028971200003979902, "iqr": 0.0007500000265281415, "raw_times": [0.02854100000604376, 0.0292910000325719, 0.029500999971787678, 0.029151000035199104, 0.02837199997429707], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03205100000513994, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0284509999914917, "p50": 0.02926099995192999, "p90": 0.029411000014079036, "mean": 0.029144599977826147, "iqr": 0.0005010000450056395, "raw_times": [0.028909999969073397, 0.029689999962556612, 0.029411000014079036, 0.0284509999914917, 0.02926099995192999], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031930999966789386, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027061000025696558, "p50": 0.028121000013925368, "p90": 0.02836999999544787, "mean": 0.027967000005446607, "iqr": 0.0005990000317979138, "raw_times": [0.027770999963649956, 0.028512000028513285, 0.028121000013925368, 0.02836999999544787, 0.027061000025696558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030291000030047144, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02748099996097153, "p50": 0.029001000029893476, "p90": 0.030041000002256624, "mean": 0.029116999996858794, "iqr": 0.0011299999869152089, "raw_times": [0.02748099996097153, 0.030150999975830928, 0.030041000002256624, 0.029001000029893476, 0.028911000015341415], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031200999956126907, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028581000037775084, "p50": 0.028771000017968618, "p90": 0.02886099997567726, "mean": 0.028774800000519463, "iqr": 0.00020999999605919584, "raw_times": [0.028581000037775084, 0.02900999999155829, 0.028771000017968618, 0.028650999979618064, 0.02886099997567726], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03162100000508872, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028431000032469456, "p50": 0.029390999998213374, "p90": 0.029580999978406908, "mean": 0.029274800010625768, "iqr": 0.00035999994452140527, "raw_times": [0.028431000032469456, 0.029221000033885502, 0.0297500000101536, 0.029390999998213374, 0.029580999978406908], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030401000003621448, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
diff --git a/activation/impls/cells/benchmark.py b/activation/impls/cells/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..04f9df27c14acf429b58dba6cf0677c00cbbbced
--- /dev/null
+++ b/activation/impls/cells/benchmark.py
@@ -0,0 +1,51 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the activation kernel
+activation = get_kernel("kernels-community/activation")
+
+
+def hf_kernels_swiglu(input_tensor):
+    """Compute SwiGLU with the Hub ``activation`` kernel.
+
+    Allocates an output tensor with the input's dtype and device whose last
+    dimension is half of the input's, hands both tensors to
+    ``activation.silu_and_mul``, and returns the output tensor.
+
+    Args:
+        input_tensor: Tensor whose last dimension is split in two by the
+            kernel (presumably gate half followed by up half -- confirm
+            against the kernel). An odd last dimension is floored by ``// 2``.
+
+    Returns:
+        The newly allocated tensor passed to the kernel as its destination.
+    """
+    hidden_dim = input_tensor.shape[-1] // 2
+    out_shape = input_tensor.shape[:-1] + (hidden_dim,)
+    out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
+    # `out` is the kernel's destination buffer; return it explicitly so the
+    # result does not depend on what the binding itself happens to return.
+    activation.silu_and_mul(out, input_tensor)
+    return out
+
+
+# Run the activation benchmark for this implementation.
+run_benchmark(
+    kernel_type=KernelTypeEnum.ACTIVATION,
+    impl_name="hf_kernels_swiglu",
+    impl_tags={"family": "hf-kernels", "backend": "cuda"},
+    impl_func=hf_kernels_swiglu,
+)
diff --git a/activation/impls/cells/nv.py b/activation/impls/cells/nv.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5
--- /dev/null
+++ b/activation/impls/cells/nv.py
@@ -0,0 +1,13 @@
+import subprocess
+
+# Record the GPU/driver state in the report. This cell is informational only,
+# so a missing or failing `nvidia-smi` is reported instead of raising.
+try:
+    result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
+except FileNotFoundError:
+    print("nvidia-smi not found on PATH; GPU info unavailable")
+else:
+    print(result.stdout)
+    if result.returncode != 0:
+        # stderr is captured above; surface it so failures are not silent.
+        print(f"nvidia-smi exited with status {result.returncode}: {result.stderr.strip()}")
diff --git a/activation/impls/hf_kernels_swiglu.html b/activation/impls/hf_kernels_swiglu.html
new file mode 100644
index 0000000000000000000000000000000000000000..acb55b041fa0c36b529ec1b92a7fddcfe345e099
--- /dev/null
+++ b/activation/impls/hf_kernels_swiglu.html
@@ -0,0 +1,4181 @@
+
+
+
+
+
+ hf_kernels_swiglu
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
HF Kernels - SwiGLU Activation
+
GPU Info
+
+
+
+
+
import subprocess
+# Print nvidia-smi's stdout; stderr and the exit status are captured but not shown.
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+
+
+
+
+
+
Wed Oct 29 00:36:01 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
+|-----------------------------------------+------------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+========================+======================|
+| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+| N/A 29C P0 77W / 350W | 0MiB / 46068MiB | 0% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=========================================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+
+
SwiGLU Benchmark
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the activation kernel
+activation = get_kernel("kernels-community/activation")
+
+
+def hf_kernels_swiglu(input_tensor):
+ """Run SwiGLU through the Hub `activation` kernel's `silu_and_mul`.
+
+ Allocates an uninitialized output with the input's dtype and device whose
+ last dimension is `input_tensor.shape[-1] // 2`, passes it to the kernel
+ together with the input, and returns whatever `silu_and_mul` returns.
+ """
+ hidden_dim = input_tensor.shape[-1] // 2
+ out_shape = input_tensor.shape[:-1] + (hidden_dim,)
+ out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
+ return activation.silu_and_mul(out, input_tensor)
+
+
+# Run the activation benchmark for `hf_kernels_swiglu`; the name and tags
+# given here are the ones recorded as "impl" and "tags" in the results.
+run_benchmark(
+ kernel_type=KernelTypeEnum.ACTIVATION,
+ impl_name="hf_kernels_swiglu",
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
+ impl_func=hf_kernels_swiglu,
+)
+
+
+
+
+
+
+
Running activation benchmark on cuda with 9 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 74.624us 1850.79% 74.624us 74.624us 1
+ hf_kernels_swiglu 11.04% 191.977us 99.56% 1.732ms 1.732ms 0.000us 0.00% 5.440us 5.440us 1
+ _activation_beeaae6::silu_and_mul 1.14% 19.900us 85.86% 1.493ms 497.784us 4.032us 100.00% 5.440us 1.813us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.032us 100.00% 4.032us 1.344us 3
+ Activity Buffer Request 82.36% 1.432ms 82.36% 1.432ms 1.432ms 1.408us 34.92% 1.408us 1.408us 1
+ aten::empty 2.66% 46.201us 2.66% 46.201us 15.400us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 2.36% 41.042us 2.36% 41.042us 13.681us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.44% 7.690us 0.44% 7.690us 7.690us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.739ms
+Self CUDA time total: 4.032us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 58.016us 1462.10% 58.016us 58.016us 1
+ hf_kernels_swiglu 6.64% 105.933us 99.68% 1.591ms 1.591ms 0.000us 0.00% 5.280us 5.280us 1
+ _activation_beeaae6::silu_and_mul 1.34% 21.350us 91.75% 1.465ms 488.260us 3.968us 100.00% 5.280us 1.760us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3
+ Activity Buffer Request 88.86% 1.419ms 88.86% 1.419ms 1.419ms 1.312us 33.06% 1.312us 1.312us 1
+ aten::empty 1.30% 20.712us 1.30% 20.712us 6.904us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.56% 24.841us 1.56% 24.841us 8.280us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.32% 5.080us 0.32% 5.080us 5.080us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.597ms
+Self CUDA time total: 3.968us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.167us 1371.87% 67.167us 67.167us 1
+ hf_kernels_swiglu 6.20% 101.314us 99.65% 1.628ms 1.628ms 0.000us 0.00% 6.560us 6.560us 1
+ _activation_beeaae6::silu_and_mul 1.28% 20.850us 92.18% 1.506ms 501.997us 4.896us 100.00% 6.560us 2.187us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.896us 100.00% 4.896us 1.632us 3
+ Activity Buffer Request 89.24% 1.458ms 89.24% 1.458ms 1.458ms 1.664us 33.99% 1.664us 1.664us 1
+ aten::empty 1.26% 20.660us 1.26% 20.660us 6.887us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.67% 27.252us 1.67% 27.252us 9.084us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.35% 5.710us 0.35% 5.710us 5.710us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.634ms
+Self CUDA time total: 4.896us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.055us 1610.42% 69.055us 69.055us 1
+ hf_kernels_swiglu 5.98% 106.323us 99.73% 1.773ms 1.773ms 0.000us 0.00% 5.728us 5.728us 1
+ _activation_beeaae6::silu_and_mul 1.23% 21.902us 92.63% 1.646ms 548.829us 4.288us 100.00% 5.728us 1.909us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.288us 100.00% 4.288us 1.429us 3
+ Activity Buffer Request 80.11% 1.424ms 80.11% 1.424ms 1.424ms 1.440us 33.58% 1.440us 1.440us 1
+ aten::empty 1.11% 19.750us 1.11% 19.750us 6.583us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 11.30% 200.767us 11.30% 200.767us 66.922us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.27% 4.870us 0.27% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.777ms
+Self CUDA time total: 4.288us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 61.438us 1043.62% 61.438us 61.438us 1
+ hf_kernels_swiglu 19.33% 85.364us 98.97% 437.156us 437.156us 0.000us 0.00% 7.871us 7.871us 1
+ _activation_beeaae6::silu_and_mul 4.88% 21.551us 75.28% 332.532us 110.844us 5.887us 100.00% 7.871us 2.624us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.887us 100.00% 5.887us 1.962us 3
+ Activity Buffer Request 35.23% 155.635us 35.23% 155.635us 155.635us 1.984us 33.70% 1.984us 1.984us 1
+ aten::empty 4.36% 19.260us 4.36% 19.260us 6.420us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 35.17% 155.346us 35.17% 155.346us 51.782us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.03% 4.560us 1.03% 4.560us 4.560us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 441.716us
+Self CUDA time total: 5.887us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.160us 828.30% 64.160us 64.160us 1
+ hf_kernels_swiglu 7.42% 129.826us 99.74% 1.746ms 1.746ms 0.000us 0.00% 10.339us 10.339us 1
+ _activation_beeaae6::silu_and_mul 1.16% 20.220us 91.25% 1.597ms 532.391us 7.746us 100.00% 10.339us 3.446us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.746us 100.00% 7.746us 2.582us 3
+ Activity Buffer Request 81.29% 1.423ms 81.29% 1.423ms 1.423ms 2.593us 33.48% 2.593us 2.593us 1
+ aten::empty 1.08% 18.840us 1.08% 18.840us 6.280us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 8.81% 154.125us 8.81% 154.125us 51.375us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.26% 4.481us 0.26% 4.481us 4.481us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.750ms
+Self CUDA time total: 7.746us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.847us 1069.55% 70.847us 70.847us 1
+ hf_kernels_swiglu 6.38% 111.683us 99.73% 1.745ms 1.745ms 0.000us 0.00% 8.832us 8.832us 1
+ _activation_beeaae6::silu_and_mul 1.20% 21.011us 92.19% 1.613ms 537.758us 6.624us 100.00% 8.832us 2.944us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 100.00% 6.624us 2.208us 3
+ Activity Buffer Request 82.19% 1.438ms 82.19% 1.438ms 1.438ms 2.208us 33.33% 2.208us 2.208us 1
+ aten::empty 1.16% 20.281us 1.16% 20.281us 6.760us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 8.80% 153.915us 8.80% 153.915us 51.305us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.27% 4.700us 0.27% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.750ms
+Self CUDA time total: 6.624us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.070us 668.11% 63.070us 63.070us 1
+ hf_kernels_swiglu 18.75% 87.072us 98.86% 459.026us 459.026us 0.000us 0.00% 12.608us 12.608us 1
+ _activation_beeaae6::silu_and_mul 4.59% 21.321us 76.16% 353.653us 117.884us 9.440us 100.00% 12.608us 4.203us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.440us 100.00% 9.440us 3.147us 3
+ Activity Buffer Request 38.99% 181.046us 38.99% 181.046us 181.046us 3.168us 33.56% 3.168us 3.168us 1
+ aten::empty 3.94% 18.301us 3.94% 18.301us 6.100us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 32.58% 151.286us 32.58% 151.286us 50.429us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.14% 5.310us 1.14% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 464.336us
+Self CUDA time total: 9.440us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.326us 483.85% 63.326us 63.326us 1
+ hf_kernels_swiglu 16.17% 100.313us 99.24% 615.771us 615.771us 0.000us 0.00% 17.472us 17.472us 1
+ _activation_beeaae6::silu_and_mul 3.48% 21.570us 80.17% 497.486us 165.829us 13.088us 100.00% 17.472us 5.824us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.088us 100.00% 13.088us 4.363us 3
+ Activity Buffer Request 52.45% 325.441us 52.45% 325.441us 325.441us 4.384us 33.50% 4.384us 4.384us 1
+ aten::empty 2.90% 17.972us 2.90% 17.972us 5.991us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 24.25% 150.475us 24.25% 150.475us 50.158us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.76% 4.730us 0.76% 4.730us 4.730us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 620.501us
+Self CUDA time total: 13.088us
+
+
+impl wl p50(ms) ok
+hf_kernels_swiglu cuda_T128_D1024 0.03 True
+hf_kernels_swiglu cuda_T128_D2048 0.03 True
+hf_kernels_swiglu cuda_T128_D768 0.02 True
+hf_kernels_swiglu cuda_T256_D1024 0.03 True
+hf_kernels_swiglu cuda_T256_D2048 0.03 True
+hf_kernels_swiglu cuda_T256_D768 0.03 True
+hf_kernels_swiglu cuda_T512_D1024 0.03 True
+hf_kernels_swiglu cuda_T512_D2048 0.03 True
+hf_kernels_swiglu cuda_T512_D768 0.03 True
+
+
+
+
+Installed 15 packages in 15ms
+
+
+
Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
+Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 12.38it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 17.32it/s]
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/activation/impls/index.html b/activation/impls/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..02d457f5814d7ec7515a6c7ef12f11b92d7783cf
--- /dev/null
+++ b/activation/impls/index.html
@@ -0,0 +1,89 @@
+
+
+
+
+
+ Index of /activation/impls
+
+
+
+
+ Index of /activation/impls
+
+
+
\ No newline at end of file
diff --git a/activation/impls/torch_swiglu.html b/activation/impls/torch_swiglu.html
new file mode 100644
index 0000000000000000000000000000000000000000..61a6a78129e9c9547448c27bfc80f0f6b42b18ce
--- /dev/null
+++ b/activation/impls/torch_swiglu.html
@@ -0,0 +1,4199 @@
+
+
+
+
+
+ torch_swiglu
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
PyTorch Native - SwiGLU Activation
+
GPU Info
+
+
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+
+
+
+
+
+
Wed Oct 29 00:36:01 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
+|-----------------------------------------+------------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+========================+======================|
+| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+| N/A 29C P0 77W / 350W | 0MiB / 46068MiB | 0% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=========================================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+
+
SwiGLU Benchmark (PyTorch Native)
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+import torch, torch.nn.functional as F
+
+
+def swiglu_eager(x):
+ d = x.shape[-1] // 2
+ return F.silu(x[..., :d]) * x[..., d:]
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ACTIVATION,
+ impl_name="torch_eager",
+ impl_tags={"family":"hf-kernels", "backend":"eager"},
+ impl_func=swiglu_eager,
+)
+
+
+
+
+
+
+
Running activation benchmark on cuda with 9 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T128_D768
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 208.254us 1623.18% 208.254us 208.254us 1
+ torch_eager 11.63% 222.938us 99.53% 1.908ms 1.908ms 0.000us 0.00% 15.165us 15.165us 1
+ aten::silu 3.35% 64.173us 81.27% 1.558ms 519.434us 6.558us 51.11% 8.893us 2.964us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.558us 51.11% 6.558us 2.186us 3
+ aten::mul 2.01% 38.591us 3.22% 61.711us 20.570us 6.272us 48.89% 6.272us 2.091us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 48.89% 6.272us 2.091us 3
+ Activity Buffer Request 75.51% 1.448ms 75.51% 1.448ms 1.448ms 2.335us 18.20% 2.335us 2.335us 1
+ aten::slice 2.75% 52.771us 3.41% 65.422us 10.904us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.66% 12.651us 0.66% 12.651us 2.108us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.62% 69.391us 3.62% 69.391us 11.565us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.47% 9.050us 0.47% 9.050us 9.050us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.917ms
+Self CUDA time total: 12.830us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T128_D1024
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.071us 1219.79% 151.071us 151.071us 1
+ torch_eager 7.39% 126.424us 99.65% 1.704ms 1.704ms 0.000us 0.00% 14.561us 14.561us 1
+ aten::silu 2.37% 40.550us 87.76% 1.501ms 500.240us 6.400us 51.68% 8.576us 2.859us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.400us 51.68% 6.400us 2.133us 3
+ aten::mul 1.49% 25.470us 2.58% 44.190us 14.730us 5.985us 48.32% 5.985us 1.995us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.985us 48.32% 5.985us 1.995us 3
+ Activity Buffer Request 83.86% 1.434ms 83.86% 1.434ms 1.434ms 2.176us 17.57% 2.176us 2.176us 1
+ aten::slice 1.55% 26.493us 1.91% 32.623us 5.437us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.36% 6.130us 0.36% 6.130us 1.022us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 2.63% 44.922us 2.63% 44.922us 7.487us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.35% 5.980us 0.35% 5.980us 5.980us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.710ms
+Self CUDA time total: 12.385us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T128_D2048
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 154.943us 1178.09% 154.943us 154.943us 1
+ torch_eager 7.25% 123.104us 99.64% 1.692ms 1.692ms 0.000us 0.00% 15.424us 15.424us 1
+ aten::silu 2.33% 39.532us 87.79% 1.491ms 496.854us 6.784us 51.58% 9.056us 3.019us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.58% 6.784us 2.261us 3
+ aten::mul 1.58% 26.910us 2.71% 46.021us 15.340us 6.368us 48.42% 6.368us 2.123us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 48.42% 6.368us 2.123us 3
+ Activity Buffer Request 83.90% 1.424ms 83.90% 1.424ms 1.424ms 2.272us 17.27% 2.272us 2.272us 1
+ aten::slice 1.53% 26.021us 1.89% 32.121us 5.353us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.36% 6.100us 0.36% 6.100us 1.017us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 2.69% 45.642us 2.69% 45.642us 7.607us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.36% 6.080us 0.36% 6.080us 6.080us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.698ms
+Self CUDA time total: 13.152us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T256_D768
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 180.768us 1415.79% 180.768us 180.768us 1
+ torch_eager 7.93% 123.526us 99.68% 1.554ms 1.554ms 0.000us 0.00% 14.976us 14.976us 1
+ aten::silu 3.24% 50.441us 85.53% 1.333ms 444.348us 6.592us 51.63% 8.800us 2.933us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 51.63% 6.592us 2.197us 3
+ aten::mul 1.75% 27.260us 4.09% 63.791us 21.264us 6.176us 48.37% 6.176us 2.059us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.37% 6.176us 2.059us 3
+ Activity Buffer Request 67.46% 1.051ms 67.46% 1.051ms 1.051ms 2.208us 17.29% 2.208us 2.208us 1
+ aten::slice 1.70% 26.549us 2.13% 33.261us 5.543us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.43% 6.712us 0.43% 6.712us 1.119us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 17.18% 267.779us 17.18% 267.779us 44.630us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.32% 4.940us 0.32% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.559ms
+Self CUDA time total: 12.768us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T256_D1024
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.816us 1138.41% 150.816us 150.816us 1
+ torch_eager 6.24% 117.054us 99.74% 1.872ms 1.872ms 0.000us 0.00% 15.520us 15.520us 1
+ aten::silu 2.12% 39.802us 89.47% 1.679ms 559.729us 6.784us 51.21% 9.056us 3.019us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.21% 6.784us 2.261us 3
+ aten::mul 1.34% 25.111us 2.35% 44.062us 14.687us 6.464us 48.79% 6.464us 2.155us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.79% 6.464us 2.155us 3
+ Activity Buffer Request 75.90% 1.425ms 75.90% 1.425ms 1.425ms 2.272us 17.15% 2.272us 2.272us 1
+ aten::slice 1.36% 25.472us 1.68% 31.591us 5.265us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.33% 6.119us 0.33% 6.119us 1.020us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 12.46% 233.778us 12.46% 233.778us 38.963us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.26% 4.950us 0.26% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.877ms
+Self CUDA time total: 13.248us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T256_D2048
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 143.615us 923.45% 143.615us 143.615us 1
+ torch_eager 17.00% 110.812us 99.16% 646.262us 646.262us 0.000us 0.00% 18.240us 18.240us 1
+ aten::silu 6.35% 41.393us 70.99% 462.667us 154.222us 7.936us 51.03% 10.624us 3.541us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.03% 7.936us 2.645us 3
+ aten::mul 3.56% 23.221us 6.51% 42.412us 14.137us 7.616us 48.97% 7.616us 2.539us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.97% 7.616us 2.539us 3
+ Activity Buffer Request 32.67% 212.907us 32.67% 212.907us 212.907us 2.688us 17.28% 2.688us 2.688us 1
+ aten::slice 3.77% 24.551us 4.66% 30.371us 5.062us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.89% 5.820us 0.89% 5.820us 0.970us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 34.91% 227.558us 34.91% 227.558us 37.926us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.84% 5.490us 0.84% 5.490us 5.490us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 651.752us
+Self CUDA time total: 15.552us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T512_D768
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.197us 1080.16% 155.197us 155.197us 1
+ torch_eager 6.30% 118.195us 99.70% 1.872ms 1.872ms 0.000us 0.00% 16.864us 16.864us 1
+ aten::silu 2.16% 40.640us 89.31% 1.677ms 558.889us 7.360us 51.22% 9.856us 3.285us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 51.22% 7.360us 2.453us 3
+ aten::mul 1.39% 26.190us 2.47% 46.331us 15.444us 7.008us 48.78% 7.008us 2.336us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.78% 7.008us 2.336us 3
+ Activity Buffer Request 76.28% 1.432ms 76.28% 1.432ms 1.432ms 2.496us 17.37% 2.496us 2.496us 1
+ aten::slice 1.31% 24.671us 1.64% 30.721us 5.120us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.32% 6.050us 0.32% 6.050us 1.008us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 11.93% 224.049us 11.93% 224.049us 37.341us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.30% 5.540us 0.30% 5.540us 5.540us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.877ms
+Self CUDA time total: 14.368us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T512_D1024
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 144.252us 927.61% 144.252us 144.252us 1
+ torch_eager 18.42% 116.554us 99.16% 627.471us 627.471us 0.000us 0.00% 18.239us 18.239us 1
+ aten::silu 6.52% 41.251us 69.31% 438.595us 146.198us 7.968us 51.24% 10.656us 3.552us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.24% 7.968us 2.656us 3
+ aten::mul 3.66% 23.182us 6.58% 41.632us 13.877us 7.583us 48.76% 7.583us 2.528us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.583us 48.76% 7.583us 2.528us 3
+ Activity Buffer Request 30.96% 195.937us 30.96% 195.937us 195.937us 2.688us 17.29% 2.688us 2.688us 1
+ aten::slice 3.89% 24.640us 4.85% 30.690us 5.115us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.96% 6.050us 0.96% 6.050us 1.008us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 34.74% 219.857us 34.74% 219.857us 36.643us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.84% 5.310us 0.84% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 632.781us
+Self CUDA time total: 15.551us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_T512_D2048
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.463us 665.09% 150.463us 150.463us 1
+ torch_eager 5.93% 109.544us 99.69% 1.842ms 1.842ms 0.000us 0.00% 26.527us 26.527us 1
+ aten::silu 2.24% 41.413us 89.69% 1.657ms 552.422us 11.584us 51.20% 15.488us 5.163us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.584us 51.20% 11.584us 3.861us 3
+ aten::mul 1.32% 24.310us 2.35% 43.432us 14.477us 11.039us 48.80% 11.039us 3.680us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.039us 48.80% 11.039us 3.680us 3
+ Activity Buffer Request 76.49% 1.413ms 76.49% 1.413ms 1.413ms 3.904us 17.26% 3.904us 3.904us 1
+ aten::slice 1.39% 25.640us 1.72% 31.740us 5.290us 0.000us 0.00% 0.000us 0.000us 6
+ aten::as_strided 0.33% 6.100us 0.33% 6.100us 1.017us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 12.00% 221.728us 12.00% 221.728us 36.955us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.31% 5.690us 0.31% 5.690us 5.690us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.848ms
+Self CUDA time total: 22.623us
+
+
+impl wl p50(ms) ok
+torch_eager cuda_T128_D1024 0.05 True
+torch_eager cuda_T128_D2048 0.05 True
+torch_eager cuda_T128_D768 0.04 True
+torch_eager cuda_T256_D1024 0.05 True
+torch_eager cuda_T256_D2048 0.05 True
+torch_eager cuda_T256_D768 0.05 True
+torch_eager cuda_T512_D1024 0.05 True
+torch_eager cuda_T512_D2048 0.05 True
+torch_eager cuda_T512_D768 0.05 True
+
+
+
+
+Installed 37 packages in 235ms
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/activation/index.html b/activation/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..ddb801226b9a0aa8d81788bef013e946eb8554ed
--- /dev/null
+++ b/activation/index.html
@@ -0,0 +1,89 @@
+
+
+
+
+
+ Index of /activation
+
+
+
+
+ Index of /activation
+
+
+
\ No newline at end of file
diff --git a/activation/results/artifacts/combine/latency.svg b/activation/results/artifacts/combine/latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..cc4b0932cb850954c8d6b0adb27feeabe3e3f7f4
--- /dev/null
+++ b/activation/results/artifacts/combine/latency.svg
@@ -0,0 +1,318 @@
+
+
\ No newline at end of file
diff --git a/activation/results/cells/combine.py b/activation/results/cells/combine.py
new file mode 100644
index 0000000000000000000000000000000000000000..c27fc2d1dc911098e8feb19f8e4a7ed33d851a12
--- /dev/null
+++ b/activation/results/cells/combine.py
@@ -0,0 +1,27 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+ "HF Kernels SwiGLU": "UVNOTE_FILE_HF_KERNELS_SWIGLU_BENCHMARK",
+ "PyTorch SwiGLU": "UVNOTE_FILE_TORCH_SWIGLU_BENCHMARK",
+ # "Compiled SwiGLU": "UVNOTE_FILE_COMPILED_SWIGLU_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+ cache_env_map=cache_env_map,
+ output_filename="activation.jsonl",
+ svg_filename="latency.svg"
+)
\ No newline at end of file
diff --git a/activation/results/combined_results.html b/activation/results/combined_results.html
new file mode 100644
index 0000000000000000000000000000000000000000..432464a5a72bec48d82e3ed28b902538354c87bf
--- /dev/null
+++ b/activation/results/combined_results.html
@@ -0,0 +1,4654 @@
+
+
+
+
+
+ SwiGLU Activation Benchmark - Combined Results
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
SwiGLU Activation Benchmarks - Aggregated Results
+
This document combines benchmark results from multiple SwiGLU activation implementations.
+
Combined Summary and Visualization
+
+
+
+
+
+
+
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+ "HF Kernels SwiGLU": "UVNOTE_FILE_HF_KERNELS_SWIGLU_BENCHMARK",
+ "PyTorch SwiGLU": "UVNOTE_FILE_TORCH_SWIGLU_BENCHMARK",
+ # "Compiled SwiGLU": "UVNOTE_FILE_COMPILED_SWIGLU_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+ cache_env_map=cache_env_map,
+ output_filename="activation.jsonl",
+ svg_filename="latency.svg"
+)
+
+
+
+
+
+
+
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ HF Kernels SwiGLU : /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/2775e6386f1caf1fda935a997130c06dcaf7641efb0db21560c35301fdabfd9b
+✓ PyTorch SwiGLU : /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/661ca38adec8893d7c284140e922da661f0afcea4aaff6a3bf48a6494ce7c6eb
+
+ ✓ Found HF Kernels SwiGLU
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/2775e6386f1caf1fda935a997130c06dcaf7641efb0db21560c35301fdabfd9b/activation.jsonl
+ ✓ Found PyTorch SwiGLU
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/661ca38adec8893d7c284140e922da661f0afcea4aaff6a3bf48a6494ce7c6eb/activation.jsonl
+
+======================================================================
+Summary: 2 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl wl p50(ms) ok
+hf_kernels_swiglu cuda_T128_D1024 0.03 True
+hf_kernels_swiglu cuda_T128_D2048 0.03 True
+hf_kernels_swiglu cuda_T128_D768 0.02 True
+hf_kernels_swiglu cuda_T256_D1024 0.03 True
+hf_kernels_swiglu cuda_T256_D2048 0.03 True
+hf_kernels_swiglu cuda_T256_D768 0.03 True
+hf_kernels_swiglu cuda_T512_D1024 0.03 True
+hf_kernels_swiglu cuda_T512_D2048 0.03 True
+hf_kernels_swiglu cuda_T512_D768 0.03 True
+torch_eager cuda_T128_D1024 0.05 True
+torch_eager cuda_T128_D2048 0.05 True
+torch_eager cuda_T128_D768 0.04 True
+torch_eager cuda_T256_D1024 0.05 True
+torch_eager cuda_T256_D2048 0.05 True
+torch_eager cuda_T256_D768 0.05 True
+torch_eager cuda_T512_D1024 0.05 True
+torch_eager cuda_T512_D2048 0.05 True
+torch_eager cuda_T512_D768 0.05 True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 18 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 2
+
+Implementations included:
+ ✓ HF Kernels SwiGLU
+ ✓ PyTorch SwiGLU
+
+
+
+
+Installed 37 packages in 208ms
+
+
+
+
Artifacts:
+
latency.svg
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/activation/results/index.html b/activation/results/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..5c60fe94ab1a86a4d9f299448a7d8a5b85027447
--- /dev/null
+++ b/activation/results/index.html
@@ -0,0 +1,88 @@
+
+
+
+
+
+ Index of /activation/results
+
+
+
+
+ Index of /activation/results
+
+
+
\ No newline at end of file
diff --git a/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..42d266f1bc1ac47e5d0d53656b57da93664a039f
--- /dev/null
+++ b/causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
@@ -0,0 +1,24 @@
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.046111000017390325, "p50": 0.046270999973785365, "p90": 0.04740100001754399, "mean": 0.04670720001058726, "iqr": 0.001160000010713702, "raw_times": [0.047512000037386315, 0.04740100001754399, 0.04624100000683029, 0.046270999973785365, 0.046111000017390325], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05871199999774035, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05225199998903918, "p50": 0.053462000039417035, "p90": 0.053592000028857, "mean": 0.05365380001194353, "iqr": 0.0002100000529026147, "raw_times": [0.053462000039417035, 0.055581000026450056, 0.053592000028857, 0.053381999975954386, 0.05225199998903918], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0581319999923835, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05121200001667603, "p50": 0.05470199999990655, "p90": 0.05482099999198908, "mean": 0.05431980000594194, "iqr": 0.0013289999856169743, "raw_times": [0.05121200001667603, 0.057372000014765945, 0.05470199999990655, 0.05482099999198908, 0.05349200000637211], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056541999981618574, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05210199998373355, "p50": 0.05333199999313365, "p90": 0.05396199998131124, "mean": 0.05322599998862643, "iqr": 0.0016399999935856613, "raw_times": [0.05210199998373355, 0.05333199999313365, 0.05396199998131124, 0.052321999987725576, 0.05441199999722812], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09094299997514099, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05103099999814731, "p50": 0.05309199997327596, "p90": 0.053381999975954386, "mean": 0.05291379998197954, "iqr": 0.0004199999921183917, "raw_times": [0.053381999975954386, 0.052961999983835994, 0.05103099999814731, 0.05309199997327596, 0.054101999978684034], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05603199997494812, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051181999992877536, "p50": 0.05189199998767435, "p90": 0.05201199996918149, "mean": 0.052023999978700886, "iqr": 0.0004999999987376214, "raw_times": [0.05151199997044387, 0.05352199997332718, 0.05189199998767435, 0.05201199996918149, 0.051181999992877536], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055981999992127385, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05042200001525998, "p50": 0.052002000018092076, "p90": 0.05382199998393844, "mean": 0.05366420000427752, "iqr": 0.00333999997792489, "raw_times": [0.05048200000601355, 0.05042200001525998, 0.052002000018092076, 0.06159299999808354, 0.05382199998393844], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05433199999060889, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0522220000220841, "p50": 0.053632000003744906, "p90": 0.05870200004665094, "mean": 0.056078200009324064, "iqr": 0.005690000079994206, "raw_times": [0.0522220000220841, 0.06282300000748364, 0.053632000003744906, 0.05301199996665673, 0.05870200004665094], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.055741999972269696, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05032100000335049, "p50": 0.050921000024573004, "p90": 0.05318199998782802, "mean": 0.05303959999309882, "iqr": 0.0023800000121809717, "raw_times": [0.05080199997564705, 0.050921000024573004, 0.05032100000335049, 0.059971999974095525, 0.05318199998782802], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0550720000092042, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05211199999166638, "p50": 0.05235200001152407, "p90": 0.053132000005007285, "mean": 0.05707820000679931, "iqr": 0.0008700000080352766, "raw_times": [0.05235200001152407, 0.05226199999697201, 0.053132000005007285, 0.07553300002882679, 0.05211199999166638], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05610199997363452, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0512720000074296, "p50": 0.0524320000181433, "p90": 0.05278200001157529, "mean": 0.05529400000341411, "iqr": 0.000919999990856013, "raw_times": [0.05278200001157529, 0.0524320000181433, 0.0512720000074296, 0.0681219999592031, 0.05186200002071928], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05547199998545693, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05112100001269937, "p50": 0.051342000006115995, "p90": 0.05172099997707846, "mean": 0.053885599993463984, "iqr": 0.00040899999476096127, "raw_times": [0.05112100001269937, 0.06393199998910859, 0.05172099997707846, 0.0513119999823175, 0.051342000006115995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055091999968226446, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050531999988834286, "p50": 0.05176199999823439, "p90": 0.051821999988987955, "mean": 0.05163600000059887, "iqr": 0.0003099999617006688, "raw_times": [0.050531999988834286, 0.05176199999823439, 0.052551999999650434, 0.051821999988987955, 0.051512000027287286], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055182000039621926, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05124200004047452, "p50": 0.05148200000348879, "p90": 0.05251200002476253, "mean": 0.051918000008299714, "iqr": 0.0011100000278929656, "raw_times": [0.05251200002476253, 0.05295199997590316, 0.05148200000348879, 0.05140199999686956, 0.05124200004047452], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05506200000127137, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05025200005093211, "p50": 0.05105200000343757, "p90": 0.05146199998762313, "mean": 0.05136380001431462, "iqr": 0.0005399999736255268, "raw_times": [0.05146199998762313, 0.053131000015582686, 0.050922000013997604, 0.05025200005093211, 0.05105200000343757], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0684330000240152, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051181999992877536, "p50": 0.052152000023397704, "p90": 0.05241200000227764, "mean": 0.05240600000888662, "iqr": 0.00034999999343199306, "raw_times": [0.052152000023397704, 0.05422200001703459, 0.05241200000227764, 0.051181999992877536, 0.052062000008845644], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05490099999860831, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05016099999011203, "p50": 0.05225199998903918, "p90": 0.05251199996791911, "mean": 0.05182779999586273, "iqr": 0.001349999934063817, "raw_times": [0.05016099999011203, 0.053051999998388055, 0.05116200003385529, 0.05251199996791911, 0.05225199998903918], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05627199999480581, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05154100000481776, "p50": 0.0524320000181433, "p90": 0.05299099996136647, "mean": 0.05266959998380116, "iqr": 0.0006189999908201571, "raw_times": [0.05154100000481776, 0.054011999964131974, 0.05299099996136647, 0.0524320000181433, 0.05237199997054631], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05572200001324745, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05098199994790775, "p50": 0.05128100002593783, "p90": 0.052071999959935056, "mean": 0.05161159999715892, "iqr": 0.0008409999168179638, "raw_times": [0.05098199994790775, 0.052071999959935056, 0.05128100002593783, 0.05123100004311709, 0.052492000008896866], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055401999986770534, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050202000011267955, "p50": 0.05295199997590316, "p90": 0.05307200001425372, "mean": 0.052619999996750266, "iqr": 0.00046000002384971594, "raw_times": [0.050202000011267955, 0.05307200001425372, 0.054261999991922494, 0.05295199997590316, 0.052611999990404], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05440200004613871, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05220100001679384, "p50": 0.052891999985149596, "p90": 0.05323199997064876, "mean": 0.05431980000594194, "iqr": 0.0007509999591093219, "raw_times": [0.05220100001679384, 0.052891999985149596, 0.05323199997064876, 0.06079300004557808, 0.052481000011539436], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0552820000052634, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05108200002723606, "p50": 0.05157200001804085, "p90": 0.053041000001030625, "mean": 0.051985800007514626, "iqr": 0.0018490000002202578, "raw_times": [0.05157200001804085, 0.05108200002723606, 0.053041000001030625, 0.05119200000081037, 0.053041999990455224], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05657200000541707, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05095099999152808, "p50": 0.0515919999770631, "p90": 0.05208099997844329, "mean": 0.05173159999003474, "iqr": 0.0006789999815737247, "raw_times": [0.0515919999770631, 0.05208099997844329, 0.052632000006269664, 0.05095099999152808, 0.05140199999686956], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056392000033156364, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05110099999683371, "p50": 0.051662000032592914, "p90": 0.051741999982368725, "mean": 0.05161380000799909, "iqr": 0.00010999997357430402, "raw_times": [0.05163200000879442, 0.05110099999683371, 0.051741999982368725, 0.051662000032592914, 0.05193200001940568], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05588200002648591, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
diff --git a/causal_conv1d/impls/cells/benchmark.py b/causal_conv1d/impls/cells/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..725b12c4018e4eec05c5ddccb0c88a8eae6f150d
--- /dev/null
+++ b/causal_conv1d/impls/cells/benchmark.py
@@ -0,0 +1,31 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the causal conv1d kernel
+causal_conv1d = get_kernel("kernels-community/causal-conv1d")
+
+
+def hf_kernels_causal_conv1d(input_tensor, weight, bias):
+ return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
+ impl_name="hf_kernels_causal_conv1d",
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
+ impl_func=hf_kernels_causal_conv1d,
+)
\ No newline at end of file
diff --git a/causal_conv1d/impls/cells/nv.py b/causal_conv1d/impls/cells/nv.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5
--- /dev/null
+++ b/causal_conv1d/impls/cells/nv.py
@@ -0,0 +1,2 @@
+import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
\ No newline at end of file
diff --git a/causal_conv1d/impls/hf_kernels_causal_conv1d.html b/causal_conv1d/impls/hf_kernels_causal_conv1d.html
new file mode 100644
index 0000000000000000000000000000000000000000..4299d818b5235441dc2b29a2515c8e5718a75f01
--- /dev/null
+++ b/causal_conv1d/impls/hf_kernels_causal_conv1d.html
@@ -0,0 +1,4542 @@
+
+
+
+
+
+ hf_kernels_causal_conv1d
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
HF Kernels - Causal Conv1D
+
GPU Info
+
+
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+
+
+
+
+
+
Wed Oct 29 00:36:08 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
+|-----------------------------------------+------------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+========================+======================|
+| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+| N/A 30C P0 87W / 350W | 0MiB / 46068MiB | 18% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=========================================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+
+
Causal Conv1D Benchmark
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the causal conv1d kernel
+causal_conv1d = get_kernel("kernels-community/causal-conv1d")
+
+
+def hf_kernels_causal_conv1d(input_tensor, weight, bias):
+ return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
+ impl_name="hf_kernels_causal_conv1d",
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
+ impl_func=hf_kernels_causal_conv1d,
+)
+
+
+
+
+
+
+
Running causal_conv1d benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 144.543us 3556.67% 144.543us 144.543us 1
+ hf_kernels_causal_conv1d 8.86% 163.685us 99.64% 1.842ms 1.842ms 0.000us 0.00% 5.504us 5.504us 1
+ CausalConv1dFn 5.76% 106.513us 90.78% 1.678ms 559.289us 0.000us 0.00% 5.504us 1.835us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 2.24% 41.381us 81.42% 1.505ms 501.611us 4.064us 100.00% 5.504us 1.835us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.064us 100.00% 4.064us 1.355us 3
+ Activity Buffer Request 76.78% 1.419ms 76.78% 1.419ms 1.419ms 1.440us 35.43% 1.440us 1.440us 1
+ aten::empty_like 0.97% 17.931us 3.60% 66.522us 22.174us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 2.63% 48.591us 2.63% 48.591us 16.197us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 2.40% 44.403us 2.40% 44.403us 14.801us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.36% 6.650us 0.36% 6.650us 6.650us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.848ms
+Self CUDA time total: 4.064us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 128.574us 3378.19% 128.574us 128.574us 1
+ hf_kernels_causal_conv1d 6.39% 108.804us 99.67% 1.696ms 1.696ms 0.000us 0.00% 5.085us 5.085us 1
+ CausalConv1dFn 4.62% 78.561us 93.27% 1.588ms 529.188us 0.000us 0.00% 5.085us 1.695us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.51% 25.693us 86.85% 1.478ms 492.734us 3.806us 100.00% 5.085us 1.695us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.806us 100.00% 3.806us 1.269us 3
+ Activity Buffer Request 83.54% 1.422ms 83.54% 1.422ms 1.422ms 1.279us 33.60% 1.279us 1.279us 1
+ aten::empty_like 0.47% 8.001us 1.81% 30.802us 10.267us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.34% 22.801us 1.34% 22.801us 7.600us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.80% 30.601us 1.80% 30.601us 10.200us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.33% 5.681us 0.33% 5.681us 5.681us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.702ms
+Self CUDA time total: 3.806us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 126.111us 3339.80% 126.111us 126.111us 1
+ hf_kernels_causal_conv1d 5.63% 95.933us 99.70% 1.698ms 1.698ms 0.000us 0.00% 5.056us 5.056us 1
+ CausalConv1dFn 4.46% 75.892us 94.07% 1.602ms 534.022us 0.000us 0.00% 5.056us 1.685us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.40% 23.785us 87.77% 1.495ms 498.271us 3.776us 100.00% 5.056us 1.685us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.776us 100.00% 3.776us 1.259us 3
+ Activity Buffer Request 84.61% 1.441ms 84.61% 1.441ms 1.441ms 1.280us 33.90% 1.280us 1.280us 1
+ aten::empty_like 0.49% 8.320us 1.84% 31.360us 10.453us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.35% 23.040us 1.35% 23.040us 7.680us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.77% 30.070us 1.77% 30.070us 10.023us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.30% 5.061us 0.30% 5.061us 5.061us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.703ms
+Self CUDA time total: 3.776us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 128.703us 3352.51% 128.703us 128.703us 1
+ hf_kernels_causal_conv1d 5.02% 93.431us 99.72% 1.856ms 1.856ms 0.000us 0.00% 5.119us 5.119us 1
+ CausalConv1dFn 4.18% 77.825us 94.70% 1.762ms 587.414us 0.000us 0.00% 5.119us 1.706us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.36% 25.311us 88.83% 1.653ms 551.005us 3.839us 100.00% 5.119us 1.706us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.839us 100.00% 3.839us 1.280us 3
+ Activity Buffer Request 76.83% 1.430ms 76.83% 1.430ms 1.430ms 1.280us 33.34% 1.280us 1.280us 1
+ aten::empty_like 0.45% 8.401us 1.69% 31.401us 10.467us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.24% 23.000us 1.24% 23.000us 7.667us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 10.65% 198.147us 10.65% 198.147us 66.049us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.28% 5.120us 0.28% 5.120us 5.120us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.861ms
+Self CUDA time total: 3.839us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 125.247us 2609.31% 125.247us 125.247us 1
+ hf_kernels_causal_conv1d 5.46% 99.082us 99.73% 1.809ms 1.809ms 0.000us 0.00% 6.432us 6.432us 1
+ CausalConv1dFn 4.18% 75.835us 94.27% 1.709ms 569.830us 0.000us 0.00% 6.432us 2.144us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.40% 25.379us 88.34% 1.602ms 533.975us 4.800us 100.00% 6.432us 2.144us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.800us 100.00% 4.800us 1.600us 3
+ Activity Buffer Request 77.97% 1.414ms 77.97% 1.414ms 1.414ms 1.632us 34.00% 1.632us 1.632us 1
+ aten::empty_like 0.46% 8.420us 1.75% 31.730us 10.577us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.29% 23.310us 1.29% 23.310us 7.770us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 8.97% 162.627us 8.97% 162.627us 54.209us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.27% 4.860us 0.27% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.813ms
+Self CUDA time total: 4.800us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D64_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.190us 2487.38% 120.190us 120.190us 1
+ hf_kernels_causal_conv1d 14.45% 80.914us 99.14% 554.970us 554.970us 0.000us 0.00% 6.464us 6.464us 1
+ CausalConv1dFn 12.94% 72.432us 84.69% 474.056us 158.019us 0.000us 0.00% 6.464us 2.155us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.57% 25.572us 66.53% 372.404us 124.135us 4.832us 100.00% 6.464us 2.155us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.832us 100.00% 4.832us 1.611us 3
+ Activity Buffer Request 34.22% 191.566us 34.22% 191.566us 191.566us 1.632us 33.77% 1.632us 1.632us 1
+ aten::empty_like 1.40% 7.860us 5.22% 29.220us 9.740us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.82% 21.360us 3.82% 21.360us 7.120us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 27.74% 155.266us 27.74% 155.266us 51.755us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.86% 4.800us 0.86% 4.800us 4.800us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 559.770us
+Self CUDA time total: 4.832us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 133.279us 1243.27% 133.279us 133.279us 1
+ hf_kernels_causal_conv1d 5.54% 100.182us 99.73% 1.805ms 1.805ms 0.000us 0.00% 14.336us 14.336us 1
+ CausalConv1dFn 4.54% 82.173us 94.20% 1.705ms 568.267us 0.000us 0.00% 14.336us 4.779us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.47% 26.531us 87.96% 1.592ms 530.609us 10.720us 100.00% 14.336us 4.779us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.720us 100.00% 10.720us 3.573us 3
+ Activity Buffer Request 77.82% 1.408ms 77.82% 1.408ms 1.408ms 3.616us 33.73% 3.616us 3.616us 1
+ aten::empty_like 0.46% 8.260us 1.70% 30.801us 10.267us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.25% 22.541us 1.25% 22.541us 7.514us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 8.67% 156.947us 8.67% 156.947us 52.316us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.27% 4.830us 0.27% 4.830us 4.830us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.810ms
+Self CUDA time total: 10.720us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 123.037us 1127.54% 123.037us 123.037us 1
+ hf_kernels_causal_conv1d 20.63% 102.765us 99.04% 493.397us 493.397us 0.000us 0.00% 14.592us 14.592us 1
+ CausalConv1dFn 14.78% 73.650us 78.41% 390.632us 130.211us 0.000us 0.00% 14.592us 4.864us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.23% 26.041us 57.43% 286.091us 95.364us 10.912us 100.00% 14.592us 4.864us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 10.912us 100.00% 10.912us 3.637us 3
+ Activity Buffer Request 21.15% 105.364us 21.15% 105.364us 105.364us 3.680us 33.72% 3.680us 3.680us 1
+ aten::empty_like 1.51% 7.510us 6.20% 30.891us 10.297us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.69% 23.381us 4.69% 23.381us 7.794us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 31.05% 154.686us 31.05% 154.686us 51.562us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.96% 4.790us 0.96% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 498.187us
+Self CUDA time total: 10.912us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 130.944us 1189.53% 130.944us 130.944us 1
+ hf_kernels_causal_conv1d 5.42% 97.593us 99.72% 1.796ms 1.796ms 0.000us 0.00% 14.720us 14.720us 1
+ CausalConv1dFn 4.08% 73.404us 94.31% 1.699ms 566.233us 0.000us 0.00% 14.720us 4.907us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.44% 26.001us 88.45% 1.593ms 531.068us 11.008us 100.00% 14.720us 4.907us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 11.008us 100.00% 11.008us 3.669us 3
+ Activity Buffer Request 78.36% 1.411ms 78.36% 1.411ms 1.411ms 3.712us 33.72% 3.712us 3.712us 1
+ aten::empty_like 0.46% 8.350us 1.78% 32.090us 10.697us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.32% 23.740us 1.32% 23.740us 7.913us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 8.65% 155.786us 8.65% 155.786us 51.929us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.28% 4.990us 0.28% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.801ms
+Self CUDA time total: 11.008us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 122.014us 1080.15% 122.014us 122.014us 1
+ hf_kernels_causal_conv1d 12.40% 73.852us 99.19% 590.511us 590.511us 0.000us 0.00% 15.104us 15.104us 1
+ CausalConv1dFn 12.35% 73.524us 86.78% 516.659us 172.220us 0.000us 0.00% 15.104us 5.035us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.03% 24.020us 69.45% 413.474us 137.825us 11.296us 100.00% 15.104us 5.035us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 11.296us 100.00% 11.296us 3.765us 3
+ Activity Buffer Request 38.81% 231.068us 38.81% 231.068us 231.068us 3.808us 33.71% 3.808us 3.808us 1
+ aten::empty_like 1.25% 7.459us 4.98% 29.661us 9.887us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.73% 22.202us 3.73% 22.202us 7.401us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 26.60% 158.386us 26.60% 158.386us 52.795us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.81% 4.840us 0.81% 4.840us 4.840us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 595.351us
+Self CUDA time total: 11.296us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 135.582us 269.70% 135.582us 135.582us 1
+ hf_kernels_causal_conv1d 12.51% 76.722us 99.20% 608.371us 608.371us 0.000us 0.00% 83.711us 83.711us 1
+ CausalConv1dFn 13.24% 81.202us 86.69% 531.649us 177.216us 0.000us 0.00% 83.711us 27.904us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.12% 25.291us 68.50% 420.085us 140.028us 50.271us 100.00% 83.711us 27.904us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 50.271us 100.00% 50.271us 16.757us 3
+ Activity Buffer Request 38.84% 238.229us 38.84% 238.229us 238.229us 33.440us 66.52% 33.440us 33.440us 1
+ aten::empty_like 1.27% 7.790us 4.95% 30.362us 10.121us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.68% 22.572us 3.68% 22.572us 7.524us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 25.53% 156.565us 25.53% 156.565us 52.188us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.80% 4.910us 0.80% 4.910us 4.910us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 613.281us
+Self CUDA time total: 50.271us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B2_D2048_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 127.740us 248.41% 127.740us 127.740us 1
+ hf_kernels_causal_conv1d 15.37% 77.574us 99.04% 499.998us 499.998us 0.000us 0.00% 85.854us 85.854us 1
+ CausalConv1dFn 14.63% 73.842us 83.68% 422.424us 140.808us 0.000us 0.00% 85.854us 28.618us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.23% 26.412us 63.27% 319.402us 106.467us 51.423us 100.00% 85.854us 28.618us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 51.423us 100.00% 51.423us 17.141us 3
+ Activity Buffer Request 27.23% 137.484us 27.23% 137.484us 137.484us 34.431us 66.96% 34.431us 34.431us 1
+ aten::empty_like 1.41% 7.140us 5.78% 29.180us 9.727us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.37% 22.040us 4.37% 22.040us 7.347us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 30.80% 155.506us 30.80% 155.506us 51.835us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.96% 4.831us 0.96% 4.831us 4.831us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 504.829us
+Self CUDA time total: 51.423us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 117.437us 3008.12% 117.437us 117.437us 1
+ hf_kernels_causal_conv1d 12.18% 74.242us 99.17% 604.340us 604.340us 0.000us 0.00% 5.152us 5.152us 1
+ CausalConv1dFn 11.66% 71.062us 86.99% 530.098us 176.699us 0.000us 0.00% 5.152us 1.717us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.18% 25.499us 70.51% 429.675us 143.225us 3.904us 100.00% 5.152us 1.717us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.904us 100.00% 3.904us 1.301us 3
+ Activity Buffer Request 41.02% 249.979us 41.02% 249.979us 249.979us 1.248us 31.97% 1.248us 1.248us 1
+ aten::empty_like 1.33% 8.110us 4.82% 29.361us 9.787us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.49% 21.251us 3.49% 21.251us 7.084us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 25.30% 154.197us 25.30% 154.197us 51.399us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.83% 5.050us 0.83% 5.050us 5.050us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 609.390us
+Self CUDA time total: 3.904us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 157.920us 4012.20% 157.920us 157.920us 1
+ hf_kernels_causal_conv1d 19.90% 106.583us 99.11% 530.709us 530.709us 0.000us 0.00% 5.216us 5.216us 1
+ CausalConv1dFn 15.55% 83.245us 79.21% 424.126us 141.375us 0.000us 0.00% 5.216us 1.739us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.02% 26.862us 57.76% 309.281us 103.094us 3.936us 100.00% 5.216us 1.739us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 3.936us 100.00% 3.936us 1.312us 3
+ Activity Buffer Request 22.25% 119.154us 22.25% 119.154us 119.154us 1.280us 32.52% 1.280us 1.280us 1
+ aten::empty_like 1.55% 8.320us 5.90% 31.600us 10.533us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.35% 23.280us 4.35% 23.280us 7.760us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 30.49% 163.265us 30.49% 163.265us 54.422us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.89% 4.750us 0.89% 4.750us 4.750us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 535.459us
+Self CUDA time total: 3.936us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 123.777us 2998.47% 123.777us 123.777us 1
+ hf_kernels_causal_conv1d 13.54% 78.054us 99.15% 571.700us 571.700us 0.000us 0.00% 5.504us 5.504us 1
+ CausalConv1dFn 12.84% 74.051us 85.62% 493.646us 164.549us 0.000us 0.00% 5.504us 1.835us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.19% 24.152us 67.43% 388.784us 129.595us 4.128us 100.00% 5.504us 1.835us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.128us 100.00% 4.128us 1.376us 3
+ Activity Buffer Request 36.14% 208.368us 36.14% 208.368us 208.368us 1.376us 33.33% 1.376us 1.376us 1
+ aten::empty_like 1.37% 7.901us 5.34% 30.811us 10.270us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.97% 22.910us 3.97% 22.910us 7.637us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 27.10% 156.264us 27.10% 156.264us 52.088us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.85% 4.881us 0.85% 4.881us 4.881us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 576.581us
+Self CUDA time total: 4.128us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 118.875us 2925.07% 118.875us 118.875us 1
+ hf_kernels_causal_conv1d 17.67% 83.134us 98.92% 465.527us 465.527us 0.000us 0.00% 5.440us 5.440us 1
+ CausalConv1dFn 15.04% 70.762us 81.26% 382.393us 127.464us 0.000us 0.00% 5.440us 1.813us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 4.98% 23.432us 59.87% 281.731us 93.910us 4.064us 100.00% 5.440us 1.813us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 4.064us 100.00% 4.064us 1.355us 3
+ Activity Buffer Request 22.07% 103.873us 22.07% 103.873us 103.873us 1.376us 33.86% 1.376us 1.376us 1
+ aten::empty_like 1.61% 7.590us 6.35% 29.900us 9.967us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.74% 22.310us 4.74% 22.310us 7.437us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 32.82% 154.426us 32.82% 154.426us 51.475us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.08% 5.061us 1.08% 5.061us 5.061us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 470.588us
+Self CUDA time total: 4.064us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 126.722us 2343.23% 126.722us 126.722us 1
+ hf_kernels_causal_conv1d 12.92% 104.393us 99.42% 803.188us 803.188us 0.000us 0.00% 7.264us 7.264us 1
+ CausalConv1dFn 9.39% 75.863us 86.50% 698.795us 232.932us 0.000us 0.00% 7.264us 2.421us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 3.09% 24.969us 73.13% 590.770us 196.923us 5.408us 100.00% 7.264us 2.421us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.408us 100.00% 5.408us 1.803us 3
+ Activity Buffer Request 49.73% 401.794us 49.73% 401.794us 401.794us 1.856us 34.32% 1.856us 1.856us 1
+ aten::empty_like 0.96% 7.780us 3.98% 32.162us 10.721us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 3.02% 24.382us 3.02% 24.382us 8.127us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 20.30% 164.007us 20.30% 164.007us 54.669us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.58% 4.700us 0.58% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 807.888us
+Self CUDA time total: 5.408us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D64_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 120.446us 2201.13% 120.446us 120.446us 1
+ hf_kernels_causal_conv1d 18.67% 89.551us 99.02% 474.966us 474.966us 0.000us 0.00% 7.328us 7.328us 1
+ CausalConv1dFn 15.56% 74.654us 80.35% 385.415us 128.472us 0.000us 0.00% 7.328us 2.443us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.05% 24.231us 58.47% 280.459us 93.486us 5.472us 100.00% 7.328us 2.443us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 5.472us 100.00% 5.472us 1.824us 3
+ Activity Buffer Request 20.97% 100.573us 20.97% 100.573us 100.573us 1.856us 33.92% 1.856us 1.856us 1
+ aten::empty_like 1.52% 7.312us 6.32% 30.302us 10.101us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.79% 22.990us 4.79% 22.990us 7.663us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 32.45% 155.655us 32.45% 155.655us 51.885us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.98% 4.720us 0.98% 4.720us 4.720us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 479.686us
+Self CUDA time total: 5.472us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 130.208us 742.52% 130.208us 130.208us 1
+ hf_kernels_causal_conv1d 5.57% 103.684us 99.74% 1.855ms 1.855ms 0.000us 0.00% 23.424us 23.424us 1
+ CausalConv1dFn 4.08% 75.922us 94.16% 1.751ms 583.780us 0.000us 0.00% 23.424us 7.808us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.22% 22.672us 88.43% 1.645ms 548.249us 17.536us 100.00% 23.424us 7.808us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.536us 100.00% 17.536us 5.845us 3
+ Activity Buffer Request 78.77% 1.465ms 78.77% 1.465ms 1.465ms 5.888us 33.58% 5.888us 5.888us 1
+ aten::empty_like 0.43% 7.931us 1.65% 30.671us 10.224us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.22% 22.740us 1.22% 22.740us 7.580us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 8.44% 157.016us 8.44% 157.016us 52.339us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.26% 4.860us 0.26% 4.860us 4.860us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.860ms
+Self CUDA time total: 17.536us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 123.970us 691.76% 123.970us 123.970us 1
+ hf_kernels_causal_conv1d 18.87% 88.734us 98.86% 464.856us 464.856us 0.000us 0.00% 23.905us 23.905us 1
+ CausalConv1dFn 15.17% 71.352us 79.99% 376.122us 125.374us 0.000us 0.00% 23.905us 7.968us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.25% 24.691us 58.28% 274.030us 91.343us 17.921us 100.00% 23.905us 7.968us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 17.921us 100.00% 17.921us 5.974us 3
+ Activity Buffer Request 19.83% 93.233us 19.83% 93.233us 93.233us 5.984us 33.39% 5.984us 5.984us 1
+ aten::empty_like 1.60% 7.540us 6.54% 30.740us 10.247us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.93% 23.200us 4.93% 23.200us 7.733us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 33.20% 156.106us 33.20% 156.106us 52.035us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.14% 5.350us 1.14% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 470.206us
+Self CUDA time total: 17.921us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 130.879us 726.50% 130.879us 130.879us 1
+ hf_kernels_causal_conv1d 5.43% 99.212us 99.73% 1.824ms 1.824ms 0.000us 0.00% 24.063us 24.063us 1
+ CausalConv1dFn 4.16% 76.013us 94.31% 1.725ms 574.860us 0.000us 0.00% 24.063us 8.021us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.28% 23.352us 88.43% 1.617ms 539.055us 18.015us 100.00% 24.063us 8.021us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 18.015us 100.00% 18.015us 6.005us 3
+ Activity Buffer Request 78.67% 1.439ms 78.67% 1.439ms 1.439ms 6.048us 33.57% 6.048us 6.048us 1
+ aten::empty_like 0.41% 7.570us 1.72% 31.401us 10.467us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.30% 23.831us 1.30% 23.831us 7.944us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 8.49% 155.235us 8.49% 155.235us 51.745us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.27% 4.890us 0.27% 4.890us 4.890us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.829ms
+Self CUDA time total: 18.015us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 123.645us 665.08% 123.645us 123.645us 1
+ hf_kernels_causal_conv1d 22.59% 109.155us 99.05% 478.537us 478.537us 0.000us 0.00% 24.830us 24.830us 1
+ CausalConv1dFn 15.84% 76.521us 76.45% 369.382us 123.127us 0.000us 0.00% 24.830us 8.277us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.13% 24.791us 54.61% 263.860us 87.953us 18.591us 100.00% 24.830us 8.277us 3
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 18.591us 100.00% 18.591us 6.197us 3
+ Activity Buffer Request 17.60% 85.023us 17.60% 85.023us 85.023us 6.239us 33.56% 6.239us 6.239us 1
+ aten::empty_like 1.53% 7.411us 6.00% 29.001us 9.667us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.47% 21.590us 4.47% 21.590us 7.197us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 31.88% 154.046us 31.88% 154.046us 51.349us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.95% 4.601us 0.95% 4.601us 4.601us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 483.138us
+Self CUDA time total: 18.591us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 5.67% 104.074us 99.72% 1.829ms 1.829ms 0.000us 0.00% 162.623us 162.623us 1
+ CausalConv1dFn 4.47% 81.893us 94.05% 1.725ms 574.926us 0.000us 0.00% 162.623us 54.208us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 1.47% 26.950us 87.82% 1.611ms 536.865us 97.823us 100.00% 162.623us 54.208us 3
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 146.719us 149.98% 146.719us 146.719us 1
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 97.823us 100.00% 97.823us 32.608us 3
+ Activity Buffer Request 77.02% 1.413ms 77.02% 1.413ms 1.413ms 64.800us 66.24% 64.800us 64.800us 1
+ aten::empty_like 0.45% 8.219us 1.76% 32.292us 10.764us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 1.31% 24.073us 1.31% 24.073us 8.024us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 9.33% 171.076us 9.33% 171.076us 57.025us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 0.28% 5.071us 0.28% 5.071us 5.071us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.834ms
+Self CUDA time total: 97.823us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_causal_conv1d | cuda_B4_D2048_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_causal_conv1d 19.41% 95.622us 98.95% 487.536us 487.536us 0.000us 0.00% 165.309us 165.309us 1
+ CausalConv1dFn 15.06% 74.214us 79.54% 391.914us 130.638us 0.000us 0.00% 165.309us 55.103us 3
+ _causal_conv1d_90f5a60::causal_conv1d_fwd 5.22% 25.702us 58.53% 288.390us 96.130us 99.646us 100.00% 165.309us 55.103us 3
+ hf_kernels_causal_conv1d 0.00% 0.000us 0.00% 0.000us 0.000us 134.941us 135.42% 134.941us 134.941us 1
+void causal_conv1d_fwd_kernel<Causal_conv1d_fwd_kern... 0.00% 0.000us 0.00% 0.000us 0.000us 99.646us 100.00% 99.646us 33.215us 3
+ Activity Buffer Request 20.90% 102.993us 20.90% 102.993us 102.993us 65.663us 65.90% 65.663us 65.663us 1
+ aten::empty_like 1.51% 7.430us 5.95% 29.310us 9.770us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 4.44% 21.880us 4.44% 21.880us 7.293us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 32.41% 159.695us 32.41% 159.695us 53.232us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 1.05% 5.180us 1.05% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 492.716us
+Self CUDA time total: 99.646us
+
+
+impl wl p50(ms) ok
+hf_kernels_causal_conv1d cuda_B2_D2048_S128_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D2048_S128_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D2048_S512_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D2048_S512_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S128_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S128_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
+
+
+
+
+Installed 15 packages in 13ms
+
+
+
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
+Fetching 11 files: 18%|█▊ | 2/11 [00:00<00:00, 14.15it/s]
+Fetching 11 files: 64%|██████▎ | 7/11 [00:02<00:01, 3.20it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:02<00:00, 5.38it/s]
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/causal_conv1d/impls/index.html b/causal_conv1d/impls/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..53e1498a3461d88697955f51c8d45218113d21f0
--- /dev/null
+++ b/causal_conv1d/impls/index.html
@@ -0,0 +1,89 @@
+
+
+
+
+
+ Index of /causal_conv1d/impls
+
+
+
+
+ Index of /causal_conv1d/impls
+
+
+
\ No newline at end of file
diff --git a/causal_conv1d/impls/torch_causal_conv1d.html b/causal_conv1d/impls/torch_causal_conv1d.html
new file mode 100644
index 0000000000000000000000000000000000000000..ff8df9ac4df5c3feb7604ba7136a718d0e2413fd
--- /dev/null
+++ b/causal_conv1d/impls/torch_causal_conv1d.html
@@ -0,0 +1,4787 @@
+
+
+
+
+
+ torch_causal_conv1d
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
PyTorch Native - Causal Conv1D
+
GPU Info
+
+
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+
+
+
+
+
+
Wed Oct 29 00:36:08 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
+|-----------------------------------------+------------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+========================+======================|
+| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+| N/A 30C P0 87W / 350W | 0MiB / 46068MiB | 18% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=========================================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+
+
Causal Conv1D Benchmark (PyTorch Native)
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import torch.nn.functional as F
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_causal_conv1d(input_tensor, weight, bias):
+ # Convert to weight dtype for computation
+ x = input_tensor.to(weight.dtype)
+ dim = weight.shape[0]
+ width = weight.shape[1]
+ seqlen = input_tensor.shape[-1]
+
+ # Depthwise causal conv1d using PyTorch
+ out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
+
+ # Truncate to original sequence length
+ out = out[..., :seqlen]
+
+ # Convert back to original dtype
+ return out.to(input_tensor.dtype)
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
+ impl_name="torch_eager",
+ impl_tags={"family": "pytorch", "backend": "eager"},
+ impl_func=torch_causal_conv1d,
+)
+
+
+
+
+
+
+
Running causal_conv1d benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 460.509us 2386.43% 460.509us 460.509us 1
+ torch_eager 10.46% 229.787us 99.65% 2.189ms 2.189ms 0.000us 0.00% 21.633us 21.633us 1
+ aten::to 0.59% 12.913us 79.38% 1.743ms 290.578us 0.000us 0.00% 14.272us 2.379us 6
+ aten::_to_copy 1.99% 43.750us 78.79% 1.731ms 288.426us 0.000us 0.00% 14.272us 2.379us 6
+ aten::copy_ 2.89% 63.562us 74.16% 1.629ms 271.469us 11.936us 61.85% 14.272us 2.379us 6
+ aten::conv1d 0.44% 9.671us 7.66% 168.306us 56.102us 0.000us 0.00% 7.361us 2.454us 3
+ aten::convolution 0.72% 15.890us 7.22% 158.635us 52.878us 0.000us 0.00% 7.361us 2.454us 3
+ aten::_convolution 1.69% 37.102us 6.50% 142.745us 47.582us 0.000us 0.00% 7.361us 2.454us 3
+ aten::_conv_depthwise2d 1.60% 35.230us 3.77% 82.773us 27.591us 7.361us 38.15% 7.361us 2.454us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 38.15% 7.361us 2.454us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 32.50% 6.272us 2.091us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.35% 5.664us 1.888us 3
+ Activity Buffer Request 68.26% 1.499ms 68.26% 1.499ms 1.499ms 2.336us 12.11% 2.336us 2.336us 1
+ aten::empty_strided 2.64% 57.992us 2.64% 57.992us 9.665us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 4.12% 90.443us 4.12% 90.443us 10.049us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.47% 32.392us 1.88% 41.212us 4.579us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.64% 14.011us 0.64% 14.011us 0.934us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.55% 12.120us 0.55% 12.120us 4.040us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.50% 10.961us 0.50% 10.961us 3.654us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.43% 9.410us 0.51% 11.220us 3.740us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.196ms
+Self CUDA time total: 19.297us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 350.557us 1795.89% 350.557us 350.557us 1
+ torch_eager 6.82% 130.236us 99.71% 1.905ms 1.905ms 0.000us 0.00% 21.632us 21.632us 1
+ aten::to 0.35% 6.597us 84.97% 1.623ms 270.580us 0.000us 0.00% 13.728us 2.288us 6
+ aten::_to_copy 1.27% 24.323us 84.63% 1.617ms 269.481us 0.000us 0.00% 13.728us 2.288us 6
+ aten::copy_ 2.68% 51.130us 81.67% 1.560ms 260.072us 11.616us 59.51% 13.728us 2.288us 6
+ aten::conv1d 0.33% 6.400us 6.43% 122.914us 40.971us 0.000us 0.00% 7.904us 2.635us 3
+ aten::convolution 0.52% 9.901us 6.10% 116.514us 38.838us 0.000us 0.00% 7.904us 2.635us 3
+ aten::_convolution 1.28% 24.410us 5.58% 106.613us 35.538us 0.000us 0.00% 7.904us 2.635us 3
+ aten::_conv_depthwise2d 1.25% 23.932us 3.35% 63.983us 21.328us 7.904us 40.49% 7.904us 2.635us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 40.49% 7.904us 2.635us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.080us 31.15% 6.080us 2.027us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.536us 28.36% 5.536us 1.845us 3
+ Activity Buffer Request 76.19% 1.456ms 76.19% 1.456ms 1.456ms 2.112us 10.82% 2.112us 2.112us 1
+ aten::empty_strided 1.68% 32.131us 1.68% 32.131us 5.355us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.93% 75.003us 3.93% 75.003us 8.334us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.97% 18.540us 1.29% 24.620us 2.736us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.51% 9.711us 0.51% 9.711us 0.647us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.51% 9.650us 0.51% 9.650us 3.217us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.47% 9.000us 0.47% 9.000us 3.000us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.37% 7.100us 0.45% 8.560us 2.853us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.911ms
+Self CUDA time total: 19.520us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 379.390us 2047.55% 379.390us 379.390us 1
+ torch_eager 8.20% 159.835us 99.65% 1.942ms 1.942ms 0.000us 0.00% 20.449us 20.449us 1
+ aten::to 0.37% 7.179us 83.32% 1.624ms 270.686us 0.000us 0.00% 13.536us 2.256us 6
+ aten::_to_copy 1.40% 27.213us 82.96% 1.617ms 269.489us 0.000us 0.00% 13.536us 2.256us 6
+ aten::copy_ 2.62% 51.160us 79.92% 1.558ms 259.635us 11.616us 62.69% 13.536us 2.256us 6
+ aten::conv1d 0.34% 6.560us 6.49% 126.453us 42.151us 0.000us 0.00% 6.913us 2.304us 3
+ aten::convolution 0.57% 11.119us 6.15% 119.893us 39.964us 0.000us 0.00% 6.913us 2.304us 3
+ aten::_convolution 1.29% 25.191us 5.58% 108.774us 36.258us 0.000us 0.00% 6.913us 2.304us 3
+ aten::_conv_depthwise2d 1.16% 22.580us 3.36% 65.502us 21.834us 6.913us 37.31% 6.913us 2.304us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.913us 37.31% 6.913us 2.304us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 31.95% 5.920us 1.973us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 30.74% 5.696us 1.899us 3
+ Activity Buffer Request 74.82% 1.458ms 74.82% 1.458ms 1.458ms 1.920us 10.36% 1.920us 1.920us 1
+ aten::empty_strided 1.64% 31.911us 1.64% 31.911us 5.319us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.59% 70.043us 3.59% 70.043us 7.783us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.01% 19.612us 1.35% 26.392us 2.932us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.55% 10.750us 0.55% 10.750us 0.717us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.62% 12.182us 0.62% 12.182us 4.061us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.46% 8.910us 0.46% 8.910us 2.970us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.35% 6.890us 0.42% 8.260us 2.753us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.949ms
+Self CUDA time total: 18.529us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.058us 1736.41% 340.058us 340.058us 1
+ torch_eager 6.15% 129.375us 99.74% 2.097ms 2.097ms 0.000us 0.00% 21.760us 21.760us 1
+ aten::to 0.32% 6.700us 86.45% 1.818ms 303.002us 0.000us 0.00% 14.112us 2.352us 6
+ aten::_to_copy 1.17% 24.651us 86.13% 1.811ms 301.886us 0.000us 0.00% 14.112us 2.352us 6
+ aten::copy_ 2.42% 50.883us 83.54% 1.757ms 292.785us 11.936us 60.95% 14.112us 2.352us 6
+ aten::conv1d 0.30% 6.290us 5.74% 120.803us 40.268us 0.000us 0.00% 7.648us 2.549us 3
+ aten::convolution 0.48% 10.020us 5.45% 114.513us 38.171us 0.000us 0.00% 7.648us 2.549us 3
+ aten::_convolution 1.15% 24.209us 4.97% 104.493us 34.831us 0.000us 0.00% 7.648us 2.549us 3
+ aten::_conv_depthwise2d 1.00% 21.080us 2.93% 61.691us 20.564us 7.648us 39.05% 7.648us 2.549us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.648us 39.05% 7.648us 2.549us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 31.70% 6.208us 2.069us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.25% 5.728us 1.909us 3
+ Activity Buffer Request 71.15% 1.496ms 71.15% 1.496ms 1.496ms 2.176us 11.11% 2.176us 2.176us 1
+ aten::empty_strided 1.42% 29.951us 1.42% 29.951us 4.992us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.98% 230.807us 10.98% 230.807us 25.645us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.94% 19.863us 1.21% 25.543us 2.838us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.46% 9.630us 0.46% 9.630us 0.642us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.50% 10.541us 0.50% 10.541us 3.514us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.42% 8.810us 0.42% 8.810us 2.937us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.35% 7.411us 0.44% 9.201us 3.067us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.103ms
+Self CUDA time total: 19.584us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.070us 1381.53% 339.070us 339.070us 1
+ torch_eager 6.44% 132.135us 99.72% 2.045ms 2.045ms 0.000us 0.00% 26.814us 26.814us 1
+ aten::to 0.33% 6.722us 86.08% 1.765ms 294.155us 0.000us 0.00% 15.262us 2.544us 6
+ aten::_to_copy 1.20% 24.702us 85.75% 1.758ms 293.035us 0.000us 0.00% 15.262us 2.544us 6
+ aten::copy_ 2.39% 49.030us 83.04% 1.702ms 283.750us 12.991us 52.93% 15.262us 2.544us 6
+ aten::conv1d 0.29% 5.850us 5.78% 118.603us 39.534us 0.000us 0.00% 11.552us 3.851us 3
+ aten::convolution 0.55% 11.220us 5.50% 112.753us 37.584us 0.000us 0.00% 11.552us 3.851us 3
+ aten::_convolution 1.18% 24.170us 4.95% 101.533us 33.844us 0.000us 0.00% 11.552us 3.851us 3
+ aten::_conv_depthwise2d 1.08% 22.212us 2.99% 61.273us 20.424us 11.552us 47.07% 11.552us 3.851us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 47.07% 11.552us 3.851us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.655us 27.12% 6.655us 2.218us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 25.82% 6.336us 2.112us 3
+ Activity Buffer Request 71.25% 1.461ms 71.25% 1.461ms 1.461ms 2.271us 9.25% 2.271us 2.271us 1
+ aten::empty_strided 1.51% 31.010us 1.51% 31.010us 5.168us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.41% 213.527us 10.41% 213.527us 23.725us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.89% 18.350us 1.15% 23.660us 2.629us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 9.131us 0.45% 9.131us 0.609us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.46% 9.481us 0.46% 9.481us 3.160us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.43% 8.760us 0.43% 8.760us 2.920us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.27% 5.520us 0.33% 6.850us 2.283us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.050ms
+Self CUDA time total: 24.543us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.129us 1305.15% 339.129us 339.129us 1
+ torch_eager 6.29% 128.886us 99.74% 2.043ms 2.043ms 0.000us 0.00% 28.224us 28.224us 1
+ aten::to 0.34% 6.902us 86.10% 1.763ms 293.882us 0.000us 0.00% 15.168us 2.528us 6
+ aten::_to_copy 1.23% 25.190us 85.76% 1.756ms 292.731us 0.000us 0.00% 15.168us 2.528us 6
+ aten::copy_ 2.41% 49.270us 83.08% 1.701ms 283.571us 12.928us 49.75% 15.168us 2.528us 6
+ aten::conv1d 0.31% 6.370us 5.92% 121.333us 40.444us 0.000us 0.00% 13.056us 4.352us 3
+ aten::convolution 0.49% 10.120us 5.61% 114.963us 38.321us 0.000us 0.00% 13.056us 4.352us 3
+ aten::_convolution 1.25% 25.500us 5.12% 104.843us 34.948us 0.000us 0.00% 13.056us 4.352us 3
+ aten::_conv_depthwise2d 1.08% 22.212us 3.04% 62.243us 20.748us 13.056us 50.25% 13.056us 4.352us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.056us 50.25% 13.056us 4.352us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 25.37% 6.592us 2.197us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 24.38% 6.336us 2.112us 3
+ Activity Buffer Request 71.41% 1.463ms 71.41% 1.463ms 1.463ms 2.240us 8.62% 2.240us 2.240us 1
+ aten::empty_strided 1.45% 29.770us 1.45% 29.770us 4.962us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.25% 209.968us 10.25% 209.968us 23.330us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.92% 18.870us 1.21% 24.780us 2.753us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.47% 9.601us 0.47% 9.601us 0.640us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.51% 10.510us 0.51% 10.510us 3.503us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.45% 9.181us 0.45% 9.181us 3.060us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.32% 6.640us 0.40% 8.140us 2.713us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.048ms
+Self CUDA time total: 25.984us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 362.270us 942.63% 362.270us 362.270us 1
+ torch_eager 7.50% 163.876us 99.75% 2.180ms 2.180ms 0.000us 0.00% 40.993us 40.993us 1
+ aten::conv1d 0.34% 7.388us 5.94% 129.794us 43.265us 0.000us 0.00% 22.464us 7.488us 3
+ aten::convolution 0.56% 12.301us 5.60% 122.406us 40.802us 0.000us 0.00% 22.464us 7.488us 3
+ aten::_convolution 1.18% 25.829us 5.04% 110.105us 36.702us 0.000us 0.00% 22.464us 7.488us 3
+ aten::_conv_depthwise2d 1.07% 23.371us 2.94% 64.311us 21.437us 22.464us 58.45% 22.464us 7.488us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.464us 58.45% 22.464us 7.488us 3
+ aten::to 0.36% 7.830us 84.95% 1.856ms 309.406us 0.000us 0.00% 18.529us 3.088us 6
+ aten::_to_copy 1.44% 31.560us 84.59% 1.849ms 308.101us 0.000us 0.00% 18.529us 3.088us 6
+ aten::copy_ 2.41% 52.633us 81.64% 1.784ms 297.326us 15.968us 41.55% 18.529us 3.088us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.609us 22.40% 8.609us 2.870us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.359us 19.15% 7.359us 2.453us 3
+ Activity Buffer Request 65.39% 1.429ms 65.39% 1.429ms 1.429ms 2.561us 6.66% 2.561us 2.561us 1
+ aten::empty_strided 1.51% 33.091us 1.51% 33.091us 5.515us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 14.87% 325.052us 14.87% 325.052us 36.117us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.00% 21.833us 1.21% 26.523us 2.947us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.39% 8.492us 0.39% 8.492us 0.566us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.44% 9.570us 0.44% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.40% 8.750us 0.40% 8.750us 2.917us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.37% 7.980us 0.45% 9.772us 3.257us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.185ms
+Self CUDA time total: 38.432us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.836us 827.74% 339.836us 339.836us 1
+ torch_eager 6.54% 141.434us 99.74% 2.158ms 2.158ms 0.000us 0.00% 43.648us 43.648us 1
+ aten::conv1d 0.28% 6.090us 5.53% 119.574us 39.858us 0.000us 0.00% 25.407us 8.469us 3
+ aten::convolution 0.46% 9.939us 5.25% 113.484us 37.828us 0.000us 0.00% 25.407us 8.469us 3
+ aten::_convolution 1.12% 24.214us 4.79% 103.545us 34.515us 0.000us 0.00% 25.407us 8.469us 3
+ aten::_conv_depthwise2d 1.05% 22.612us 2.94% 63.593us 21.198us 25.407us 61.88% 25.407us 8.469us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.407us 61.88% 25.407us 8.469us 3
+ aten::to 0.29% 6.201us 86.38% 1.869ms 311.424us 0.000us 0.00% 18.241us 3.040us 6
+ aten::_to_copy 1.18% 25.424us 86.09% 1.862ms 310.391us 0.000us 0.00% 18.241us 3.040us 6
+ aten::copy_ 2.40% 51.862us 83.52% 1.807ms 301.107us 15.649us 38.12% 18.241us 3.040us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.320us 20.27% 8.320us 2.773us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.329us 17.85% 7.329us 2.443us 3
+ Activity Buffer Request 68.07% 1.472ms 68.07% 1.472ms 1.472ms 2.592us 6.31% 2.592us 2.592us 1
+ aten::empty_strided 1.40% 30.280us 1.40% 30.280us 5.047us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 14.06% 304.169us 14.06% 304.169us 33.797us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.84% 18.230us 1.08% 23.418us 2.602us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.40% 8.619us 0.40% 8.619us 0.575us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.48% 10.370us 0.48% 10.370us 3.457us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.41% 8.770us 0.41% 8.770us 2.923us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.26% 5.659us 0.32% 6.990us 2.330us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.163ms
+Self CUDA time total: 41.056us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.560us 329.80% 338.560us 338.560us 1
+ torch_eager 6.25% 131.427us 99.74% 2.098ms 2.098ms 0.000us 0.00% 108.608us 108.608us 1
+ aten::conv1d 0.29% 6.110us 5.71% 120.083us 40.028us 0.000us 0.00% 70.496us 23.499us 3
+ aten::convolution 0.47% 9.940us 5.42% 113.973us 37.991us 0.000us 0.00% 70.496us 23.499us 3
+ aten::_convolution 1.11% 23.441us 4.94% 104.033us 34.678us 0.000us 0.00% 70.496us 23.499us 3
+ aten::_conv_depthwise2d 1.04% 21.830us 2.93% 61.652us 20.551us 70.496us 68.67% 70.496us 23.499us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.496us 68.67% 70.496us 23.499us 3
+ aten::to 0.30% 6.292us 86.43% 1.818ms 303.059us 0.000us 0.00% 38.112us 6.352us 6
+ aten::_to_copy 1.17% 24.539us 86.13% 1.812ms 302.010us 0.000us 0.00% 38.112us 6.352us 6
+ aten::copy_ 2.47% 51.869us 83.58% 1.758ms 293.072us 32.160us 31.33% 38.112us 6.352us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.568us 17.11% 17.568us 5.856us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.592us 14.21% 14.592us 4.864us 3
+ Activity Buffer Request 67.63% 1.423ms 67.63% 1.423ms 1.423ms 5.952us 5.80% 5.952us 5.952us 1
+ aten::empty_strided 1.38% 29.091us 1.38% 29.091us 4.849us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 14.47% 304.542us 14.47% 304.542us 33.838us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.91% 19.049us 1.17% 24.579us 2.731us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.43% 9.070us 0.43% 9.070us 0.605us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.49% 10.351us 0.49% 10.351us 3.450us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.41% 8.621us 0.41% 8.621us 2.874us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.38% 8.050us 0.45% 9.470us 3.157us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.104ms
+Self CUDA time total: 102.656us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.578us 301.93% 340.578us 340.578us 1
+ torch_eager 6.29% 133.214us 99.74% 2.113ms 2.113ms 0.000us 0.00% 118.752us 118.752us 1
+ aten::conv1d 0.31% 6.499us 5.66% 119.974us 39.991us 0.000us 0.00% 80.576us 26.859us 3
+ aten::convolution 0.47% 9.880us 5.36% 113.475us 37.825us 0.000us 0.00% 80.576us 26.859us 3
+ aten::_convolution 1.21% 25.730us 4.89% 103.595us 34.532us 0.000us 0.00% 80.576us 26.859us 3
+ aten::_conv_depthwise2d 1.01% 21.361us 2.87% 60.832us 20.277us 80.576us 71.43% 80.576us 26.859us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.576us 71.43% 80.576us 26.859us 3
+ aten::to 0.33% 7.060us 86.42% 1.831ms 305.149us 0.000us 0.00% 38.176us 6.363us 6
+ aten::_to_copy 1.15% 24.352us 86.09% 1.824ms 303.972us 0.000us 0.00% 38.176us 6.363us 6
+ aten::copy_ 2.34% 49.642us 83.57% 1.770ms 295.075us 32.224us 28.57% 38.176us 6.363us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.664us 15.66% 17.664us 5.888us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.560us 12.91% 14.560us 4.853us 3
+ Activity Buffer Request 68.62% 1.454ms 68.62% 1.454ms 1.454ms 5.952us 5.28% 5.952us 5.952us 1
+ aten::empty_strided 1.37% 29.031us 1.37% 29.031us 4.838us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 13.59% 287.970us 13.59% 287.970us 31.997us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.89% 18.772us 1.17% 24.871us 2.763us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 9.520us 0.45% 9.520us 0.635us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.46% 9.850us 0.46% 9.850us 3.283us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.41% 8.670us 0.41% 8.670us 2.890us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.32% 6.821us 0.38% 8.112us 2.704us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.119ms
+Self CUDA time total: 112.800us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 6.32% 133.665us 99.60% 2.106ms 2.106ms 0.000us 0.00% 433.181us 433.181us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 423.869us 107.93% 423.869us 423.869us 1
+ aten::conv1d 0.30% 6.441us 5.98% 126.475us 42.158us 0.000us 0.00% 252.190us 84.063us 3
+ aten::convolution 0.49% 10.391us 5.68% 120.034us 40.011us 0.000us 0.00% 252.190us 84.063us 3
+ aten::_convolution 1.19% 25.110us 5.19% 109.643us 36.548us 0.000us 0.00% 252.190us 84.063us 3
+ aten::_conv_depthwise2d 1.07% 22.550us 3.14% 66.363us 22.121us 252.190us 64.21% 252.190us 84.063us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 252.190us 64.21% 252.190us 84.063us 3
+ aten::to 0.33% 6.989us 85.86% 1.815ms 302.520us 0.000us 0.00% 180.991us 30.165us 6
+ aten::_to_copy 1.18% 24.921us 85.53% 1.808ms 301.355us 0.000us 0.00% 180.991us 30.165us 6
+ aten::copy_ 2.39% 50.532us 82.93% 1.753ms 292.204us 140.543us 35.79% 180.991us 30.165us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 100.768us 25.66% 100.768us 33.589us 3
+ Activity Buffer Request 67.47% 1.426ms 67.47% 1.426ms 1.426ms 40.448us 10.30% 40.448us 40.448us 1
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.775us 10.13% 39.775us 13.258us 3
+ aten::empty_strided 1.42% 29.990us 1.42% 29.990us 4.998us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 14.15% 299.142us 14.15% 299.142us 33.238us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.92% 19.400us 1.21% 25.500us 2.833us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.49% 10.430us 0.49% 10.430us 0.695us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.55% 11.580us 0.55% 11.580us 3.860us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.44% 9.361us 0.44% 9.361us 3.120us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.34% 7.110us 0.42% 8.900us 2.967us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.114ms
+Self CUDA time total: 392.733us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 6.65% 143.166us 97.03% 2.090ms 2.090ms 0.000us 0.00% 486.301us 486.301us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 477.853us 106.88% 477.853us 477.853us 1
+ aten::conv1d 0.33% 7.110us 5.88% 126.575us 42.192us 0.000us 0.00% 298.557us 99.519us 3
+ aten::convolution 0.51% 11.062us 5.55% 119.465us 39.822us 0.000us 0.00% 298.557us 99.519us 3
+ aten::_convolution 1.16% 25.071us 5.03% 108.403us 36.134us 0.000us 0.00% 298.557us 99.519us 3
+ aten::_conv_depthwise2d 1.05% 22.671us 3.05% 65.592us 21.864us 298.557us 66.78% 298.557us 99.519us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 298.557us 66.78% 298.557us 99.519us 3
+ aten::to 0.33% 7.030us 83.12% 1.790ms 298.407us 0.000us 0.00% 187.744us 31.291us 6
+ aten::_to_copy 1.22% 26.183us 82.80% 1.783ms 297.235us 0.000us 0.00% 187.744us 31.291us 6
+ aten::copy_ 2.41% 51.979us 80.11% 1.726ms 287.603us 148.544us 33.22% 187.744us 31.291us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 108.768us 24.33% 108.768us 36.256us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.776us 8.90% 39.776us 13.259us 3
+ Activity Buffer Request 66.10% 1.424ms 66.10% 1.424ms 1.424ms 39.200us 8.77% 39.200us 39.200us 1
+ aten::empty_strided 1.47% 31.611us 1.47% 31.611us 5.268us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 12.61% 271.569us 12.61% 271.569us 30.174us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.93% 19.971us 1.21% 26.011us 2.890us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 9.711us 0.45% 9.711us 0.647us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.47% 10.061us 0.47% 10.061us 3.354us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.51% 11.040us 0.51% 11.040us 3.680us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.28% 5.950us 0.34% 7.400us 2.467us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.154ms
+Self CUDA time total: 447.101us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 355.165us 1897.25% 355.165us 355.165us 1
+ torch_eager 15.24% 136.376us 99.32% 888.600us 888.600us 0.000us 0.00% 20.608us 20.608us 1
+ aten::to 0.80% 7.121us 66.93% 598.831us 99.805us 0.000us 0.00% 13.376us 2.229us 6
+ aten::_to_copy 2.95% 26.380us 66.13% 591.710us 98.618us 0.000us 0.00% 13.376us 2.229us 6
+ aten::copy_ 5.90% 52.793us 59.34% 530.948us 88.491us 11.488us 61.37% 13.376us 2.229us 6
+ aten::conv1d 0.68% 6.050us 13.88% 124.163us 41.388us 0.000us 0.00% 7.232us 2.411us 3
+ aten::convolution 1.23% 10.987us 13.20% 118.113us 39.371us 0.000us 0.00% 7.232us 2.411us 3
+ aten::_convolution 2.78% 24.854us 11.97% 107.126us 35.709us 0.000us 0.00% 7.232us 2.411us 3
+ aten::_conv_depthwise2d 2.73% 24.470us 7.32% 65.481us 21.827us 7.232us 38.63% 7.232us 2.411us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.232us 38.63% 7.232us 2.411us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 31.62% 5.920us 1.973us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 29.74% 5.568us 1.856us 3
+ Activity Buffer Request 26.68% 238.708us 26.68% 238.708us 238.708us 1.888us 10.09% 1.888us 1.888us 1
+ aten::empty_strided 3.84% 34.382us 3.84% 34.382us 5.730us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 29.10% 260.398us 29.10% 260.398us 28.933us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.02% 18.071us 2.57% 22.961us 2.551us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.97% 8.709us 0.97% 8.709us 0.581us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.22% 10.910us 1.22% 10.910us 3.637us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.02% 9.150us 1.02% 9.150us 3.050us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.75% 6.751us 0.92% 8.220us 2.740us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 894.710us
+Self CUDA time total: 18.720us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.578us 1674.05% 323.578us 323.578us 1
+ torch_eager 14.45% 120.436us 99.39% 828.559us 828.559us 0.000us 0.00% 21.217us 21.217us 1
+ aten::to 0.75% 6.271us 67.77% 564.939us 94.156us 0.000us 0.00% 13.377us 2.230us 6
+ aten::_to_copy 2.76% 22.992us 67.02% 558.668us 93.111us 0.000us 0.00% 13.377us 2.230us 6
+ aten::copy_ 5.96% 49.722us 60.74% 506.327us 84.388us 11.489us 59.44% 13.377us 2.230us 6
+ aten::conv1d 0.75% 6.211us 13.83% 115.254us 38.418us 0.000us 0.00% 7.840us 2.613us 3
+ aten::convolution 1.19% 9.930us 13.08% 109.043us 36.348us 0.000us 0.00% 7.840us 2.613us 3
+ aten::_convolution 2.77% 23.131us 11.89% 99.113us 33.038us 0.000us 0.00% 7.840us 2.613us 3
+ aten::_conv_depthwise2d 2.53% 21.092us 7.21% 60.132us 20.044us 7.840us 40.56% 7.840us 2.613us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.840us 40.56% 7.840us 2.613us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.857us 30.30% 5.857us 1.952us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.632us 29.14% 5.632us 1.877us 3
+ Activity Buffer Request 27.26% 227.207us 27.26% 227.207us 227.207us 1.888us 9.77% 1.888us 1.888us 1
+ aten::empty_strided 3.52% 29.349us 3.52% 29.349us 4.891us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 29.92% 249.418us 29.92% 249.418us 27.713us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.13% 17.749us 2.80% 23.370us 2.597us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.11% 9.261us 1.11% 9.261us 0.617us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.16% 9.660us 1.16% 9.660us 3.220us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.12% 9.360us 1.12% 9.360us 3.120us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.70% 5.810us 0.88% 7.370us 2.457us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 833.619us
+Self CUDA time total: 19.329us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 326.394us 1677.60% 326.394us 326.394us 1
+ torch_eager 14.78% 122.914us 99.34% 825.919us 825.919us 0.000us 0.00% 21.632us 21.632us 1
+ aten::to 0.79% 6.552us 67.16% 558.381us 93.064us 0.000us 0.00% 14.368us 2.395us 6
+ aten::_to_copy 2.94% 24.430us 66.37% 551.829us 91.971us 0.000us 0.00% 14.368us 2.395us 6
+ aten::copy_ 5.83% 48.462us 59.95% 498.427us 83.071us 12.192us 62.66% 14.368us 2.395us 6
+ aten::conv1d 0.71% 5.939us 14.00% 116.404us 38.801us 0.000us 0.00% 7.264us 2.421us 3
+ aten::convolution 1.18% 9.811us 13.29% 110.465us 36.822us 0.000us 0.00% 7.264us 2.421us 3
+ aten::_convolution 2.85% 23.732us 12.11% 100.654us 33.551us 0.000us 0.00% 7.264us 2.421us 3
+ aten::_conv_depthwise2d 2.52% 20.910us 7.24% 60.232us 20.077us 7.264us 37.34% 7.264us 2.421us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.264us 37.34% 7.264us 2.421us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.40% 6.304us 2.101us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 30.26% 5.888us 1.963us 3
+ Activity Buffer Request 26.68% 221.788us 26.68% 221.788us 221.788us 2.176us 11.18% 2.176us 2.176us 1
+ aten::empty_strided 3.48% 28.972us 3.48% 28.972us 4.829us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 30.05% 249.819us 30.05% 249.819us 27.758us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.04% 16.929us 2.67% 22.200us 2.467us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.07% 8.901us 1.07% 8.901us 0.593us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.15% 9.570us 1.15% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.98% 8.110us 0.98% 8.110us 2.703us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.86% 7.190us 1.02% 8.500us 2.833us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 831.399us
+Self CUDA time total: 19.456us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 356.696us 1774.96% 356.696us 356.696us 1
+ torch_eager 13.86% 123.804us 99.36% 887.440us 887.440us 0.000us 0.00% 22.272us 22.272us 1
+ aten::to 0.71% 6.320us 66.62% 595.061us 99.177us 0.000us 0.00% 14.368us 2.395us 6
+ aten::_to_copy 2.82% 25.151us 65.92% 588.741us 98.124us 0.000us 0.00% 14.368us 2.395us 6
+ aten::copy_ 5.73% 51.172us 59.67% 532.958us 88.826us 12.192us 60.67% 14.368us 2.395us 6
+ aten::conv1d 0.70% 6.210us 15.70% 140.195us 46.732us 0.000us 0.00% 7.904us 2.635us 3
+ aten::convolution 1.11% 9.881us 15.00% 133.985us 44.662us 0.000us 0.00% 7.904us 2.635us 3
+ aten::_convolution 2.74% 24.510us 13.89% 124.104us 41.368us 0.000us 0.00% 7.904us 2.635us 3
+ aten::_conv_depthwise2d 2.70% 24.090us 9.26% 82.742us 27.581us 7.904us 39.33% 7.904us 2.635us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 39.33% 7.904us 2.635us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.240us 31.05% 6.240us 2.080us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 29.62% 5.952us 1.984us 3
+ Activity Buffer Request 28.94% 258.459us 28.94% 258.459us 258.459us 2.176us 10.83% 2.176us 2.176us 1
+ aten::empty_strided 3.43% 30.632us 3.43% 30.632us 5.105us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 29.46% 263.129us 29.46% 263.129us 29.237us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.97% 17.620us 2.61% 23.310us 2.590us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.07% 9.580us 1.07% 9.580us 0.639us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.09% 9.720us 1.09% 9.720us 3.240us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.02% 9.130us 1.02% 9.130us 3.043us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.75% 6.702us 0.94% 8.422us 2.807us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 893.171us
+Self CUDA time total: 20.096us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.730us 926.72% 332.730us 332.730us 1
+ torch_eager 14.27% 126.064us 99.42% 878.341us 878.341us 0.000us 0.00% 38.496us 38.496us 1
+ aten::conv1d 0.64% 5.671us 13.39% 118.255us 39.418us 0.000us 0.00% 20.096us 6.699us 3
+ aten::convolution 1.11% 9.840us 12.74% 112.584us 37.528us 0.000us 0.00% 20.096us 6.699us 3
+ aten::_convolution 2.79% 24.681us 11.63% 102.744us 34.248us 0.000us 0.00% 20.096us 6.699us 3
+ aten::_conv_depthwise2d 2.42% 21.390us 7.02% 62.061us 20.687us 20.096us 55.97% 20.096us 6.699us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.096us 55.97% 20.096us 6.699us 3
+ aten::to 0.72% 6.320us 68.61% 606.182us 101.030us 0.000us 0.00% 18.400us 3.067us 6
+ aten::_to_copy 2.82% 24.900us 67.90% 599.862us 99.977us 0.000us 0.00% 18.400us 3.067us 6
+ aten::copy_ 5.62% 49.645us 61.77% 545.702us 90.950us 15.808us 44.03% 18.400us 3.067us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 23.53% 8.448us 2.816us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 20.50% 7.360us 2.453us 3
+ Activity Buffer Request 29.42% 259.919us 29.42% 259.919us 259.919us 2.592us 7.22% 2.592us 2.592us 1
+ aten::empty_strided 3.31% 29.260us 3.31% 29.260us 4.877us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 29.15% 257.559us 29.15% 257.559us 28.618us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.02% 17.842us 2.68% 23.662us 2.629us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.05% 9.271us 1.05% 9.271us 0.618us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.19% 10.540us 1.19% 10.540us 3.513us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.99% 8.710us 0.99% 8.710us 2.903us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.65% 5.719us 0.80% 7.050us 2.350us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 883.481us
+Self CUDA time total: 35.904us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 337.888us 888.80% 337.888us 337.888us 1
+ torch_eager 6.31% 128.615us 99.74% 2.033ms 2.033ms 0.000us 0.00% 40.576us 40.576us 1
+ aten::conv1d 0.31% 6.349us 5.98% 121.885us 40.628us 0.000us 0.00% 22.304us 7.435us 3
+ aten::convolution 0.53% 10.852us 5.67% 115.536us 38.512us 0.000us 0.00% 22.304us 7.435us 3
+ aten::_convolution 1.24% 25.291us 5.14% 104.684us 34.895us 0.000us 0.00% 22.304us 7.435us 3
+ aten::_conv_depthwise2d 1.08% 22.031us 3.01% 61.431us 20.477us 22.304us 58.67% 22.304us 7.435us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.304us 58.67% 22.304us 7.435us 3
+ aten::to 0.34% 6.829us 86.09% 1.755ms 292.477us 0.000us 0.00% 18.272us 3.045us 6
+ aten::_to_copy 1.20% 24.424us 85.75% 1.748ms 291.339us 0.000us 0.00% 18.272us 3.045us 6
+ aten::copy_ 2.48% 50.501us 83.10% 1.694ms 282.331us 15.712us 41.33% 18.272us 3.045us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.320us 21.89% 8.320us 2.773us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 19.44% 7.392us 2.464us 3
+ Activity Buffer Request 69.75% 1.422ms 69.75% 1.422ms 1.422ms 2.560us 6.73% 2.560us 2.560us 1
+ aten::empty_strided 1.45% 29.621us 1.45% 29.621us 4.937us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 11.90% 242.506us 11.90% 242.506us 26.945us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.92% 18.701us 1.17% 23.851us 2.650us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.43% 8.710us 0.43% 8.710us 0.581us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.48% 9.800us 0.48% 9.800us 3.267us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.43% 8.710us 0.43% 8.710us 2.903us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.35% 7.191us 0.42% 8.621us 2.874us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.038ms
+Self CUDA time total: 38.016us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 362.972us 567.16% 362.972us 362.972us 1
+ torch_eager 14.84% 128.544us 99.34% 860.680us 860.680us 0.000us 0.00% 68.061us 68.061us 1
+ aten::conv1d 0.70% 6.079us 16.52% 143.165us 47.722us 0.000us 0.00% 41.728us 13.909us 3
+ aten::convolution 3.42% 29.613us 15.82% 137.086us 45.695us 0.000us 0.00% 41.728us 13.909us 3
+ aten::_convolution 2.86% 24.759us 12.40% 107.473us 35.824us 0.000us 0.00% 41.728us 13.909us 3
+ aten::_conv_depthwise2d 2.59% 22.439us 7.67% 66.492us 22.164us 41.728us 65.20% 41.728us 13.909us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.728us 65.20% 41.728us 13.909us 3
+ aten::to 0.77% 6.631us 64.71% 560.621us 93.437us 0.000us 0.00% 26.333us 4.389us 6
+ aten::_to_copy 2.80% 24.253us 63.94% 553.990us 92.332us 0.000us 0.00% 26.333us 4.389us 6
+ aten::copy_ 5.80% 50.240us 57.50% 498.196us 83.033us 22.270us 34.80% 26.333us 4.389us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.903us 18.60% 11.903us 3.968us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.367us 16.20% 10.367us 3.456us 3
+ Activity Buffer Request 26.05% 225.728us 26.05% 225.728us 225.728us 4.063us 6.35% 4.063us 4.063us 1
+ aten::empty_strided 3.64% 31.541us 3.64% 31.541us 5.257us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 28.31% 245.279us 28.31% 245.279us 27.253us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.11% 18.263us 2.74% 23.752us 2.639us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.06% 9.199us 1.06% 9.199us 0.613us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.26% 10.941us 1.26% 10.941us 3.647us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.16% 10.061us 1.16% 10.061us 3.354us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.66% 5.740us 0.85% 7.330us 2.443us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 866.380us
+Self CUDA time total: 63.998us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 357.311us 512.91% 357.311us 357.311us 1
+ torch_eager 20.96% 191.619us 99.38% 908.662us 908.662us 0.000us 0.00% 73.696us 73.696us 1
+ aten::conv1d 0.63% 5.760us 15.23% 139.294us 46.431us 0.000us 0.00% 47.296us 15.765us 3
+ aten::convolution 2.87% 26.271us 14.60% 133.534us 44.511us 0.000us 0.00% 47.296us 15.765us 3
+ aten::_convolution 2.77% 25.360us 11.73% 107.263us 35.754us 0.000us 0.00% 47.296us 15.765us 3
+ aten::_conv_depthwise2d 2.38% 21.722us 7.17% 65.523us 21.841us 47.296us 67.89% 47.296us 15.765us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.296us 67.89% 47.296us 15.765us 3
+ aten::to 0.73% 6.650us 60.08% 549.318us 91.553us 0.000us 0.00% 26.400us 4.400us 6
+ aten::_to_copy 2.63% 24.032us 59.35% 542.668us 90.445us 0.000us 0.00% 26.400us 4.400us 6
+ aten::copy_ 5.57% 50.922us 53.46% 488.786us 81.464us 22.368us 32.11% 26.400us 4.400us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.872us 17.04% 11.872us 3.957us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.496us 15.07% 10.496us 3.499us 3
+ Activity Buffer Request 23.91% 218.617us 23.91% 218.617us 218.617us 4.032us 5.79% 4.032us 4.032us 1
+ aten::empty_strided 3.26% 29.850us 3.26% 29.850us 4.975us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 26.57% 242.937us 26.57% 242.937us 26.993us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.04% 18.652us 2.65% 24.251us 2.695us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.01% 9.230us 1.01% 9.230us 0.615us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.08% 9.870us 1.08% 9.870us 3.290us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.12% 10.241us 1.12% 10.241us 3.414us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.63% 5.780us 0.80% 7.270us 2.423us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 914.323us
+Self CUDA time total: 69.664us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 348.092us 187.26% 348.092us 348.092us 1
+ torch_eager 14.76% 124.374us 99.29% 836.558us 836.558us 0.000us 0.00% 195.870us 195.870us 1
+ aten::conv1d 0.70% 5.900us 14.42% 121.504us 40.501us 0.000us 0.00% 133.406us 44.469us 3
+ aten::convolution 1.14% 9.610us 13.72% 115.604us 38.535us 0.000us 0.00% 133.406us 44.469us 3
+ aten::_convolution 2.88% 24.263us 12.58% 105.994us 35.331us 0.000us 0.00% 133.406us 44.469us 3
+ aten::_conv_depthwise2d 2.73% 23.010us 7.80% 65.750us 21.917us 133.406us 71.77% 133.406us 44.469us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.406us 71.77% 133.406us 44.469us 3
+ aten::to 0.74% 6.220us 66.83% 563.060us 93.843us 0.000us 0.00% 62.464us 10.411us 6
+ aten::_to_copy 2.83% 23.861us 66.09% 556.840us 92.807us 0.000us 0.00% 62.464us 10.411us 6
+ aten::copy_ 6.03% 50.810us 59.73% 503.287us 83.881us 52.480us 28.23% 62.464us 10.411us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.600us 15.92% 29.600us 9.867us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.880us 12.31% 22.880us 7.627us 3
+ Activity Buffer Request 25.69% 216.468us 25.69% 216.468us 216.468us 9.984us 5.37% 9.984us 9.984us 1
+ aten::empty_strided 3.52% 29.692us 3.52% 29.692us 4.949us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 30.59% 257.739us 30.59% 257.739us 28.638us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.08% 17.540us 2.73% 23.000us 2.556us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.12% 9.412us 1.12% 9.412us 0.627us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.20% 10.110us 1.20% 10.110us 3.370us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.29% 10.900us 1.29% 10.900us 3.633us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.68% 5.719us 0.88% 7.451us 2.484us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 842.539us
+Self CUDA time total: 185.886us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 348.403us 166.18% 348.403us 348.403us 1
+ torch_eager 14.60% 122.924us 99.33% 836.209us 836.209us 0.000us 0.00% 223.383us 223.383us 1
+ aten::conv1d 0.69% 5.779us 14.01% 117.955us 39.318us 0.000us 0.00% 153.883us 51.294us 3
+ aten::convolution 1.25% 10.491us 13.32% 112.176us 37.392us 0.000us 0.00% 153.883us 51.294us 3
+ aten::_convolution 2.91% 24.484us 12.08% 101.685us 33.895us 0.000us 0.00% 153.883us 51.294us 3
+ aten::_conv_depthwise2d 2.49% 20.928us 7.14% 60.070us 20.023us 153.883us 73.40% 153.883us 51.294us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 153.883us 73.40% 153.883us 51.294us 3
+ aten::to 0.73% 6.179us 67.37% 567.200us 94.533us 0.000us 0.00% 69.500us 11.583us 6
+ aten::_to_copy 2.75% 23.132us 66.64% 561.021us 93.504us 0.000us 0.00% 69.500us 11.583us 6
+ aten::copy_ 5.91% 49.740us 60.39% 508.377us 84.729us 55.773us 26.60% 69.500us 11.583us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.927us 15.71% 32.927us 10.976us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.846us 10.90% 22.846us 7.615us 3
+ Activity Buffer Request 29.09% 244.869us 29.09% 244.869us 244.869us 13.727us 6.55% 13.727us 13.727us 1
+ aten::empty_strided 3.51% 29.512us 3.51% 29.512us 4.919us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 27.84% 234.420us 27.84% 234.420us 26.047us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.13% 17.973us 2.77% 23.320us 2.591us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.09% 9.167us 1.09% 9.167us 0.611us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.12% 9.440us 1.12% 9.440us 3.147us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.07% 9.050us 1.07% 9.050us 3.017us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.85% 7.121us 1.02% 8.601us 2.867us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 841.880us
+Self CUDA time total: 209.656us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 7.22% 135.785us 57.39% 1.079ms 1.079ms 0.000us 0.00% 1.518ms 1.518ms 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.419ms 100.41% 1.419ms 1.419ms 1
+ aten::to 0.37% 6.901us 40.86% 768.526us 128.088us 0.000us 0.00% 823.221us 137.204us 6
+ aten::_to_copy 1.63% 30.742us 40.49% 761.625us 126.938us 0.000us 0.00% 823.221us 137.204us 6
+ aten::copy_ 2.94% 55.302us 27.81% 523.157us 87.193us 717.942us 50.81% 823.221us 137.204us 6
+ aten::conv1d 0.33% 6.280us 6.71% 126.144us 42.048us 0.000us 0.00% 695.094us 231.698us 3
+ aten::convolution 0.57% 10.750us 6.37% 119.864us 39.955us 0.000us 0.00% 695.094us 231.698us 3
+ aten::_convolution 1.35% 25.400us 5.80% 109.114us 36.371us 0.000us 0.00% 695.094us 231.698us 3
+ aten::_conv_depthwise2d 1.19% 22.332us 3.55% 66.763us 22.254us 695.094us 49.19% 695.094us 231.698us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 695.094us 49.19% 695.094us 231.698us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 411.706us 29.14% 411.706us 137.235us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 306.236us 21.67% 306.236us 102.079us 3
+ Activity Buffer Request 12.99% 244.238us 12.99% 244.238us 244.238us 105.279us 7.45% 105.279us 105.279us 1
+ aten::empty_strided 2.17% 40.811us 11.04% 207.726us 34.621us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 13.13% 246.997us 13.13% 246.997us 27.444us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.97% 37.133us 2.36% 44.413us 4.935us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.58% 10.889us 0.58% 10.889us 0.726us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.53% 10.051us 0.53% 10.051us 3.350us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.58% 11.000us 0.58% 11.000us 3.667us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.34% 6.350us 0.41% 7.700us 2.567us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.881ms
+Self CUDA time total: 1.413ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 4.25% 132.984us 66.63% 2.083ms 2.083ms 0.000us 0.00% 1.503ms 1.503ms 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.434ms 100.41% 1.434ms 1.434ms 1
+ aten::to 0.21% 6.470us 57.53% 1.798ms 299.656us 0.000us 0.00% 765.147us 127.524us 6
+ aten::_to_copy 0.80% 25.009us 57.32% 1.791ms 298.577us 0.000us 0.00% 765.147us 127.524us 6
+ aten::copy_ 1.51% 47.155us 55.55% 1.736ms 289.360us 690.492us 48.35% 765.147us 127.524us 6
+ aten::conv1d 0.20% 6.231us 3.91% 122.325us 40.775us 0.000us 0.00% 737.724us 245.908us 3
+ aten::convolution 0.32% 9.920us 3.71% 116.094us 38.698us 0.000us 0.00% 737.724us 245.908us 3
+ aten::_convolution 0.82% 25.623us 3.40% 106.174us 35.391us 0.000us 0.00% 737.724us 245.908us 3
+ aten::_conv_depthwise2d 0.70% 21.899us 1.98% 62.011us 20.670us 737.724us 51.65% 737.724us 245.908us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 737.724us 51.65% 737.724us 245.908us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 398.046us 27.87% 398.046us 132.682us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 292.446us 20.48% 292.446us 97.482us 3
+ Activity Buffer Request 47.19% 1.475ms 47.19% 1.475ms 1.475ms 74.655us 5.23% 74.655us 74.655us 1
+ aten::empty_strided 0.97% 30.293us 0.97% 30.293us 5.049us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 7.52% 235.026us 7.52% 235.026us 26.114us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.60% 18.740us 0.79% 24.820us 2.758us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.32% 10.019us 0.32% 10.019us 0.668us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.32% 9.882us 0.32% 9.882us 3.294us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.29% 9.220us 0.29% 9.220us 3.073us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.24% 7.471us 0.29% 9.160us 3.053us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 3.125ms
+Self CUDA time total: 1.428ms
+
+
+impl wl p50(ms) ok
+torch_eager cuda_B2_D2048_S128_W2 0.08 True
+torch_eager cuda_B2_D2048_S128_W4 0.09 True
+torch_eager cuda_B2_D2048_S2048_W2 0.15 True
+torch_eager cuda_B2_D2048_S2048_W4 0.16 True
+torch_eager cuda_B2_D2048_S512_W2 0.08 True
+torch_eager cuda_B2_D2048_S512_W4 0.08 True
+torch_eager cuda_B2_D64_S128_W2 0.07 True
+torch_eager cuda_B2_D64_S128_W4 0.09 True
+torch_eager cuda_B2_D64_S2048_W2 0.09 True
+torch_eager cuda_B2_D64_S2048_W4 0.08 True
+torch_eager cuda_B2_D64_S512_W2 0.09 True
+torch_eager cuda_B2_D64_S512_W4 0.09 True
+torch_eager cuda_B4_D2048_S128_W2 0.09 True
+torch_eager cuda_B4_D2048_S128_W4 0.08 True
+torch_eager cuda_B4_D2048_S2048_W2 0.49 True
+torch_eager cuda_B4_D2048_S2048_W4 0.50 True
+torch_eager cuda_B4_D2048_S512_W2 0.09 True
+torch_eager cuda_B4_D2048_S512_W4 0.10 True
+torch_eager cuda_B4_D64_S128_W2 0.08 True
+torch_eager cuda_B4_D64_S128_W4 0.08 True
+torch_eager cuda_B4_D64_S2048_W2 0.08 True
+torch_eager cuda_B4_D64_S2048_W4 0.09 True
+torch_eager cuda_B4_D64_S512_W2 0.08 True
+torch_eager cuda_B4_D64_S512_W4 0.08 True
+
+
+
+
+Installed 37 packages in 224ms
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/causal_conv1d/index.html b/causal_conv1d/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..41313e21299f746daf8b9b76fbbb22687cf02763
--- /dev/null
+++ b/causal_conv1d/index.html
@@ -0,0 +1,89 @@
+
+
+
+
+
+ Index of /causal_conv1d
+
+
+
+
+ Index of /causal_conv1d
+
+
+
\ No newline at end of file
diff --git a/causal_conv1d/results/artifacts/combine/latency.svg b/causal_conv1d/results/artifacts/combine/latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..73369f1354843caf2bfe58e9b9dbb5f0a7c81b2a
--- /dev/null
+++ b/causal_conv1d/results/artifacts/combine/latency.svg
@@ -0,0 +1,530 @@
+
+
\ No newline at end of file
diff --git a/causal_conv1d/results/cells/combine.py b/causal_conv1d/results/cells/combine.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb0f868d7f37b547e4fd981763d6af8a3bca13dd
--- /dev/null
+++ b/causal_conv1d/results/cells/combine.py
@@ -0,0 +1,26 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+ "HF Kernels Causal Conv1D": "UVNOTE_FILE_HF_KERNELS_CAUSAL_CONV1D_BENCHMARK",
+ "PyTorch Causal Conv1D": "UVNOTE_FILE_TORCH_CAUSAL_CONV1D_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+ cache_env_map=cache_env_map,
+ output_filename="causal_conv1d.jsonl",
+ svg_filename="latency.svg"
+)
\ No newline at end of file
diff --git a/causal_conv1d/results/combined_results.html b/causal_conv1d/results/combined_results.html
new file mode 100644
index 0000000000000000000000000000000000000000..b113ec63f04aaf43b21a9692d5686dbfeb8f2510
--- /dev/null
+++ b/causal_conv1d/results/combined_results.html
@@ -0,0 +1,5106 @@
+
+
+
+
+
+ Causal Conv1D Benchmark - Combined Results
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
Causal Conv1D Benchmarks - Aggregated Results
+
This document combines benchmark results from multiple Causal Conv1D implementations.
+
Combined Summary and Visualization
+
+
+
+
+
+
+
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+ "HF Kernels Causal Conv1D": "UVNOTE_FILE_HF_KERNELS_CAUSAL_CONV1D_BENCHMARK",
+ "PyTorch Causal Conv1D": "UVNOTE_FILE_TORCH_CAUSAL_CONV1D_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+ cache_env_map=cache_env_map,
+ output_filename="causal_conv1d.jsonl",
+ svg_filename="latency.svg"
+)
+
+
+
+
+
+
+
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ HF Kernels Causal Conv1D : /__w/kernels-benchmarks/kernels-benchmarks/benches/causal_conv1d/impls/.uvnote/cache/7a691bd653e23c412c5d29fbc92ea1454823ea437864cf9473fc561b116ef3d9
+✓ PyTorch Causal Conv1D : /__w/kernels-benchmarks/kernels-benchmarks/benches/causal_conv1d/impls/.uvnote/cache/70757e27f2df1dfde4905a24527bb4ca6f0f8df7dac2e2ecaa0ddc359c7d5e64
+
+ ✓ Found HF Kernels Causal Conv1D
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/causal_conv1d/impls/.uvnote/cache/7a691bd653e23c412c5d29fbc92ea1454823ea437864cf9473fc561b116ef3d9/causal_conv1d.jsonl
+ ✓ Found PyTorch Causal Conv1D
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/causal_conv1d/impls/.uvnote/cache/70757e27f2df1dfde4905a24527bb4ca6f0f8df7dac2e2ecaa0ddc359c7d5e64/causal_conv1d.jsonl
+
+======================================================================
+Summary: 2 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl wl p50(ms) ok
+hf_kernels_causal_conv1d cuda_B2_D2048_S128_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D2048_S128_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D2048_S2048_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D2048_S512_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D2048_S512_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S128_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S2048_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B2_D64_S512_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D2048_S128_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D2048_S128_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D2048_S2048_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D2048_S512_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D2048_S512_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S128_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S128_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True
+hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
+torch_eager cuda_B2_D2048_S128_W2 0.08 True
+torch_eager cuda_B2_D2048_S128_W4 0.09 True
+torch_eager cuda_B2_D2048_S2048_W2 0.15 True
+torch_eager cuda_B2_D2048_S2048_W4 0.16 True
+torch_eager cuda_B2_D2048_S512_W2 0.08 True
+torch_eager cuda_B2_D2048_S512_W4 0.08 True
+torch_eager cuda_B2_D64_S128_W2 0.07 True
+torch_eager cuda_B2_D64_S128_W4 0.09 True
+torch_eager cuda_B2_D64_S2048_W2 0.09 True
+torch_eager cuda_B2_D64_S2048_W4 0.08 True
+torch_eager cuda_B2_D64_S512_W2 0.09 True
+torch_eager cuda_B2_D64_S512_W4 0.09 True
+torch_eager cuda_B4_D2048_S128_W2 0.09 True
+torch_eager cuda_B4_D2048_S128_W4 0.08 True
+torch_eager cuda_B4_D2048_S2048_W2 0.49 True
+torch_eager cuda_B4_D2048_S2048_W4 0.50 True
+torch_eager cuda_B4_D2048_S512_W2 0.09 True
+torch_eager cuda_B4_D2048_S512_W4 0.10 True
+torch_eager cuda_B4_D64_S128_W2 0.08 True
+torch_eager cuda_B4_D64_S128_W4 0.08 True
+torch_eager cuda_B4_D64_S2048_W2 0.08 True
+torch_eager cuda_B4_D64_S2048_W4 0.09 True
+torch_eager cuda_B4_D64_S512_W2 0.08 True
+torch_eager cuda_B4_D64_S512_W4 0.08 True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 48 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 2
+
+Implementations included:
+ ✓ HF Kernels Causal Conv1D
+ ✓ PyTorch Causal Conv1D
+
+
+
+
+Installed 37 packages in 239ms
+
+
+
+
Artifacts:
+
latency.svg
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/causal_conv1d/results/index.html b/causal_conv1d/results/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..ad7d58a5d5aa5f369cbfdb2b39008b8deb1383b1
--- /dev/null
+++ b/causal_conv1d/results/index.html
@@ -0,0 +1,88 @@
+
+
+
+
+
+ Index of /causal_conv1d/results
+
+
+
+
+ Index of /causal_conv1d/results
+
+
+
\ No newline at end of file
diff --git a/flash_attn/impls/artifacts/benchmark/attention.jsonl b/flash_attn/impls/artifacts/benchmark/attention.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..43081d2c5637964960d306de705532da87d93bb1
--- /dev/null
+++ b/flash_attn/impls/artifacts/benchmark/attention.jsonl
@@ -0,0 +1,6 @@
+{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9715130000245153, "p50": 0.9773340000265307, "p90": 0.9788430000412518, "mean": 0.976309200018477, "iqr": 0.005310000005920301, "raw_times": [0.9735330000353315, 0.9773340000265307, 0.9803229999647556, 0.9788430000412518, 0.9715130000245153], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9926440000072034, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0154749999742307, "p50": 1.0199449999959143, "p90": 1.0278160000325443, "mean": 1.0223952000046665, "iqr": 0.010921000011876458, "raw_times": [1.0278160000325443, 1.0168950000206678, 1.0318449999999757, 1.0154749999742307, 1.0199449999959143], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0225849999869752, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0612160000391668, "p50": 1.0721770000259312, "p90": 1.075397000022349, "mean": 1.0706886000093618, "iqr": 0.009251000051335723, "raw_times": [1.0612160000391668, 1.0721770000259312, 1.0661459999710132, 1.075397000022349, 1.078506999988349], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0771669999485312, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.075485999990633, "p50": 1.0823069999901236, "p90": 1.084176999995634, "mean": 1.0827727999981107, "iqr": 0.0021099999685247894, "raw_times": [1.075485999990633, 1.0820670000271093, 1.0823069999901236, 1.0898269999870536, 1.084176999995634], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1057869999717695, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2330920000067636, "p50": 1.237381999999343, "p90": 1.239422999958606, "mean": 1.2375224000038543, "iqr": 0.002220999931523693, "raw_times": [1.2405130000274767, 1.2372020000270822, 1.2330920000067636, 1.237381999999343, 1.239422999958606], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.22687200001792, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2296720000222194, "p50": 1.230811999960224, "p90": 1.236231999996562, "mean": 1.2357499999893662, "iqr": 0.005929999986165058, "raw_times": [1.236231999996562, 1.2517319999574283, 1.230811999960224, 1.230302000010397, 1.2296720000222194], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2250920000269616, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..04ae262009c3d6e33aaa3e392d28c903f24c287c
--- /dev/null
+++ b/flash_attn/impls/cells/benchmark.py
@@ -0,0 +1,30 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "xformers",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+import xformers.ops as xops
+
+
+def xformers_attention(q, k, v):
+ """xFormers memory efficient attention"""
+ # xFormers expects [batch, seq_len, heads, head_dim]
+ return xops.memory_efficient_attention(q, k, v)
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ATTENTION,
+ impl_name="xformers_meff",
+ impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
+ impl_func=xformers_attention,
+)
\ No newline at end of file
diff --git a/flash_attn/impls/cells/nv.py b/flash_attn/impls/cells/nv.py
new file mode 100644
index 0000000000000000000000000000000000000000..80eef60a7536ed875fb21731ab2d059458bd20b4
--- /dev/null
+++ b/flash_attn/impls/cells/nv.py
@@ -0,0 +1,3 @@
+import subprocess
+
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
\ No newline at end of file
diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html
new file mode 100644
index 0000000000000000000000000000000000000000..b4d18851c72f81eb2cee29787f66e13729c42139
--- /dev/null
+++ b/flash_attn/impls/flash_attention.html
@@ -0,0 +1,4177 @@
+
+
+
+
+
+ flash_attention
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
Flash Attention Implementation
+
GPU Info
+
+
+
+
+
import subprocess
+
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+
+
+
+
+
+
Wed Oct 29 00:36:31 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
+|-----------------------------------------+------------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+========================+======================|
+| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+| N/A 32C P0 151W / 350W | 0MiB / 46068MiB | 86% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=========================================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+
+
Flash Attention Benchmark
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_flash(q, k, v):
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+ with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+ return o.transpose(1, 2).contiguous()
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ATTENTION,
+ impl_name="torch_flash_ma",
+ impl_tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
+ impl_func=torch_flash,
+)
+
+
+
+
+
+
+
Running attention benchmark on cuda with 6 workloads.
+
+======================================================================
+PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.578ms 102.17% 3.578ms 3.578ms 1
+ torch_flash_ma 6.87% 353.422us 46.38% 2.386ms 2.386ms 0.000us 0.00% 3.542ms 3.542ms 1
+ aten::scaled_dot_product_attention 0.81% 41.691us 4.31% 221.887us 73.962us 0.000us 0.00% 2.788ms 929.262us 3
+ aten::_scaled_dot_product_flash_attention 0.53% 27.420us 3.50% 180.196us 60.065us 0.000us 0.00% 2.788ms 929.262us 3
+ aten::_flash_attention_forward 0.77% 39.803us 2.56% 131.456us 43.819us 2.788ms 79.61% 2.788ms 929.262us 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.788ms 79.61% 2.788ms 929.262us 3
+ aten::contiguous 0.28% 14.581us 33.97% 1.748ms 145.626us 0.000us 0.00% 754.272us 62.856us 12
+ aten::clone 0.77% 39.360us 33.69% 1.733ms 144.411us 0.000us 0.00% 754.272us 62.856us 12
+ aten::copy_ 1.64% 84.313us 31.38% 1.614ms 134.494us 713.920us 20.39% 754.272us 62.856us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 713.920us 20.39% 713.920us 59.493us 12
+ Activity Buffer Request 27.68% 1.424ms 27.68% 1.424ms 1.424ms 40.352us 1.15% 40.352us 40.352us 1
+ aten::transpose 1.22% 62.617us 1.64% 84.135us 3.506us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.42% 21.518us 0.42% 21.518us 0.897us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.49% 25.079us 1.99% 102.243us 6.816us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.77% 91.033us 1.77% 91.033us 3.793us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 2.57% 132.402us 2.57% 132.402us 8.827us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.32% 16.702us 0.32% 16.702us 5.567us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.05% 2.750us 0.05% 2.750us 0.458us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.17% 9.001us 0.17% 9.001us 3.000us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 53.62% 2.758ms 53.62% 2.758ms 2.758ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.144ms
+Self CUDA time total: 3.502ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_flash_ma 4.93% 257.698us 42.06% 2.199ms 2.199ms 0.000us 0.00% 3.742ms 3.742ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.698ms 100.30% 3.698ms 3.698ms 1
+ aten::scaled_dot_product_attention 0.48% 25.212us 3.48% 182.067us 60.689us 0.000us 0.00% 2.929ms 976.488us 3
+ aten::_scaled_dot_product_flash_attention 0.39% 20.471us 3.00% 156.855us 52.285us 0.000us 0.00% 2.929ms 976.488us 3
+ aten::_flash_attention_forward 0.74% 38.430us 2.18% 114.074us 38.025us 2.929ms 79.45% 2.929ms 976.488us 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.929ms 79.45% 2.929ms 976.488us 3
+ aten::contiguous 0.17% 9.122us 32.76% 1.713ms 142.713us 0.000us 0.00% 812.318us 67.693us 12
+ aten::clone 0.59% 31.068us 32.59% 1.703ms 141.953us 0.000us 0.00% 812.318us 67.693us 12
+ aten::copy_ 1.50% 78.513us 30.83% 1.612ms 134.315us 757.726us 20.55% 812.318us 67.693us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 757.726us 20.55% 757.726us 63.144us 12
+ Activity Buffer Request 27.74% 1.450ms 27.74% 1.450ms 1.450ms 54.592us 1.48% 54.592us 54.592us 1
+ aten::transpose 0.99% 51.637us 1.32% 68.781us 2.866us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.33% 17.144us 0.33% 17.144us 0.714us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.41% 21.274us 1.52% 79.248us 5.283us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.40% 73.206us 1.40% 73.206us 3.050us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 2.03% 106.061us 2.03% 106.061us 7.071us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.26% 13.410us 0.26% 13.410us 4.470us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.04% 1.900us 0.04% 1.900us 0.317us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.07% 3.830us 0.07% 3.830us 1.277us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 57.94% 3.028ms 57.94% 3.028ms 3.028ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.227ms
+Self CUDA time total: 3.687ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_flash_ma 4.92% 259.759us 41.31% 2.182ms 2.182ms 0.000us 0.00% 3.825ms 3.825ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.778ms 100.30% 3.778ms 3.778ms 1
+ aten::scaled_dot_product_attention 0.46% 24.480us 3.48% 183.685us 61.228us 0.000us 0.00% 2.990ms 996.566us 3
+ aten::_scaled_dot_product_flash_attention 0.36% 18.972us 3.01% 159.205us 53.068us 0.000us 0.00% 2.990ms 996.566us 3
+ aten::_flash_attention_forward 0.75% 39.470us 2.21% 116.583us 38.861us 2.990ms 79.38% 2.990ms 996.566us 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.990ms 79.38% 2.990ms 996.566us 3
+ aten::contiguous 0.20% 10.370us 32.06% 1.693ms 141.118us 0.000us 0.00% 835.605us 69.634us 12
+ aten::clone 0.56% 29.562us 31.86% 1.683ms 140.254us 0.000us 0.00% 835.605us 69.634us 12
+ aten::copy_ 1.55% 81.613us 30.00% 1.585ms 132.057us 776.758us 20.62% 835.605us 69.634us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 776.758us 20.62% 776.758us 64.730us 12
+ Activity Buffer Request 26.94% 1.423ms 26.94% 1.423ms 1.423ms 58.847us 1.56% 58.847us 58.847us 1
+ aten::transpose 0.97% 51.460us 1.30% 68.660us 2.861us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.33% 17.200us 0.33% 17.200us 0.717us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.39% 20.693us 1.67% 88.333us 5.889us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.54% 81.451us 1.54% 81.451us 3.394us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 1.97% 104.004us 1.97% 104.004us 6.934us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.28% 14.530us 0.28% 14.530us 4.843us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.04% 1.902us 0.04% 1.902us 0.317us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.07% 3.600us 0.07% 3.600us 1.200us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 58.69% 3.100ms 58.69% 3.100ms 3.100ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.282ms
+Self CUDA time total: 3.766ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_flash_ma 4.63% 260.119us 43.14% 2.422ms 2.422ms 0.000us 0.00% 3.911ms 3.911ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.865ms 100.31% 3.865ms 3.865ms 1
+ aten::scaled_dot_product_attention 0.43% 24.361us 3.22% 180.586us 60.195us 0.000us 0.00% 3.069ms 1.023ms 3
+ aten::_scaled_dot_product_flash_attention 0.35% 19.401us 2.78% 156.225us 52.075us 0.000us 0.00% 3.069ms 1.023ms 3
+ aten::_flash_attention_forward 0.68% 38.111us 2.03% 114.053us 38.018us 3.069ms 79.64% 3.069ms 1.023ms 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.069ms 79.64% 3.069ms 1.023ms 3
+ aten::contiguous 0.17% 9.669us 34.46% 1.935ms 161.211us 0.000us 0.00% 842.147us 70.179us 12
+ aten::clone 0.54% 30.453us 34.29% 1.925ms 160.405us 0.000us 0.00% 842.147us 70.179us 12
+ aten::copy_ 1.42% 79.471us 32.63% 1.832ms 152.656us 784.675us 20.36% 842.147us 70.179us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 784.675us 20.36% 784.675us 65.390us 12
+ Activity Buffer Request 26.20% 1.471ms 26.20% 1.471ms 1.471ms 57.472us 1.49% 57.472us 57.472us 1
+ aten::transpose 0.92% 51.697us 1.23% 69.261us 2.886us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.31% 17.564us 0.31% 17.564us 0.732us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.36% 20.299us 1.45% 81.452us 5.430us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.34% 75.405us 1.34% 75.405us 3.142us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 5.43% 304.654us 5.43% 304.654us 20.310us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.25% 13.960us 0.25% 13.960us 4.653us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.03% 1.839us 0.03% 1.839us 0.306us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.07% 3.750us 0.07% 3.750us 1.250us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 56.86% 3.192ms 56.86% 3.192ms 3.192ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.614ms
+Self CUDA time total: 3.854ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_flash_ma 5.20% 312.192us 40.27% 2.420ms 2.420ms 0.000us 0.00% 4.370ms 4.370ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.320ms 100.26% 4.320ms 4.320ms 1
+ aten::scaled_dot_product_attention 0.42% 25.401us 3.13% 188.317us 62.772us 0.000us 0.00% 3.499ms 1.166ms 3
+ aten::_scaled_dot_product_flash_attention 0.34% 20.373us 2.71% 162.916us 54.305us 0.000us 0.00% 3.499ms 1.166ms 3
+ aten::_flash_attention_forward 0.70% 41.822us 1.99% 119.463us 39.821us 3.499ms 81.21% 3.499ms 1.166ms 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.499ms 81.21% 3.499ms 1.166ms 3
+ aten::contiguous 0.17% 10.061us 31.18% 1.873ms 156.120us 0.000us 0.00% 870.813us 72.568us 12
+ aten::clone 0.51% 30.510us 31.01% 1.863ms 155.281us 0.000us 0.00% 870.813us 72.568us 12
+ aten::copy_ 1.32% 79.253us 29.46% 1.770ms 147.488us 809.726us 18.79% 870.813us 72.568us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 809.726us 18.79% 809.726us 67.477us 12
+ Activity Buffer Request 23.71% 1.425ms 23.71% 1.425ms 1.425ms 61.087us 1.42% 61.087us 61.087us 1
+ aten::transpose 0.85% 51.371us 1.15% 68.940us 2.873us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.29% 17.569us 0.29% 17.569us 0.732us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.34% 20.420us 1.39% 83.415us 5.561us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.27% 76.235us 1.27% 76.235us 3.176us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 4.81% 288.717us 4.81% 288.717us 19.248us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.26% 15.360us 0.26% 15.360us 5.120us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.03% 1.980us 0.03% 1.980us 0.330us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.06% 3.780us 0.06% 3.780us 1.260us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 59.73% 3.589ms 59.73% 3.589ms 3.589ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 6.009ms
+Self CUDA time total: 4.309ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_flash_ma 4.62% 283.749us 39.30% 2.416ms 2.416ms 0.000us 0.00% 4.488ms 4.488ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.437ms 100.26% 4.437ms 4.437ms 1
+ aten::scaled_dot_product_attention 0.41% 25.050us 2.99% 183.606us 61.202us 0.000us 0.00% 3.606ms 1.202ms 3
+ aten::_scaled_dot_product_flash_attention 0.32% 19.512us 2.58% 158.556us 52.852us 0.000us 0.00% 3.606ms 1.202ms 3
+ aten::_flash_attention_forward 0.64% 39.583us 1.89% 116.223us 38.741us 3.606ms 81.47% 3.606ms 1.202ms 3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.606ms 81.47% 3.606ms 1.202ms 3
+ aten::contiguous 0.16% 9.930us 30.93% 1.901ms 158.420us 0.000us 0.00% 882.206us 73.517us 12
+ aten::clone 0.49% 30.220us 30.76% 1.891ms 157.592us 0.000us 0.00% 882.206us 73.517us 12
+ aten::copy_ 1.34% 82.326us 29.23% 1.797ms 149.726us 820.351us 18.53% 882.206us 73.517us 12
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 820.351us 18.53% 820.351us 68.363us 12
+ Activity Buffer Request 23.42% 1.439ms 23.42% 1.439ms 1.439ms 61.855us 1.40% 61.855us 61.855us 1
+ aten::transpose 0.85% 52.248us 1.14% 70.082us 2.920us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.29% 17.834us 0.29% 17.834us 0.743us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.33% 20.531us 1.36% 83.782us 5.585us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.26% 77.251us 1.26% 77.251us 3.219us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 4.84% 297.592us 4.84% 297.592us 19.839us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.24% 14.660us 0.24% 14.660us 4.887us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.03% 1.929us 0.03% 1.929us 0.321us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.06% 3.839us 0.06% 3.839us 1.280us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 60.70% 3.731ms 60.70% 3.731ms 3.731ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 6.147ms
+Self CUDA time total: 4.426ms
+
+
+impl wl p50(ms) ok
+torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
+torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
+torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
+torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
+torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
+torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html
new file mode 100644
index 0000000000000000000000000000000000000000..4817c7014f4b0c071581df7216c8b47278369487
--- /dev/null
+++ b/flash_attn/impls/hf_kernels_flash_attn.html
@@ -0,0 +1,4088 @@
+
+
+
+
+
+ hf_kernels_flash_attn
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
HF Kernels - Flash Attention
+
HuggingFace Kernels Flash Attention Benchmark
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the flash attention kernel
+hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn")
+
+
+def hf_flash_attention(query, key, value):
+ """HuggingFace Kernels Flash Attention"""
+ return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0]
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ATTENTION,
+ impl_name="hf_kernels_flash_attn",
+ impl_tags={"family": "hf-kernels", "backend": "flash-attn", "compile": "none"},
+ impl_func=hf_flash_attention,
+)
+
+
+
+
+
+
+
Running attention benchmark on cuda with 6 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 3.54% 153.223us 41.10% 1.781ms 1.781ms 0.000us 0.00% 3.710ms 3.710ms 1
+ _flash_attn_9e27194::fwd 1.64% 71.013us 37.57% 1.628ms 542.522us 2.765ms 100.00% 3.710ms 1.237ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.766ms 100.05% 2.766ms 2.766ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.765ms 100.00% 2.765ms 921.626us 3
+ Activity Buffer Request 32.85% 1.423ms 32.85% 1.423ms 1.423ms 945.530us 34.20% 945.530us 945.530us 1
+ cudaDeviceGetAttribute 0.11% 4.920us 0.11% 4.920us 0.328us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.37% 16.201us 1.19% 51.582us 17.194us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.82% 35.381us 0.82% 35.381us 11.794us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.55% 23.891us 0.55% 23.891us 2.655us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.27% 11.501us 0.27% 11.501us 3.834us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.96% 41.661us 0.96% 41.661us 13.887us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 58.90% 2.552ms 58.90% 2.552ms 2.552ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.332ms
+Self CUDA time total: 2.765ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 1.95% 87.173us 36.43% 1.628ms 1.628ms 0.000us 0.00% 3.993ms 3.993ms 1
+ _flash_attn_9e27194::fwd 1.10% 49.286us 34.48% 1.541ms 513.554us 2.982ms 100.00% 3.993ms 1.331ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.984ms 100.06% 2.984ms 2.984ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.982ms 100.00% 2.982ms 993.983us 3
+ Activity Buffer Request 31.65% 1.414ms 31.65% 1.414ms 1.414ms 1.011ms 33.92% 1.011ms 1.011ms 1
+ cudaDeviceGetAttribute 0.09% 3.827us 0.09% 3.827us 0.255us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.16% 7.330us 0.51% 22.831us 7.610us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.35% 15.501us 0.35% 15.501us 5.167us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.46% 20.669us 0.46% 20.669us 2.297us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.08% 3.520us 0.08% 3.520us 1.173us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.59% 26.211us 0.59% 26.211us 8.737us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 63.57% 2.841ms 63.57% 2.841ms 2.841ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.469ms
+Self CUDA time total: 2.982ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.39% 107.943us 36.87% 1.664ms 1.664ms 0.000us 0.00% 4.011ms 4.011ms 1
+ _flash_attn_9e27194::fwd 1.08% 48.663us 34.47% 1.556ms 518.528us 2.994ms 100.00% 4.011ms 1.337ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.996ms 100.05% 2.996ms 2.996ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.994ms 100.00% 2.994ms 998.054us 3
+ Activity Buffer Request 31.64% 1.428ms 31.64% 1.428ms 1.428ms 1.017ms 33.96% 1.017ms 1.017ms 1
+ cudaDeviceGetAttribute 0.09% 4.050us 0.09% 4.050us 0.270us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.16% 7.029us 0.54% 24.521us 8.174us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.39% 17.492us 0.39% 17.492us 5.831us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.46% 20.589us 0.46% 20.589us 2.288us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.08% 3.660us 0.08% 3.660us 1.220us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.59% 26.452us 0.59% 26.452us 8.817us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 63.13% 2.849ms 63.13% 2.849ms 2.849ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.512ms
+Self CUDA time total: 2.994ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.37% 113.154us 39.04% 1.864ms 1.864ms 0.000us 0.00% 4.086ms 4.086ms 1
+ _flash_attn_9e27194::fwd 1.02% 48.863us 36.67% 1.751ms 583.543us 3.059ms 100.00% 4.086ms 1.362ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.060ms 100.05% 3.060ms 3.060ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.059ms 100.00% 3.059ms 1.020ms 3
+ Activity Buffer Request 29.92% 1.429ms 29.92% 1.429ms 1.429ms 1.027ms 33.57% 1.027ms 1.027ms 1
+ cudaDeviceGetAttribute 0.08% 3.821us 0.08% 3.821us 0.255us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.16% 7.819us 0.54% 25.920us 8.640us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.38% 18.101us 0.38% 18.101us 6.034us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.44% 21.109us 0.44% 21.109us 2.345us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.08% 3.840us 0.08% 3.840us 1.280us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 4.58% 218.538us 4.58% 218.538us 72.846us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 60.96% 2.910ms 60.96% 2.910ms 2.910ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.774ms
+Self CUDA time total: 3.059ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.11% 109.115us 34.87% 1.804ms 1.804ms 0.000us 0.00% 4.702ms 4.702ms 1
+ _flash_attn_9e27194::fwd 0.94% 48.879us 32.76% 1.695ms 565.076us 3.518ms 100.00% 4.702ms 1.567ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.519ms 100.04% 3.519ms 3.519ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.518ms 100.00% 3.518ms 1.173ms 3
+ Activity Buffer Request 27.57% 1.427ms 27.57% 1.427ms 1.427ms 1.184ms 33.66% 1.184ms 1.184ms 1
+ cudaDeviceGetAttribute 0.07% 3.810us 0.07% 3.810us 0.254us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.14% 7.040us 0.48% 25.061us 8.354us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.35% 18.021us 0.35% 18.021us 6.007us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.40% 20.762us 0.40% 20.762us 2.307us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.07% 3.731us 0.07% 3.731us 1.244us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.21% 166.285us 3.21% 166.285us 55.428us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 65.13% 3.370ms 65.13% 3.370ms 3.370ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.175ms
+Self CUDA time total: 3.518ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.00% 105.404us 33.86% 1.781ms 1.781ms 0.000us 0.00% 4.846ms 4.846ms 1
+ _flash_attn_9e27194::fwd 0.97% 50.822us 31.86% 1.675ms 558.446us 3.623ms 100.00% 4.846ms 1.615ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.624ms 100.04% 3.624ms 3.624ms 1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.623ms 100.00% 3.623ms 1.208ms 3
+ Activity Buffer Request 26.72% 1.405ms 26.72% 1.405ms 1.405ms 1.223ms 33.77% 1.223ms 1.223ms 1
+ cudaDeviceGetAttribute 0.08% 4.369us 0.08% 4.369us 0.291us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.15% 7.679us 0.48% 25.141us 8.380us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.33% 17.462us 0.33% 17.462us 5.821us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.40% 21.081us 0.40% 21.081us 2.342us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.07% 3.770us 0.07% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.13% 164.746us 3.13% 164.746us 54.915us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 66.14% 3.478ms 66.14% 3.478ms 3.478ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.259ms
+Self CUDA time total: 3.623ms
+
+
+impl wl p50(ms) ok
+hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
+hf_kernels_flash_attn cuda_attn_L256_bfloat16 0.99 True
+hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.04 True
+hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
+hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
+hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True
+
+
+Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
+Fetching 20 files: 10%|█ | 2/20 [00:01<00:16, 1.12it/s]
+Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 11.15it/s]
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html
new file mode 100644
index 0000000000000000000000000000000000000000..5b8a9bc38528f0c5161a3d259e80bf519be70a90
--- /dev/null
+++ b/flash_attn/impls/hf_kernels_flash_attn3.html
@@ -0,0 +1,4081 @@
+
+
+
+
+
+ hf_kernels_flash_attn3
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
HF Kernels - Flash Attention 3
+
HuggingFace Kernels Flash Attention 3 Benchmark
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the flash attention 3 kernel
+hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
+
+
+def hf_flash_attention3(query, key, value):
+ return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ATTENTION,
+ impl_name="hf_kernels_flash_attn3",
+ impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
+ impl_func=hf_flash_attention3,
+)
+
+
+
+
+
+
+
Running attention benchmark on cuda with 6 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 3.80% 163.585us 44.55% 1.916ms 1.916ms 0.000us 0.00% 3.598ms 3.598ms 1
+ FlashAttnFunc 3.38% 145.315us 40.75% 1.753ms 584.213us 0.000us 0.00% 3.598ms 1.199ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.86% 80.133us 37.37% 1.607ms 535.775us 2.702ms 100.00% 3.598ms 1.199ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.704ms 100.05% 2.704ms 2.704ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.702ms 100.00% 2.702ms 900.800us 3
+ Activity Buffer Request 33.08% 1.423ms 33.08% 1.423ms 1.423ms 895.776us 33.15% 895.776us 895.776us 1
+ aten::empty 1.02% 43.812us 1.02% 43.812us 7.302us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.30% 13.081us 0.30% 13.081us 4.360us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.10% 47.211us 1.10% 47.211us 15.737us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 55.45% 2.385ms 55.45% 2.385ms 2.385ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.301ms
+Self CUDA time total: 2.702ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 2.35% 101.013us 40.06% 1.725ms 1.725ms 0.000us 0.00% 3.751ms 3.751ms 1
+ FlashAttnFunc 2.16% 92.983us 37.71% 1.624ms 541.352us 0.000us 0.00% 3.751ms 1.250ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.19% 51.175us 35.55% 1.531ms 510.358us 2.802ms 100.00% 3.751ms 1.250ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.803ms 100.06% 2.803ms 2.803ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.802ms 100.00% 2.802ms 933.921us 3
+ Activity Buffer Request 32.90% 1.417ms 32.90% 1.417ms 1.417ms 949.686us 33.90% 949.686us 949.686us 1
+ aten::empty 0.63% 27.091us 0.63% 27.091us 4.515us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.239us 0.12% 5.239us 1.746us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.72% 30.870us 0.72% 30.870us 10.290us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 59.94% 2.581ms 59.94% 2.581ms 2.581ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.306ms
+Self CUDA time total: 2.802ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 2.33% 100.994us 40.09% 1.739ms 1.739ms 0.000us 0.00% 3.778ms 3.778ms 1
+ FlashAttnFunc 2.19% 94.944us 37.76% 1.638ms 545.852us 0.000us 0.00% 3.778ms 1.259ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.20% 52.112us 35.57% 1.543ms 514.204us 2.819ms 100.00% 3.778ms 1.259ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.820ms 100.05% 2.820ms 2.820ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.819ms 100.00% 2.819ms 939.550us 3
+ Activity Buffer Request 32.79% 1.422ms 32.79% 1.422ms 1.422ms 959.198us 34.03% 959.198us 959.198us 1
+ aten::empty 0.60% 26.051us 0.60% 26.051us 4.342us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.409us 0.12% 5.409us 1.803us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.85% 36.931us 0.85% 36.931us 12.310us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 59.91% 2.599ms 59.91% 2.599ms 2.599ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.337ms
+Self CUDA time total: 2.819ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 2.88% 135.094us 43.08% 2.020ms 2.020ms 0.000us 0.00% 3.874ms 3.874ms 1
+ FlashAttnFunc 2.10% 98.504us 40.20% 1.885ms 628.185us 0.000us 0.00% 3.874ms 1.291ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.10% 51.632us 38.10% 1.786ms 595.350us 2.895ms 100.00% 3.874ms 1.291ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.897ms 100.06% 2.897ms 2.897ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.895ms 100.00% 2.895ms 965.011us 3
+ Activity Buffer Request 30.58% 1.434ms 30.58% 1.434ms 1.434ms 979.229us 33.82% 979.229us 979.229us 1
+ aten::empty 0.58% 27.080us 0.58% 27.080us 4.513us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 5.380us 0.11% 5.380us 1.793us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 5.72% 268.289us 5.72% 268.289us 89.430us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 56.92% 2.668ms 56.92% 2.668ms 2.668ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.688ms
+Self CUDA time total: 2.895ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 2.52% 128.963us 37.26% 1.903ms 1.903ms 0.000us 0.00% 4.575ms 4.575ms 1
+ FlashAttnFunc 1.87% 95.425us 34.74% 1.774ms 591.441us 0.000us 0.00% 4.575ms 1.525ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.01% 51.593us 32.87% 1.679ms 559.632us 3.427ms 100.00% 4.575ms 1.525ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.429ms 100.05% 3.429ms 3.429ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.427ms 100.00% 3.427ms 1.142ms 3
+ Activity Buffer Request 27.82% 1.421ms 27.82% 1.421ms 1.421ms 1.148ms 33.49% 1.148ms 1.148ms 1
+ aten::empty 0.55% 28.251us 0.55% 28.251us 4.709us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.10% 5.249us 0.10% 5.249us 1.750us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.38% 172.866us 3.38% 172.866us 57.622us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 62.74% 3.205ms 62.74% 3.205ms 3.205ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.108ms
+Self CUDA time total: 3.427ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 2.37% 119.165us 36.69% 1.842ms 1.842ms 0.000us 0.00% 4.545ms 4.545ms 1
+ FlashAttnFunc 1.86% 93.463us 34.32% 1.723ms 574.423us 0.000us 0.00% 4.545ms 1.515ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.01% 50.561us 32.46% 1.630ms 543.268us 3.398ms 100.00% 4.545ms 1.515ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.400ms 100.05% 3.400ms 3.400ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.398ms 100.00% 3.398ms 1.133ms 3
+ Activity Buffer Request 27.47% 1.379ms 27.47% 1.379ms 1.379ms 1.147ms 33.76% 1.147ms 1.147ms 1
+ aten::empty 0.56% 28.202us 0.56% 28.202us 4.700us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.10% 5.090us 0.10% 5.090us 1.697us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.32% 166.515us 3.32% 166.515us 55.505us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 63.31% 3.179ms 63.31% 3.179ms 3.179ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.022ms
+Self CUDA time total: 3.398ms
+
+
+impl wl p50(ms) ok
+hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
+hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
+hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
+hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
+hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
+hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
+
+
+
+
+Downloading hf-xet (3.2MiB)
+ Downloading hf-xet
+Installed 15 packages in 15ms
+
+
+
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
+Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.06it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.12it/s]
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/flash_attn/impls/index.html b/flash_attn/impls/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..59a4fb994db1d910bbc3e0e4a28c04e81908a615
--- /dev/null
+++ b/flash_attn/impls/index.html
@@ -0,0 +1,93 @@
+
+
+
+
+
+ Index of /flash_attn/impls
+
+
+
+
+ Index of /flash_attn/impls
+
+
+
\ No newline at end of file
diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html
new file mode 100644
index 0000000000000000000000000000000000000000..9802f3f658c65e57e8526f2af8160462ea71be6e
--- /dev/null
+++ b/flash_attn/impls/mem_efficient_attention.html
@@ -0,0 +1,4175 @@
+
+
+
+
+
+ mem_efficient_attention
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
Memory Efficient Attention Implementation
+
Memory Efficient SDPA Benchmark
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_mem_eff(q, k, v):
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+ with torch.nn.attention.sdpa_kernel(
+ torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION
+ ):
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+ return o.transpose(1, 2).contiguous()
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ATTENTION,
+ impl_name="torch_mem_eff",
+ impl_tags={"family": "torch-sdpa", "backend": "EFFICIENT", "compile": "none"},
+ impl_func=torch_mem_eff,
+)
+
+
+
+
+
+
+
Running attention benchmark on cuda with 6 workloads.
+
+======================================================================
+PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_mem_eff 4.95% 352.351us 32.76% 2.334ms 2.334ms 0.000us 0.00% 5.540ms 5.540ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.523ms 100.61% 5.523ms 5.523ms 1
+ aten::scaled_dot_product_attention 0.42% 30.002us 2.65% 188.407us 62.802us 0.000us 0.00% 4.866ms 1.622ms 3
+ aten::_scaled_dot_product_efficient_attention 0.34% 24.112us 2.22% 158.405us 52.802us 0.000us 0.00% 4.866ms 1.622ms 3
+ aten::_efficient_attention_forward 0.50% 35.512us 1.50% 106.553us 35.518us 4.866ms 88.65% 4.866ms 1.622ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.866ms 88.65% 4.866ms 1.622ms 3
+ aten::contiguous 0.17% 12.230us 24.19% 1.723ms 191.466us 0.000us 0.00% 673.885us 74.876us 9
+ aten::clone 0.48% 34.032us 24.02% 1.711ms 190.107us 0.000us 0.00% 673.885us 74.876us 9
+ aten::copy_ 1.04% 73.980us 22.51% 1.603ms 178.136us 623.037us 11.35% 673.885us 74.876us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 623.037us 11.35% 623.037us 69.226us 9
+ Activity Buffer Request 20.23% 1.441ms 20.23% 1.441ms 1.441ms 50.848us 0.93% 50.848us 50.848us 1
+ aten::transpose 1.03% 73.058us 1.37% 97.392us 4.058us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.34% 24.334us 0.34% 24.334us 1.014us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.28% 19.590us 1.03% 73.701us 8.189us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 1.26% 89.621us 1.26% 89.621us 4.268us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 1.58% 112.598us 1.58% 112.598us 9.383us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.04% 3.160us 0.04% 3.160us 1.053us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.12% 8.400us 0.12% 8.400us 2.800us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 67.24% 4.789ms 67.24% 4.789ms 4.789ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 7.123ms
+Self CUDA time total: 5.489ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_mem_eff 3.15% 231.099us 27.84% 2.044ms 2.044ms 0.000us 0.00% 5.902ms 5.902ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.856ms 100.14% 5.856ms 5.856ms 1
+ aten::scaled_dot_product_attention 0.26% 19.041us 1.91% 140.484us 46.828us 0.000us 0.00% 5.210ms 1.737ms 3
+ aten::_scaled_dot_product_efficient_attention 0.25% 18.340us 1.65% 121.443us 40.481us 0.000us 0.00% 5.210ms 1.737ms 3
+ aten::_efficient_attention_forward 0.40% 29.263us 1.10% 80.783us 26.928us 5.210ms 89.09% 5.210ms 1.737ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.210ms 89.09% 5.210ms 1.737ms 3
+ aten::contiguous 0.10% 7.239us 22.19% 1.629ms 181.023us 0.000us 0.00% 692.607us 76.956us 9
+ aten::clone 0.29% 21.632us 22.09% 1.622ms 180.219us 0.000us 0.00% 692.607us 76.956us 9
+ aten::copy_ 0.87% 63.554us 21.13% 1.551ms 172.359us 638.271us 10.91% 692.607us 76.956us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 638.271us 10.91% 638.271us 70.919us 9
+ Activity Buffer Request 19.39% 1.423ms 19.39% 1.423ms 1.423ms 54.336us 0.93% 54.336us 54.336us 1
+ aten::transpose 0.66% 48.509us 0.89% 65.581us 2.733us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.23% 17.072us 0.23% 17.072us 0.711us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.16% 11.700us 0.67% 49.102us 5.456us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.83% 61.232us 0.83% 61.232us 2.916us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 1.18% 86.372us 1.18% 86.372us 7.198us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.340us 0.03% 2.340us 0.780us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.05% 3.500us 0.05% 3.500us 1.167us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 72.16% 5.297ms 72.16% 5.297ms 5.297ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 7.341ms
+Self CUDA time total: 5.848ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_mem_eff 2.94% 229.483us 29.69% 2.318ms 2.318ms 0.000us 0.00% 6.099ms 6.099ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.049ms 100.14% 6.049ms 6.049ms 1
+ aten::scaled_dot_product_attention 0.23% 17.971us 1.79% 139.464us 46.488us 0.000us 0.00% 5.384ms 1.795ms 3
+ aten::_scaled_dot_product_efficient_attention 0.23% 18.090us 1.56% 121.493us 40.498us 0.000us 0.00% 5.384ms 1.795ms 3
+ aten::_efficient_attention_forward 0.36% 27.830us 1.04% 80.963us 26.988us 5.384ms 89.13% 5.384ms 1.795ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.384ms 89.13% 5.384ms 1.795ms 3
+ aten::contiguous 0.09% 7.278us 24.41% 1.906ms 211.734us 0.000us 0.00% 714.652us 79.406us 9
+ aten::clone 0.28% 21.781us 24.31% 1.898ms 210.925us 0.000us 0.00% 714.652us 79.406us 9
+ aten::copy_ 0.80% 62.662us 23.36% 1.824ms 202.683us 656.540us 10.87% 714.652us 79.406us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 656.540us 10.87% 656.540us 72.949us 9
+ Activity Buffer Request 21.74% 1.697ms 21.74% 1.697ms 1.697ms 58.112us 0.96% 58.112us 58.112us 1
+ aten::transpose 0.63% 48.810us 0.84% 65.850us 2.744us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.22% 17.040us 0.22% 17.040us 0.710us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.14% 11.161us 0.67% 52.392us 5.821us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.87% 67.583us 0.87% 67.583us 3.218us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 1.09% 85.261us 1.09% 85.261us 7.105us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.451us 0.03% 2.451us 0.817us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.04% 3.290us 0.04% 3.290us 1.097us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 70.31% 5.490ms 70.31% 5.490ms 5.490ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 7.808ms
+Self CUDA time total: 6.041ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_mem_eff 2.96% 232.645us 28.95% 2.277ms 2.277ms 0.000us 0.00% 6.207ms 6.207ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.157ms 100.14% 6.157ms 6.157ms 1
+ aten::scaled_dot_product_attention 0.23% 18.052us 1.76% 138.596us 46.199us 0.000us 0.00% 5.492ms 1.831ms 3
+ aten::_scaled_dot_product_efficient_attention 0.23% 17.731us 1.53% 120.544us 40.181us 0.000us 0.00% 5.492ms 1.831ms 3
+ aten::_efficient_attention_forward 0.35% 27.329us 1.02% 80.113us 26.704us 5.492ms 89.32% 5.492ms 1.831ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.492ms 89.32% 5.492ms 1.831ms 3
+ aten::contiguous 0.09% 7.269us 23.67% 1.862ms 206.848us 0.000us 0.00% 714.624us 79.403us 9
+ aten::clone 0.28% 21.997us 23.58% 1.854ms 206.041us 0.000us 0.00% 714.624us 79.403us 9
+ aten::copy_ 0.89% 69.616us 22.61% 1.779ms 197.614us 656.513us 10.68% 714.624us 79.403us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 656.513us 10.68% 656.513us 72.946us 9
+ Activity Buffer Request 17.99% 1.415ms 17.99% 1.415ms 1.415ms 58.111us 0.95% 58.111us 58.111us 1
+ aten::transpose 0.63% 49.422us 0.84% 66.332us 2.764us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.22% 16.910us 0.22% 16.910us 0.705us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.15% 11.593us 0.68% 53.843us 5.983us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.87% 68.381us 0.87% 68.381us 3.256us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 4.00% 314.941us 4.00% 314.941us 26.245us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.380us 0.03% 2.380us 0.793us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.04% 3.242us 0.04% 3.242us 1.081us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 71.05% 5.588ms 71.05% 5.588ms 5.588ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 7.865ms
+Self CUDA time total: 6.149ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_mem_eff 2.91% 232.917us 28.19% 2.257ms 2.257ms 0.000us 0.00% 6.364ms 6.364ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.313ms 100.13% 6.313ms 6.313ms 1
+ aten::scaled_dot_product_attention 0.22% 17.912us 1.77% 142.075us 47.358us 0.000us 0.00% 5.641ms 1.880ms 3
+ aten::_scaled_dot_product_efficient_attention 0.23% 18.730us 1.55% 124.163us 41.388us 0.000us 0.00% 5.641ms 1.880ms 3
+ aten::_efficient_attention_forward 0.36% 29.090us 1.02% 81.873us 27.291us 5.641ms 89.47% 5.641ms 1.880ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.641ms 89.47% 5.641ms 1.880ms 3
+ aten::contiguous 0.09% 7.221us 22.98% 1.840ms 204.428us 0.000us 0.00% 723.455us 80.384us 9
+ aten::clone 0.27% 21.690us 22.89% 1.833ms 203.626us 0.000us 0.00% 723.455us 80.384us 9
+ aten::copy_ 0.78% 62.812us 21.99% 1.761ms 195.631us 663.839us 10.53% 723.455us 80.384us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 663.839us 10.53% 663.839us 73.760us 9
+ Activity Buffer Request 18.37% 1.471ms 18.37% 1.471ms 1.471ms 59.616us 0.95% 59.616us 59.616us 1
+ aten::transpose 0.60% 48.283us 0.82% 65.922us 2.747us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.22% 17.639us 0.22% 17.639us 0.735us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.15% 11.816us 0.63% 50.264us 5.585us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.80% 63.840us 0.80% 63.840us 3.040us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 3.11% 249.257us 3.11% 249.257us 20.771us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.260us 0.03% 2.260us 0.753us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.04% 3.100us 0.04% 3.100us 1.033us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 71.81% 5.750ms 71.81% 5.750ms 5.750ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 8.007ms
+Self CUDA time total: 6.304ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_mem_eff 3.10% 262.407us 28.45% 2.407ms 2.407ms 0.000us 0.00% 6.700ms 6.700ms 1
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.648ms 100.13% 6.648ms 6.648ms 1
+ aten::scaled_dot_product_attention 0.22% 18.361us 1.72% 145.216us 48.405us 0.000us 0.00% 5.968ms 1.989ms 3
+ aten::_scaled_dot_product_efficient_attention 0.22% 18.717us 1.50% 126.855us 42.285us 0.000us 0.00% 5.968ms 1.989ms 3
+ aten::_efficient_attention_forward 0.34% 29.081us 1.00% 84.393us 28.131us 5.968ms 89.89% 5.968ms 1.989ms 3
+fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.968ms 89.89% 5.968ms 1.989ms 3
+ aten::contiguous 0.09% 7.641us 23.04% 1.949ms 216.566us 0.000us 0.00% 731.964us 81.329us 9
+ aten::clone 0.29% 24.377us 22.95% 1.941ms 215.717us 0.000us 0.00% 731.964us 81.329us 9
+ aten::copy_ 0.80% 68.015us 22.01% 1.862ms 206.906us 670.941us 10.11% 731.964us 81.329us 9
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 670.941us 10.11% 670.941us 74.549us 9
+ Activity Buffer Request 17.04% 1.441ms 17.04% 1.441ms 1.441ms 61.023us 0.92% 61.023us 61.023us 1
+ aten::transpose 0.67% 56.417us 0.87% 73.607us 3.067us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.20% 17.190us 0.20% 17.190us 0.716us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.14% 12.051us 0.65% 54.922us 6.102us 0.000us 0.00% 0.000us 0.000us 9
+ aten::empty 0.83% 69.821us 0.83% 69.821us 3.325us 0.000us 0.00% 0.000us 0.000us 21
+ cudaLaunchKernel 4.44% 375.855us 4.44% 375.855us 31.321us 0.000us 0.00% 0.000us 0.000us 12
+ cudaStreamIsCapturing 0.03% 2.230us 0.03% 2.230us 0.743us 0.000us 0.00% 0.000us 0.000us 3
+ cudaFuncSetAttribute 0.04% 3.250us 0.04% 3.250us 1.083us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 71.55% 6.053ms 71.55% 6.053ms 6.053ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 8.459ms
+Self CUDA time total: 6.639ms
+
+
+impl wl p50(ms) ok
+torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
+torch_mem_eff cuda_attn_L256_bfloat16 1.99 True
+torch_mem_eff cuda_attn_L320_bfloat16 2.02 True
+torch_mem_eff cuda_attn_L384_bfloat16 2.04 True
+torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
+torch_mem_eff cuda_attn_L512_bfloat16 2.22 True
+
+
+
+
+ Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+Downloading setuptools (1.1MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading matplotlib (8.3MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading pillow (6.7MiB)
+Downloading fonttools (4.7MiB)
+Downloading numpy (16.2MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading torch (846.9MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading sympy (6.0MiB)
+Downloading triton (148.3MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+ Downloading nvidia-cufile-cu12
+ Downloading kiwisolver
+ Downloading setuptools
+ Downloading fonttools
+ Downloading networkx
+ Downloading pillow
+ Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+ Downloading nvidia-cuda-cupti-cu12
+ Downloading matplotlib
+ Downloading numpy
+ Downloading sympy
+ Downloading nvidia-nvjitlink-cu12
+ Downloading nvidia-curand-cu12
+ Downloading nvidia-cuda-nvrtc-cu12
+ Downloading triton
+ Downloading nvidia-cufft-cu12
+ Downloading nvidia-cusolver-cu12
+ Downloading nvidia-cusparse-cu12
+ Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-nccl-cu12
+ Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
+ Downloading torch
+Installed 37 packages in 223ms
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html
new file mode 100644
index 0000000000000000000000000000000000000000..f3b4585cfd217bc6f3bfc855913c711a4d339cc0
--- /dev/null
+++ b/flash_attn/impls/sage_attention.html
@@ -0,0 +1,3949 @@
+
+
+
+
+
+ sage_attention
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
SageAttention Implementation
+
SageAttention Benchmark (INT8 Quantized)
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the sage attention kernel
+hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention")
+
+
+def sage_attention(query, key, value):
+ """SageAttention with INT8 Q/K quantization and FP16 P/V"""
+ return hf_kernels_sage_attn.fwd(query, key, value, is_causal=False)[0]
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ATTENTION,
+ impl_name="sage_int8_fp16",
+ impl_tags={"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"},
+ impl_func=sage_attention,
+)
+
+
+
+
+
+
+
Running attention benchmark on cuda with 6 workloads.
+impl wl p50(ms) ok
+sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
+ Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
+ Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
+ Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
+ Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
+ Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
+ Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+
+
+Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
+Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 12.32it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 16.93it/s]
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html
new file mode 100644
index 0000000000000000000000000000000000000000..b12c5ba9380949bde34bbedbe6ea5dddcebc46fe
--- /dev/null
+++ b/flash_attn/impls/xformers.html
@@ -0,0 +1,4088 @@
+
+
+
+
+
+ xformers
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
xFormers Memory Efficient Attention
+
xFormers Benchmark
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "xformers",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+import xformers.ops as xops
+
+
+def xformers_attention(q, k, v):
+ """xFormers memory efficient attention"""
+ # xFormers expects [batch, seq_len, heads, head_dim]
+ return xops.memory_efficient_attention(q, k, v)
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ATTENTION,
+ impl_name="xformers_meff",
+ impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
+ impl_func=xformers_attention,
+)
+
+
+
+
+
+
+
Running attention benchmark on cuda with 6 workloads.
+
+======================================================================
+PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 10.99% 493.828us 51.93% 2.334ms 2.334ms 0.000us 0.00% 3.600ms 3.600ms 1
+ xformers_flash3::flash_fwd 4.32% 194.118us 40.08% 1.801ms 600.437us 0.000us 0.00% 3.600ms 1.200ms 3
+ flash_attn_3::fwd 1.81% 81.292us 35.76% 1.607ms 535.731us 2.714ms 100.00% 3.600ms 1.200ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.716ms 100.05% 2.716ms 2.716ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.714ms 100.00% 2.714ms 904.730us 3
+ Activity Buffer Request 31.96% 1.436ms 31.96% 1.436ms 1.436ms 885.349us 32.62% 885.349us 885.349us 1
+ aten::empty 0.86% 38.850us 0.86% 38.850us 6.475us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.25% 11.022us 0.25% 11.022us 3.674us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.88% 39.751us 0.88% 39.751us 13.250us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.26% 11.630us 0.87% 38.970us 6.495us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.61% 27.340us 0.61% 27.340us 4.557us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 48.07% 2.160ms 48.07% 2.160ms 2.160ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.494ms
+Self CUDA time total: 2.714ms
+
+
+
+======================================================================
+PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 7.45% 327.551us 47.96% 2.108ms 2.108ms 0.000us 0.00% 3.684ms 3.684ms 1
+ xformers_flash3::flash_fwd 3.56% 156.647us 39.91% 1.754ms 584.750us 0.000us 0.00% 3.684ms 1.228ms 3
+ flash_attn_3::fwd 1.31% 57.602us 36.35% 1.598ms 532.534us 2.754ms 100.00% 3.684ms 1.228ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.755ms 100.06% 2.755ms 2.755ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.754ms 100.00% 2.754ms 917.895us 3
+ Activity Buffer Request 33.31% 1.464ms 33.31% 1.464ms 1.464ms 930.812us 33.80% 930.812us 930.812us 1
+ aten::empty 0.76% 33.251us 0.76% 33.251us 5.542us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.14% 6.040us 0.14% 6.040us 2.013us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.83% 36.590us 0.83% 36.590us 12.197us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.23% 10.130us 0.60% 26.441us 4.407us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.37% 16.311us 0.37% 16.311us 2.719us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 52.04% 2.287ms 52.04% 2.287ms 2.287ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.395ms
+Self CUDA time total: 2.754ms
+
+
+
+======================================================================
+PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 6.93% 309.631us 45.92% 2.051ms 2.051ms 0.000us 0.00% 3.806ms 3.806ms 1
+ xformers_flash3::flash_fwd 3.88% 173.206us 38.45% 1.717ms 572.356us 0.000us 0.00% 3.806ms 1.269ms 3
+ flash_attn_3::fwd 1.30% 58.031us 34.57% 1.544ms 514.621us 2.838ms 100.00% 3.806ms 1.269ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.840ms 100.06% 2.840ms 2.840ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.838ms 100.00% 2.838ms 945.948us 3
+ Activity Buffer Request 31.70% 1.416ms 31.70% 1.416ms 1.416ms 968.572us 34.13% 968.572us 968.572us 1
+ aten::empty 0.70% 31.373us 0.70% 31.373us 5.229us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.510us 0.12% 5.510us 1.837us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.74% 33.081us 0.74% 33.081us 11.027us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.19% 8.679us 0.54% 24.060us 4.010us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.34% 15.381us 0.34% 15.381us 2.564us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 54.08% 2.416ms 54.08% 2.416ms 2.416ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.466ms
+Self CUDA time total: 2.838ms
+
+
+
+======================================================================
+PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 6.70% 313.562us 47.60% 2.227ms 2.227ms 0.000us 0.00% 3.863ms 3.863ms 1
+ xformers_flash3::flash_fwd 3.24% 151.796us 40.34% 1.888ms 629.212us 0.000us 0.00% 3.863ms 1.288ms 3
+ flash_attn_3::fwd 1.25% 58.574us 37.10% 1.736ms 578.613us 2.888ms 100.00% 3.863ms 1.288ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.06% 2.890ms 2.890ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.888ms 100.00% 2.888ms 962.743us 3
+ Activity Buffer Request 30.65% 1.434ms 30.65% 1.434ms 1.434ms 974.434us 33.74% 974.434us 974.434us 1
+ aten::empty 0.64% 30.051us 0.64% 30.051us 5.008us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.730us 0.12% 5.730us 1.910us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 4.43% 207.206us 4.43% 207.206us 69.069us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.22% 10.139us 0.56% 26.119us 4.353us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.34% 15.980us 0.34% 15.980us 2.663us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 52.40% 2.452ms 52.40% 2.452ms 2.452ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.679ms
+Self CUDA time total: 2.888ms
+
+
+
+======================================================================
+PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 6.05% 310.689us 42.88% 2.201ms 2.201ms 0.000us 0.00% 4.489ms 4.489ms 1
+ xformers_flash3::flash_fwd 2.93% 150.475us 36.35% 1.866ms 622.001us 0.000us 0.00% 4.489ms 1.496ms 3
+ flash_attn_3::fwd 1.04% 53.593us 33.42% 1.716ms 571.843us 3.365ms 100.00% 4.489ms 1.496ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.367ms 100.05% 3.367ms 3.367ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.365ms 100.00% 3.365ms 1.122ms 3
+ Activity Buffer Request 28.02% 1.439ms 28.02% 1.439ms 1.439ms 1.123ms 33.38% 1.123ms 1.123ms 1
+ aten::empty 0.59% 30.191us 0.59% 30.191us 5.032us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 6.030us 0.12% 6.030us 2.010us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.65% 187.166us 3.65% 187.166us 62.389us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.18% 9.272us 0.47% 24.322us 4.054us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.29% 15.050us 0.29% 15.050us 2.508us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 57.12% 2.932ms 57.12% 2.932ms 2.932ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.133ms
+Self CUDA time total: 3.365ms
+
+
+
+======================================================================
+PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 6.40% 331.462us 43.16% 2.236ms 2.236ms 0.000us 0.00% 4.557ms 4.557ms 1
+ xformers_flash3::flash_fwd 2.99% 154.686us 36.26% 1.879ms 626.255us 0.000us 0.00% 4.557ms 1.519ms 3
+ flash_attn_3::fwd 1.13% 58.511us 33.27% 1.724ms 574.693us 3.413ms 100.00% 4.557ms 1.519ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.415ms 100.05% 3.415ms 3.415ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.413ms 100.00% 3.413ms 1.138ms 3
+ Activity Buffer Request 27.70% 1.435ms 27.70% 1.435ms 1.435ms 1.144ms 33.52% 1.144ms 1.144ms 1
+ aten::empty 0.61% 31.572us 0.61% 31.572us 5.262us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 5.890us 0.11% 5.890us 1.963us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.72% 192.906us 3.72% 192.906us 64.302us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.18% 9.270us 0.50% 26.000us 4.333us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.32% 16.730us 0.32% 16.730us 2.788us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 56.84% 2.946ms 56.84% 2.946ms 2.946ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.182ms
+Self CUDA time total: 3.413ms
+
+
+impl wl p50(ms) ok
+xformers_meff cuda_attn_L128_bfloat16 0.98 True
+xformers_meff cuda_attn_L256_bfloat16 1.02 True
+xformers_meff cuda_attn_L320_bfloat16 1.07 True
+xformers_meff cuda_attn_L384_bfloat16 1.08 True
+xformers_meff cuda_attn_L448_bfloat16 1.24 True
+xformers_meff cuda_attn_L512_bfloat16 1.23 True
+
+
+
+
+Downloading xformers (111.8MiB)
+ Downloading xformers
+Installed 1 package in 13ms
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/flash_attn/index.html b/flash_attn/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..eea7df846d9f2d44c6c6e03a5ac30d00cecd90cf
--- /dev/null
+++ b/flash_attn/index.html
@@ -0,0 +1,89 @@
+
+
+
+
+
+ Index of /flash_attn
+
+
+
+
+ Index of /flash_attn
+
+
+
\ No newline at end of file
diff --git a/flash_attn/results/artifacts/combine/latency.svg b/flash_attn/results/artifacts/combine/latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..2d31b4481d7f215abd56492ac08378eb5fcc9988
--- /dev/null
+++ b/flash_attn/results/artifacts/combine/latency.svg
@@ -0,0 +1,355 @@
+
+
\ No newline at end of file
diff --git a/flash_attn/results/cells/combine.py b/flash_attn/results/cells/combine.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b3cb6ed6cf078138bf247e29ddf57bb5c9e7f82
--- /dev/null
+++ b/flash_attn/results/cells/combine.py
@@ -0,0 +1,30 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+ "Flash (PyTorch SDPA)": "UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK",
+ "MemEff (PyTorch SDPA)": "UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK",
+ "xFormers": "UVNOTE_FILE_XFORMERS_BENCHMARK",
+ "HF Kernels Flash Attn": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK",
+ "HF Kernels Flash Attn3": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK",
+ "SageAttention": "UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+ cache_env_map=cache_env_map,
+ output_filename="attention.jsonl",
+ svg_filename="latency.svg"
+)
\ No newline at end of file
diff --git a/flash_attn/results/combined_results.html b/flash_attn/results/combined_results.html
new file mode 100644
index 0000000000000000000000000000000000000000..3d799c214609ad2665c954956251cb91819359f8
--- /dev/null
+++ b/flash_attn/results/combined_results.html
@@ -0,0 +1,4774 @@
+
+
+
+
+
+ Flash Attention Benchmark - Combined Results
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
Flash Attention Benchmarks - Aggregated Results
+
This document combines benchmark results from multiple Flash Attention implementations.
+
Combined Summary and Visualization
+
+
+
+
+
+
+
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+ "Flash (PyTorch SDPA)": "UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK",
+ "MemEff (PyTorch SDPA)": "UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK",
+ "xFormers": "UVNOTE_FILE_XFORMERS_BENCHMARK",
+ "HF Kernels Flash Attn": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK",
+ "HF Kernels Flash Attn3": "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK",
+ "SageAttention": "UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+ cache_env_map=cache_env_map,
+ output_filename="attention.jsonl",
+ svg_filename="latency.svg"
+)
+
+
+
+
+
+
+
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ Flash (PyTorch SDPA) : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04
+✓ MemEff (PyTorch SDPA) : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/a23b7ad9cfb9e9968ec4a8f126174dc4a3ab5e6999c65a44570f93656598bd2f
+✓ xFormers : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58
+✓ HF Kernels Flash Attn : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849
+✓ HF Kernels Flash Attn3 : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20
+✓ SageAttention : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/f6be24aff45575cad8d1df490ac5fe9ec944103fb255665c71719ca2d7efea4e
+
+ ✓ Found Flash (PyTorch SDPA)
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04/attention.jsonl
+ ✓ Found MemEff (PyTorch SDPA)
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/a23b7ad9cfb9e9968ec4a8f126174dc4a3ab5e6999c65a44570f93656598bd2f/attention.jsonl
+ ✓ Found xFormers
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58/attention.jsonl
+ ✓ Found HF Kernels Flash Attn
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849/attention.jsonl
+ ✓ Found HF Kernels Flash Attn3
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20/attention.jsonl
+ ✓ Found SageAttention
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/f6be24aff45575cad8d1df490ac5fe9ec944103fb255665c71719ca2d7efea4e/attention.jsonl
+
+======================================================================
+Summary: 6 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl wl p50(ms) ok
+hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
+hf_kernels_flash_attn cuda_attn_L256_bfloat16 0.99 True
+hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.04 True
+hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
+hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
+hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True
+hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
+hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
+hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
+hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
+hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
+hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
+sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
+ Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
+ Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
+ Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
+ Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
+ Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
+ Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
+torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
+torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
+torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
+torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
+torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
+torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
+torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
+torch_mem_eff cuda_attn_L256_bfloat16 1.99 True
+torch_mem_eff cuda_attn_L320_bfloat16 2.02 True
+torch_mem_eff cuda_attn_L384_bfloat16 2.04 True
+torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
+torch_mem_eff cuda_attn_L512_bfloat16 2.22 True
+xformers_meff cuda_attn_L128_bfloat16 0.98 True
+xformers_meff cuda_attn_L256_bfloat16 1.02 True
+xformers_meff cuda_attn_L320_bfloat16 1.07 True
+xformers_meff cuda_attn_L384_bfloat16 1.08 True
+xformers_meff cuda_attn_L448_bfloat16 1.24 True
+xformers_meff cuda_attn_L512_bfloat16 1.23 True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 36 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 6
+
+Implementations included:
+ ✓ Flash (PyTorch SDPA)
+ ✓ MemEff (PyTorch SDPA)
+ ✓ xFormers
+ ✓ HF Kernels Flash Attn
+ ✓ HF Kernels Flash Attn3
+ ✓ SageAttention
+
+
+
+
+Installed 37 packages in 221ms
+
+
+
+
Artifacts:
+
latency.svg
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/flash_attn/results/index.html b/flash_attn/results/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..b87b6002f4b781572dbb50f91850e50ee98130ab
--- /dev/null
+++ b/flash_attn/results/index.html
@@ -0,0 +1,88 @@
+
+
+
+
+
+ Index of /flash_attn/results
+
+
+
+
+ Index of /flash_attn/results
+
+
+
\ No newline at end of file
diff --git a/index.html b/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..1061b4b3222caa3480fdd412bcf6f18bb97b54f9
--- /dev/null
+++ b/index.html
@@ -0,0 +1,4029 @@
+
+
+
+
+
+ index
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
All Benchmarks Aggregated Report
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+| Implementation |
+Description |
+
+
+
+
+| HF Kernels SwiGLU |
+HuggingFace kernels SwiGLU implementation |
+
+
+| PyTorch SwiGLU |
+PyTorch native SwiGLU implementation |
+
+
+
+
+
+
+
+
+
+
+
+| Implementation |
+Description |
+
+
+
+
+| HF Kernels ReLU |
+HuggingFace kernels ReLU implementation |
+
+
+| PyTorch ReLU |
+PyTorch native ReLU implementation |
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..42766b88dac4a0e2fb70b1966497dfa03856e571
--- /dev/null
+++ b/layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
@@ -0,0 +1,4 @@
+{"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8265980000032869, "p50": 0.8294890000115629, "p90": 0.8318879999933415, "mean": 0.8305783999958294, "iqr": 0.0024899999857552757, "raw_times": [0.8318879999933415, 0.8294890000115629, 0.8293980000075862, 0.8355189999633694, 0.8265980000032869], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8372490000283506, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6484859999650325, "p50": 1.6553460000068299, "p90": 1.6562569999791776, "mean": 1.654196599986335, "iqr": 0.004349999983332964, "raw_times": [1.6589869999847906, 1.6484859999650325, 1.6553460000068299, 1.6519069999958447, 1.6562569999791776], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6548570000054497, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6374860000496483, "p50": 1.6479959999742277, "p90": 1.650296000036633, "mean": 1.6462442000261035, "iqr": 0.007159000006140559, "raw_times": [1.6479959999742277, 1.6374860000496483, 1.6523060000395162, 1.6431370000304923, 1.650296000036633], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.658577000000605, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.2406110000238186, "p50": 3.2579909999981282, "p90": 3.259831999969265, "mean": 3.2558895999954984, "iqr": 0.00626999997166422, "raw_times": [3.259831999969265, 3.2579909999981282, 3.2674519999886797, 3.2535619999976007, 3.2406110000238186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2579709999822626, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
diff --git a/layer_norm/impls/cells/benchmark.py b/layer_norm/impls/cells/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..d871d1b25fedf8b294c567e9ac582decb62f3cde
--- /dev/null
+++ b/layer_norm/impls/cells/benchmark.py
@@ -0,0 +1,49 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the layer norm kernel
+layer_norm_kernel = get_kernel("kernels-community/layer-norm")
+
+
+def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
+ B, S, D = x.shape
+ # The kernel expects [N, D] input; support beta (bias) if provided.
+ out = layer_norm_kernel.dropout_add_ln_fwd(
+ input=x.view(-1, D),
+ gamma=weight,
+ beta=bias,
+ rowscale=None,
+ colscale=None,
+ x0_subset=None,
+ z_subset=None,
+ dropout_p=0.0,
+ epsilon=eps,
+ rowscale_const=1.0,
+ z_numrows=S,
+ gen=None,
+ residual_in_fp32=False,
+ is_rms_norm=False,
+ )[0].view(B, S, D)
+ return out
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.LAYER_NORM,
+ impl_name="hf_kernels_layer_norm",
+ impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
+ impl_func=hf_kernels_layer_norm,
+)
\ No newline at end of file
diff --git a/layer_norm/impls/cells/nv.py b/layer_norm/impls/cells/nv.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5
--- /dev/null
+++ b/layer_norm/impls/cells/nv.py
@@ -0,0 +1,2 @@
+import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
\ No newline at end of file
diff --git a/layer_norm/impls/hf_kernels_layer_norm.html b/layer_norm/impls/hf_kernels_layer_norm.html
new file mode 100644
index 0000000000000000000000000000000000000000..b0e606786c9541a6e20ae9d4a9aef137dc63aaa8
--- /dev/null
+++ b/layer_norm/impls/hf_kernels_layer_norm.html
@@ -0,0 +1,4052 @@
+
+
+
+
+
+ hf_kernels_layer_norm
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
HF Kernels LayerNorm Implementation
+
Based on kernels-community layer-norm kernel.
+
LayerNorm Benchmark (HF Kernels)
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the layer norm kernel
+layer_norm_kernel = get_kernel("kernels-community/layer-norm")
+
+
+def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
+ B, S, D = x.shape
+ # The kernel expects [N, D] input; support beta (bias) if provided.
+ out = layer_norm_kernel.dropout_add_ln_fwd(
+ input=x.view(-1, D),
+ gamma=weight,
+ beta=bias,
+ rowscale=None,
+ colscale=None,
+ x0_subset=None,
+ z_subset=None,
+ dropout_p=0.0,
+ epsilon=eps,
+ rowscale_const=1.0,
+ z_numrows=S,
+ gen=None,
+ residual_in_fp32=False,
+ is_rms_norm=False,
+ )[0].view(B, S, D)
+ return out
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.LAYER_NORM,
+ impl_name="hf_kernels_layer_norm",
+ impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
+ impl_func=hf_kernels_layer_norm,
+)
+
+
+
+
+
+
+
Running layer_norm benchmark on cuda with 4 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_layer_norm 4.63% 185.406us 46.16% 1.847ms 1.847ms 0.000us 0.00% 3.120ms 3.120ms 1
+ _layer_norm_f8ec252::dropout_add_ln_fwd 1.69% 67.562us 40.98% 1.640ms 546.562us 2.384ms 100.00% 3.120ms 1.040ms 3
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.385ms 100.06% 2.385ms 2.385ms 1
+void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.384ms 100.00% 2.384ms 794.642us 3
+ Activity Buffer Request 36.92% 1.477ms 36.92% 1.477ms 1.477ms 735.676us 30.86% 735.676us 735.676us 1
+ aten::view 0.54% 21.751us 0.54% 21.751us 3.625us 0.000us 0.00% 0.000us 0.000us 6
+ aten::empty 1.11% 44.581us 1.11% 44.581us 4.953us 0.000us 0.00% 0.000us 0.000us 9
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.23% 9.360us 0.23% 9.360us 3.120us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.03% 41.042us 1.03% 41.042us 13.681us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 53.84% 2.154ms 53.84% 2.154ms 2.154ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.001ms
+Self CUDA time total: 2.384ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_layer_norm 2.29% 145.447us 26.95% 1.711ms 1.711ms 0.000us 0.00% 6.386ms 6.386ms 1
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.75% 47.652us 24.47% 1.553ms 517.784us 4.812ms 100.00% 6.386ms 2.129ms 3
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.814ms 100.03% 4.814ms 4.814ms 1
+void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.812ms 100.00% 4.812ms 1.604ms 3
+ Activity Buffer Request 22.77% 1.446ms 22.77% 1.446ms 1.446ms 1.574ms 32.71% 1.574ms 1.574ms 1
+ aten::view 0.19% 11.759us 0.19% 11.759us 1.960us 0.000us 0.00% 0.000us 0.000us 6
+ aten::empty 0.46% 29.151us 0.46% 29.151us 3.239us 0.000us 0.00% 0.000us 0.000us 9
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.860us 0.08% 4.860us 1.620us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.41% 26.131us 0.41% 26.131us 8.710us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 73.05% 4.638ms 73.05% 4.638ms 4.638ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 6.348ms
+Self CUDA time total: 4.812ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_layer_norm 2.00% 126.827us 27.00% 1.712ms 1.712ms 0.000us 0.00% 6.353ms 6.353ms 1
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.76% 48.491us 24.80% 1.572ms 524.088us 4.792ms 100.00% 6.353ms 2.118ms 3
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.793ms 100.03% 4.793ms 4.793ms 1
+void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.792ms 100.00% 4.792ms 1.597ms 3
+ Activity Buffer Request 23.05% 1.462ms 23.05% 1.462ms 1.462ms 1.561ms 32.58% 1.561ms 1.561ms 1
+ aten::view 0.20% 12.869us 0.20% 12.869us 2.145us 0.000us 0.00% 0.000us 0.000us 6
+ aten::empty 0.48% 30.222us 0.48% 30.222us 3.358us 0.000us 0.00% 0.000us 0.000us 9
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.090us 0.08% 5.090us 1.697us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.42% 26.901us 0.42% 26.901us 8.967us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 73.00% 4.628ms 73.00% 4.628ms 4.628ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 6.340ms
+Self CUDA time total: 4.792ms
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_layer_norm 1.24% 144.853us 19.15% 2.240ms 2.240ms 0.000us 0.00% 12.815ms 12.815ms 1
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.39% 45.741us 17.80% 2.083ms 694.211us 9.628ms 100.00% 12.815ms 4.272ms 3
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.629ms 100.01% 9.629ms 9.629ms 1
+void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.628ms 100.00% 9.628ms 3.209ms 3
+ Activity Buffer Request 14.62% 1.710ms 14.62% 1.710ms 1.710ms 3.188ms 33.11% 3.188ms 3.188ms 1
+ aten::view 0.11% 12.972us 0.11% 12.972us 2.162us 0.000us 0.00% 0.000us 0.000us 6
+ aten::empty 0.26% 30.501us 0.26% 30.501us 3.389us 0.000us 0.00% 0.000us 0.000us 9
+cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 5.220us 0.04% 5.220us 1.740us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 2.49% 291.291us 2.49% 291.291us 97.097us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 80.85% 9.456ms 80.85% 9.456ms 9.456ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 11.697ms
+Self CUDA time total: 9.628ms
+
+
+impl wl p50(ms) ok
+hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
+hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
+hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
+hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
+
+
+
+
+Installed 15 packages in 14ms
+
+
+
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
+Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 6.81it/s]
+Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.12it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.56it/s]
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/layer_norm/impls/index.html b/layer_norm/impls/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..51ba6dd6789d67e2ffa1e3f02dea720dbda17216
--- /dev/null
+++ b/layer_norm/impls/index.html
@@ -0,0 +1,89 @@
+
+
+
+
+
+ Index of /layer_norm/impls
+
+
+
+
+ Index of /layer_norm/impls
+
+
+
\ No newline at end of file
diff --git a/layer_norm/impls/torch_layer_norm.html b/layer_norm/impls/torch_layer_norm.html
new file mode 100644
index 0000000000000000000000000000000000000000..b27efb4b5a46f85ef083153354b6c1b716511ffb
--- /dev/null
+++ b/layer_norm/impls/torch_layer_norm.html
@@ -0,0 +1,4073 @@
+
+
+
+
+
+ torch_layer_norm
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
Torch LayerNorm Implementation
+
GPU Info
+
+
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+
+
+
+
+
+
Wed Oct 29 00:36:39 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
+|-----------------------------------------+------------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+========================+======================|
+| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+| N/A 33C P0 128W / 350W | 0MiB / 46068MiB | 100% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=========================================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+
+
LayerNorm Benchmark (PyTorch)
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
+ return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.LAYER_NORM,
+ impl_name="torch_layer_norm",
+ impl_tags={"family": "torch", "op": "layer_norm"},
+ impl_func=torch_layer_norm,
+)
+
+
+
+
+
+
+
Running layer_norm benchmark on cuda with 4 workloads.
+
+======================================================================
+PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_layer_norm 3.94% 153.226us 45.99% 1.787ms 1.787ms 0.000us 0.00% 3.036ms 3.036ms 1
+ aten::layer_norm 0.41% 15.819us 42.05% 1.634ms 544.665us 0.000us 0.00% 3.036ms 1.012ms 3
+ aten::native_layer_norm 2.10% 81.554us 41.64% 1.618ms 539.392us 2.323ms 100.00% 3.036ms 1.012ms 3
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.325ms 100.06% 2.325ms 2.325ms 1
+void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.323ms 100.00% 2.323ms 774.498us 3
+ Activity Buffer Request 36.88% 1.433ms 36.88% 1.433ms 1.433ms 712.322us 30.66% 712.322us 712.322us 1
+ aten::empty 1.28% 49.611us 1.28% 49.611us 5.512us 0.000us 0.00% 0.000us 0.000us 9
+ cudaLaunchKernel 1.19% 46.322us 1.19% 46.322us 15.441us 0.000us 0.00% 0.000us 0.000us 3
+ aten::view 0.19% 7.380us 0.19% 7.380us 1.230us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 54.01% 2.099ms 54.01% 2.099ms 2.099ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 3.886ms
+Self CUDA time total: 2.323ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_layer_norm 1.13% 72.543us 25.40% 1.627ms 1.627ms 0.000us 0.00% 6.533ms 6.533ms 1
+ aten::layer_norm 0.14% 8.900us 24.27% 1.554ms 518.074us 0.000us 0.00% 6.533ms 2.178ms 3
+ aten::native_layer_norm 0.84% 53.651us 24.13% 1.545ms 515.108us 4.915ms 100.00% 6.533ms 2.178ms 3
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.917ms 100.03% 4.917ms 4.917ms 1
+void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.915ms 100.00% 4.915ms 1.638ms 3
+ Activity Buffer Request 22.32% 1.430ms 22.32% 1.430ms 1.430ms 1.618ms 32.92% 1.618ms 1.618ms 1
+ aten::empty 0.44% 28.460us 0.44% 28.460us 3.162us 0.000us 0.00% 0.000us 0.000us 9
+ cudaLaunchKernel 0.46% 29.343us 0.46% 29.343us 9.781us 0.000us 0.00% 0.000us 0.000us 3
+ aten::view 0.07% 4.330us 0.07% 4.330us 0.722us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 74.60% 4.777ms 74.60% 4.777ms 4.777ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 6.403ms
+Self CUDA time total: 4.915ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_layer_norm 1.16% 72.353us 26.06% 1.624ms 1.624ms 0.000us 0.00% 6.259ms 6.259ms 1
+ aten::layer_norm 0.14% 8.650us 24.90% 1.551ms 517.051us 0.000us 0.00% 6.259ms 2.086ms 3
+ aten::native_layer_norm 0.85% 52.692us 24.76% 1.543ms 514.168us 4.742ms 100.00% 6.259ms 2.086ms 3
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.744ms 100.03% 4.744ms 4.744ms 1
+void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.742ms 100.00% 4.742ms 1.581ms 3
+ Activity Buffer Request 22.91% 1.427ms 22.91% 1.427ms 1.427ms 1.517ms 31.99% 1.517ms 1.517ms 1
+ aten::empty 0.47% 29.452us 0.47% 29.452us 3.272us 0.000us 0.00% 0.000us 0.000us 9
+ cudaLaunchKernel 0.47% 29.331us 0.47% 29.331us 9.777us 0.000us 0.00% 0.000us 0.000us 3
+ aten::view 0.06% 4.009us 0.06% 4.009us 0.668us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 73.94% 4.606ms 73.94% 4.606ms 4.606ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 6.229ms
+Self CUDA time total: 4.742ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_layer_norm 0.67% 74.863us 13.13% 1.463ms 1.463ms 0.000us 0.00% 13.036ms 13.036ms 1
+ aten::layer_norm 0.09% 9.640us 12.46% 1.388ms 462.622us 0.000us 0.00% 13.036ms 4.345ms 3
+ aten::native_layer_norm 0.46% 51.640us 12.37% 1.378ms 459.409us 9.812ms 100.00% 13.036ms 4.345ms 3
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.814ms 100.01% 9.814ms 9.814ms 1
+void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.812ms 100.00% 9.812ms 3.271ms 3
+ Activity Buffer Request 9.60% 1.069ms 9.60% 1.069ms 1.069ms 3.224ms 32.85% 3.224ms 3.224ms 1
+ aten::empty 0.26% 29.363us 0.26% 29.363us 3.263us 0.000us 0.00% 0.000us 0.000us 9
+ cudaLaunchKernel 2.01% 223.547us 2.01% 223.547us 74.516us 0.000us 0.00% 0.000us 0.000us 3
+ aten::view 0.04% 4.180us 0.04% 4.180us 0.697us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 86.87% 9.675ms 86.87% 9.675ms 9.675ms 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 11.138ms
+Self CUDA time total: 9.812ms
+
+
+impl wl p50(ms) ok
+torch_layer_norm LN_B16_S2048_D4096 0.82 True
+torch_layer_norm LN_B16_S2048_D8192 1.68 True
+torch_layer_norm LN_B16_S4096_D4096 1.61 True
+torch_layer_norm LN_B16_S4096_D8192 3.32 True
+
+
+
+
+Installed 37 packages in 222ms
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/layer_norm/index.html b/layer_norm/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..12f60968be235270e079aa5c48545ec9a928579b
--- /dev/null
+++ b/layer_norm/index.html
@@ -0,0 +1,89 @@
+
+
+
+
+
+ Index of /layer_norm
+
+
+
+
+ Index of /layer_norm
+
+
+
\ No newline at end of file
diff --git a/layer_norm/results/artifacts/combine/latency.svg b/layer_norm/results/artifacts/combine/latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..2b2c749f1fb0d4e53be110d2207865dbdced18be
--- /dev/null
+++ b/layer_norm/results/artifacts/combine/latency.svg
@@ -0,0 +1,230 @@
+
+
\ No newline at end of file
diff --git a/layer_norm/results/cells/combine.py b/layer_norm/results/cells/combine.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6dbd0a965ba54848e36671a564ac6122b6790b8
--- /dev/null
+++ b/layer_norm/results/cells/combine.py
@@ -0,0 +1,26 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+ "PyTorch LayerNorm": "UVNOTE_FILE_TORCH_LAYER_NORM_BENCHMARK",
+ "HF Kernels LayerNorm": "UVNOTE_FILE_HF_KERNELS_LAYER_NORM_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+ cache_env_map=cache_env_map,
+ output_filename="layer_norm.jsonl",
+ svg_filename="latency.svg"
+)
\ No newline at end of file
diff --git a/layer_norm/results/combined_results.html b/layer_norm/results/combined_results.html
new file mode 100644
index 0000000000000000000000000000000000000000..a979f564165585cc88a008eb1fbc5bfd5aa6bef5
--- /dev/null
+++ b/layer_norm/results/combined_results.html
@@ -0,0 +1,4466 @@
+
+
+
+
+
+ LayerNorm Benchmark - Combined Results
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
LayerNorm Benchmarks - Aggregated Results
+
This document combines benchmark results from multiple LayerNorm implementations.
+
Combined Summary and Visualization
+
+
+
+
+
+
+
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+ "PyTorch LayerNorm": "UVNOTE_FILE_TORCH_LAYER_NORM_BENCHMARK",
+ "HF Kernels LayerNorm": "UVNOTE_FILE_HF_KERNELS_LAYER_NORM_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+ cache_env_map=cache_env_map,
+ output_filename="layer_norm.jsonl",
+ svg_filename="latency.svg"
+)
+
+
+
+
+
+
+
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ PyTorch LayerNorm : /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/4403c31e9bef6e648597b4fcc9cfdc402678aaa4f90636b74325f12d334214a3
+✓ HF Kernels LayerNorm : /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/bd278151199f29b397d85857b87922edaa39a62623fb28e0465de47d6a3bac74
+
+ ✓ Found PyTorch LayerNorm
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/4403c31e9bef6e648597b4fcc9cfdc402678aaa4f90636b74325f12d334214a3/layer_norm.jsonl
+ ✓ Found HF Kernels LayerNorm
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/bd278151199f29b397d85857b87922edaa39a62623fb28e0465de47d6a3bac74/layer_norm.jsonl
+
+======================================================================
+Summary: 2 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl wl p50(ms) ok
+hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
+hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
+hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
+hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
+torch_layer_norm LN_B16_S2048_D4096 0.82 True
+torch_layer_norm LN_B16_S2048_D8192 1.68 True
+torch_layer_norm LN_B16_S4096_D4096 1.61 True
+torch_layer_norm LN_B16_S4096_D8192 3.32 True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 8 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 2
+
+Implementations included:
+ ✓ PyTorch LayerNorm
+ ✓ HF Kernels LayerNorm
+
+
+
+
+Installed 37 packages in 195ms
+
+
+
+
Artifacts:
+
latency.svg
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/layer_norm/results/index.html b/layer_norm/results/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..5b6bcefdc3dcaa949d66002abc2672c3de221470
--- /dev/null
+++ b/layer_norm/results/index.html
@@ -0,0 +1,88 @@
+
+
+
+
+
+ Index of /layer_norm/results
+
+
+
+
+ Index of /layer_norm/results
+
+
+
\ No newline at end of file
diff --git a/rotary/impls/artifacts/benchmark/rotary.jsonl b/rotary/impls/artifacts/benchmark/rotary.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..3a21dbdf17991e5611014fcf9c02138c37ed1901
--- /dev/null
+++ b/rotary/impls/artifacts/benchmark/rotary.jsonl
@@ -0,0 +1,24 @@
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07538300002352116, "p50": 0.07777199999736695, "p90": 0.07795200002647107, "mean": 0.07717860000866494, "iqr": 0.0014790000477660215, "raw_times": [0.07777199999736695, 0.07647299997870505, 0.07795200002647107, 0.07831300001726049, 0.07538300002352116], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0837029999729566, "peak_bytes": 1720320, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.00153350830078125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09504299998752685, "p50": 0.09633299998768052, "p90": 0.09746300003143915, "mean": 0.0966769999877215, "iqr": 0.0013000000649299182, "raw_times": [0.09504299998752685, 0.09633299998768052, 0.09838299996545175, 0.09616299996650923, 0.09746300003143915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09918300003164404, "peak_bytes": 3440640, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.00154876708984375, "mse_q": 1.5854835510253906e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0929430000269349, "p50": 0.09560399996644264, "p90": 0.09620299999824056, "mean": 0.09600920000139013, "iqr": 0.0026899999738816405, "raw_times": [0.09620299999824056, 0.09560399996644264, 0.10178299999097362, 0.09351300002435892, 0.0929430000269349], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10062299998025992, "peak_bytes": 6832128, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09350300001642609, "p50": 0.09415400000989393, "p90": 0.09585299994796515, "mean": 0.09842139999136634, "iqr": 0.001959999963219161, "raw_times": [0.09350300001642609, 0.09585299994796515, 0.09415400000989393, 0.11470399999780057, 0.09389299998474598], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09742299999970783, "peak_bytes": 13664256, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.0015411376953125, "mse_q": 1.5854835510253906e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09248300000308518, "p50": 0.09347299999262759, "p90": 0.09500300001263895, "mean": 0.09405499998820233, "iqr": 0.0018000000636675395, "raw_times": [0.09248300000308518, 0.09500300001263895, 0.0961129999836885, 0.09347299999262759, 0.09320299994897141], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09855400003289105, "peak_bytes": 6881280, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09233299999777955, "p50": 0.09477300000071409, "p90": 0.09477400004698211, "mean": 0.09424540002100912, "iqr": 0.0021910000214120373, "raw_times": [0.09233299999777955, 0.09477400004698211, 0.09477300000071409, 0.09676400003399976, 0.09258300002557007], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09677399998508918, "peak_bytes": 13762560, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09216400002287628, "p50": 0.09306300000844203, "p90": 0.09349300000849325, "mean": 0.09324520001428027, "iqr": 0.0005399999736255268, "raw_times": [0.09216400002287628, 0.09306300000844203, 0.09455299999672206, 0.09349300000849325, 0.09295300003486773], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10914400002093316, "peak_bytes": 27328512, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.00153350830078125, "mse_q": 1.5854835510253906e-05, "mse_k": 1.5854835510253906e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09248299994624176, "p50": 0.09334300000318763, "p90": 0.09355399998867142, "mean": 0.0935691999870869, "iqr": 0.00066100000140068, "raw_times": [0.09355399998867142, 0.09557300001006297, 0.09334300000318763, 0.09248299994624176, 0.09289299998727074], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09584299999687573, "peak_bytes": 54657024, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.00154876708984375, "mse_q": 1.621246337890625e-05, "mse_k": 1.621246337890625e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09247299999515235, "p50": 0.09385300000985808, "p90": 0.09445400002050519, "mean": 0.09405140001490508, "iqr": 0.001121000025250396, "raw_times": [0.09247299999515235, 0.09445400002050519, 0.0933329999952548, 0.09385300000985808, 0.09614400005375501], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09844400000247333, "peak_bytes": 27525120, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09372300002041811, "p50": 0.094173000036335, "p90": 0.09575299998232367, "mean": 0.09506720000445057, "iqr": 0.0020299999619055598, "raw_times": [0.09796399996275795, 0.09575299998232367, 0.094173000036335, 0.09372300002041811, 0.09372300002041811], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09865399999853253, "peak_bytes": 55050240, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09140299999899071, "p50": 0.092913999992561, "p90": 0.09422299996231231, "mean": 0.09330119999049202, "iqr": 0.0015199999552351073, "raw_times": [0.09140299999899071, 0.09526299999151888, 0.092913999992561, 0.09422299996231231, 0.09270300000707721], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09514300001001175, "peak_bytes": 109314048, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09479400000600435, "p50": 0.09623299996519563, "p90": 0.09679300001153024, "mean": 0.09610519999796452, "iqr": 0.000919999990856013, "raw_times": [0.09587300002067423, 0.09679300001153024, 0.09479400000600435, 0.09623299996519563, 0.09683299998641814], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09740300004068558, "peak_bytes": 218628096, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09216300003345168, "p50": 0.09397300004820863, "p90": 0.09462299999540846, "mean": 0.09381320001011773, "iqr": 0.0016889999869817984, "raw_times": [0.09293400000842666, 0.09537299996509319, 0.09397300004820863, 0.09216300003345168, 0.09462299999540846], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10023300001194002, "peak_bytes": 68698112, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0913630000241028, "p50": 0.0930929999753971, "p90": 0.09448299999803567, "mean": 0.09361499999158696, "iqr": 0.0023500000452258973, "raw_times": [0.0913630000241028, 0.09700300000758943, 0.09213299995280977, 0.09448299999803567, 0.0930929999753971], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09703300003138793, "peak_bytes": 6848512, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.5974044799804688e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0902330000371876, "p50": 0.09208300002683245, "p90": 0.0927039999965018, "mean": 0.0920254000220666, "iqr": 0.0007599999776175537, "raw_times": [0.0902330000371876, 0.09194400001888425, 0.09208300002683245, 0.09316300003092692, 0.0927039999965018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09501400000999638, "peak_bytes": 13647872, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09339300004285178, "p50": 0.09388300003365657, "p90": 0.09438299997555077, "mean": 0.09392300001991316, "iqr": 0.0009499999578110874, "raw_times": [0.09388300003365657, 0.09452300002976699, 0.09438299997555077, 0.09339300004285178, 0.09343300001773969], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09746399996402033, "peak_bytes": 27295744, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.0015411376953125, "mse_q": 1.621246337890625e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09369299999661962, "p50": 0.09495300002981821, "p90": 0.09641299999429975, "mean": 0.09557120000636132, "iqr": 0.001839999981712026, "raw_times": [0.09457300001258773, 0.09495300002981821, 0.09641299999429975, 0.09369299999661962, 0.0982239999984813], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09584299999687573, "peak_bytes": 13697024, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09207300001889962, "p50": 0.09441299999934927, "p90": 0.09493300001395255, "mean": 0.09826719999637135, "iqr": 0.0009000000318337698, "raw_times": [0.09207300001889962, 0.11588399996753651, 0.09441299999934927, 0.09493300001395255, 0.09403299998211878], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09803300002886317, "peak_bytes": 27394048, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.00153350830078125, "mse_q": 1.5854835510253906e-05, "mse_k": 1.5974044799804688e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09320300000581483, "p50": 0.09509299997034759, "p90": 0.0968430000511944, "mean": 0.0957752000090295, "iqr": 0.0027100000465907215, "raw_times": [0.0968430000511944, 0.09413300000460367, 0.09509299997034759, 0.09960400001318703, 0.09320300000581483], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09855399997604763, "peak_bytes": 54591488, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0926630000321893, "p50": 0.09438299997555077, "p90": 0.09443299995837151, "mean": 0.09837319998950989, "iqr": 0.0016799999684735667, "raw_times": [0.09275299998989794, 0.09438299997555077, 0.09443299995837151, 0.0926630000321893, 0.1176339999915399], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09754300003805838, "peak_bytes": 109182976, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09100299996589456, "p50": 0.09359299997413473, "p90": 0.09518299998489965, "mean": 0.09356119999210932, "iqr": 0.0025699999355310865, "raw_times": [0.09100299996589456, 0.09518299998489965, 0.09261300004936857, 0.09541399998624911, 0.09359299997413473], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.11789399997041983, "peak_bytes": 54788096, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.125, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09348399998998502, "p50": 0.09433299999273004, "p90": 0.09580299996514441, "mean": 0.09473540000044522, "iqr": 0.0016299999288094114, "raw_times": [0.09433299999273004, 0.09580299996514441, 0.09588400001803166, 0.09348399998998502, 0.094173000036335], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09657300000753821, "peak_bytes": 109576192, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0974529999666629, "p50": 0.09860399995886837, "p90": 0.09875400002101742, "mean": 0.09851759998582565, "iqr": 0.0008510000384376326, "raw_times": [0.09790299998257979, 0.0974529999666629, 0.09860399995886837, 0.09875400002101742, 0.0998739999999998], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10540400000991212, "peak_bytes": 218365952, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
+{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2809499999898435, "p50": 0.28135000002293964, "p90": 0.2840199999809556, "mean": 0.28239179999900443, "iqr": 0.0029809999659846653, "raw_times": [0.2809499999898435, 0.28459999998631247, 0.2840199999809556, 0.28103900001497095, 0.28135000002293964], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.28416999998626125, "peak_bytes": 436731904, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.125, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
diff --git a/rotary/impls/cells/benchmark.py b/rotary/impls/cells/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e8216119f01d4dff50b7e1571fac564b8d33892
--- /dev/null
+++ b/rotary/impls/cells/benchmark.py
@@ -0,0 +1,47 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the rotary kernel
+rotary = get_kernel("kernels-community/rotary")
+
+
+def hf_kernels_rotary(query, key, cos, sin, conj=False):
+ rotary_dim = cos.shape[-1]
+
+ # Clone to avoid modifying inputs
+ q_out = query.clone()
+ k_out = key.clone()
+
+ # Apply rotation to query
+ q1 = q_out[..., :rotary_dim]
+ q2 = q_out[..., rotary_dim : 2 * rotary_dim]
+ rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj)
+
+ # Apply rotation to key
+ k1 = k_out[..., :rotary_dim]
+ k2 = k_out[..., rotary_dim : 2 * rotary_dim]
+ rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj)
+
+ return q_out, k_out
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ROTARY,
+ impl_name="hf_kernels_rotary",
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
+ impl_func=hf_kernels_rotary,
+)
\ No newline at end of file
diff --git a/rotary/impls/cells/nv.py b/rotary/impls/cells/nv.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5
--- /dev/null
+++ b/rotary/impls/cells/nv.py
@@ -0,0 +1,2 @@
+import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
\ No newline at end of file
diff --git a/rotary/impls/hf_kernels_rotary.html b/rotary/impls/hf_kernels_rotary.html
new file mode 100644
index 0000000000000000000000000000000000000000..e458bcc5786ea404df91ec4027149e3ec0c0a5aa
--- /dev/null
+++ b/rotary/impls/hf_kernels_rotary.html
@@ -0,0 +1,4653 @@
+
+
+
+
+
+ hf_kernels_rotary
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
HF Kernels - Rotary Position Embeddings
+
GPU Info
+
+
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+
+
+
+
+
+
Wed Oct 29 00:36:23 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
+|-----------------------------------------+------------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+========================+======================|
+| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+| N/A 31C P0 86W / 350W | 0MiB / 46068MiB | 22% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=========================================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+
+
Rotary Embeddings Benchmark
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the rotary kernel
+rotary = get_kernel("kernels-community/rotary")
+
+
+def hf_kernels_rotary(query, key, cos, sin, conj=False):
+ rotary_dim = cos.shape[-1]
+
+ # Clone to avoid modifying inputs
+ q_out = query.clone()
+ k_out = key.clone()
+
+ # Apply rotation to query
+ q1 = q_out[..., :rotary_dim]
+ q2 = q_out[..., rotary_dim : 2 * rotary_dim]
+ rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj)
+
+ # Apply rotation to key
+ k1 = k_out[..., :rotary_dim]
+ k2 = k_out[..., rotary_dim : 2 * rotary_dim]
+ rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj)
+
+ return q_out, k_out
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ROTARY,
+ impl_name="hf_kernels_rotary",
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
+ impl_func=hf_kernels_rotary,
+)
+
+
+
+
+
+
+
Running rotary benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 417.663us 1766.17% 417.663us 417.663us 1
+ hf_kernels_rotary 11.92% 243.797us 99.67% 2.039ms 2.039ms 0.000us 0.00% 24.864us 24.864us 1
+ _rotary_dba7d1e::apply_rotary 2.64% 54.054us 5.06% 103.576us 17.263us 16.992us 71.85% 16.992us 2.832us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 71.85% 16.992us 2.832us 6
+ aten::clone 2.02% 41.272us 79.82% 1.633ms 272.116us 0.000us 0.00% 7.872us 1.312us 6
+ aten::copy_ 1.82% 37.200us 74.94% 1.533ms 255.467us 6.656us 28.15% 7.872us 1.312us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 28.15% 6.656us 1.109us 6
+ Activity Buffer Request 69.47% 1.421ms 69.47% 1.421ms 1.421ms 1.216us 5.14% 1.216us 1.216us 1
+ aten::empty_strided 2.87% 58.622us 2.87% 58.622us 9.770us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 3.65% 74.674us 3.65% 74.674us 12.446us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.25% 46.121us 2.87% 58.631us 4.886us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.61% 12.510us 0.61% 12.510us 1.042us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.42% 49.522us 2.42% 49.522us 8.254us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.33% 6.691us 0.33% 6.691us 6.691us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.045ms
+Self CUDA time total: 23.648us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 362.684us 1535.76% 362.684us 362.684us 1
+ hf_kernels_rotary 9.63% 184.044us 99.76% 1.906ms 1.906ms 0.000us 0.00% 24.736us 24.736us 1
+ _rotary_dba7d1e::apply_rotary 2.64% 50.383us 5.03% 96.065us 16.011us 16.864us 71.41% 16.864us 2.811us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.864us 71.41% 16.864us 2.811us 6
+ aten::clone 1.50% 28.618us 82.74% 1.581ms 263.486us 0.000us 0.00% 7.872us 1.312us 6
+ aten::copy_ 1.95% 37.192us 79.54% 1.520ms 253.297us 6.752us 28.59% 7.872us 1.312us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 28.59% 6.752us 1.125us 6
+ Activity Buffer Request 74.55% 1.424ms 74.55% 1.424ms 1.424ms 1.120us 4.74% 1.120us 1.120us 1
+ aten::empty_strided 1.70% 32.513us 1.70% 32.513us 5.419us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 3.05% 58.263us 3.05% 58.263us 9.710us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.86% 35.461us 2.36% 45.051us 3.754us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.50% 9.590us 0.50% 9.590us 0.799us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.39% 45.682us 2.39% 45.682us 7.614us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.24% 4.600us 0.24% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.911ms
+Self CUDA time total: 23.616us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 352.889us 1406.66% 352.889us 352.889us 1
+ hf_kernels_rotary 9.52% 180.074us 99.73% 1.887ms 1.887ms 0.000us 0.00% 26.399us 26.399us 1
+ _rotary_dba7d1e::apply_rotary 2.26% 42.841us 4.55% 86.004us 14.334us 17.248us 68.75% 17.248us 2.875us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.248us 68.75% 17.248us 2.875us 6
+ aten::clone 1.50% 28.330us 83.30% 1.576ms 262.706us 0.000us 0.00% 9.151us 1.525us 6
+ aten::copy_ 1.91% 36.070us 80.06% 1.515ms 252.487us 7.839us 31.25% 9.151us 1.525us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.839us 31.25% 7.839us 1.307us 6
+ Activity Buffer Request 75.19% 1.423ms 75.19% 1.423ms 1.423ms 1.312us 5.23% 1.312us 1.312us 1
+ aten::empty_strided 1.74% 32.981us 1.74% 32.981us 5.497us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 2.97% 56.174us 2.97% 56.174us 9.362us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.86% 35.224us 2.36% 44.742us 3.729us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.50% 9.518us 0.50% 9.518us 0.793us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.28% 43.163us 2.28% 43.163us 7.194us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.27% 5.081us 0.27% 5.081us 5.081us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.892ms
+Self CUDA time total: 25.087us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 353.892us 1375.46% 353.892us 353.892us 1
+ hf_kernels_rotary 8.61% 178.135us 99.77% 2.063ms 2.063ms 0.000us 0.00% 27.041us 27.041us 1
+ _rotary_dba7d1e::apply_rotary 2.02% 41.741us 4.14% 85.532us 14.255us 17.985us 69.90% 17.985us 2.997us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.985us 69.90% 17.985us 2.997us 6
+ aten::clone 1.32% 27.361us 84.83% 1.754ms 292.410us 0.000us 0.00% 9.056us 1.509us 6
+ aten::copy_ 1.77% 36.582us 81.87% 1.693ms 282.183us 7.744us 30.10% 9.056us 1.509us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 30.10% 7.744us 1.291us 6
+ Activity Buffer Request 68.36% 1.414ms 68.36% 1.414ms 1.414ms 1.312us 5.10% 1.312us 1.312us 1
+ aten::empty_strided 1.64% 34.001us 1.64% 34.001us 5.667us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 11.73% 242.678us 11.73% 242.678us 40.446us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.70% 35.202us 2.18% 45.153us 3.763us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.48% 9.951us 0.48% 9.951us 0.829us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.12% 43.791us 2.12% 43.791us 7.299us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.23% 4.830us 0.23% 4.830us 4.830us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.068ms
+Self CUDA time total: 25.729us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 351.422us 1397.30% 351.422us 351.422us 1
+ hf_kernels_rotary 8.84% 180.886us 99.76% 2.041ms 2.041ms 0.000us 0.00% 26.462us 26.462us 1
+ _rotary_dba7d1e::apply_rotary 2.10% 42.971us 4.17% 85.245us 14.208us 17.214us 68.45% 17.214us 2.869us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.214us 68.45% 17.214us 2.869us 6
+ aten::clone 1.43% 29.360us 84.55% 1.730ms 288.328us 0.000us 0.00% 9.248us 1.541us 6
+ aten::copy_ 1.75% 35.821us 81.51% 1.668ms 277.955us 7.936us 31.55% 9.248us 1.541us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 31.55% 7.936us 1.323us 6
+ Activity Buffer Request 69.89% 1.430ms 69.89% 1.430ms 1.430ms 1.312us 5.22% 1.312us 1.312us 1
+ aten::empty_strided 1.61% 32.881us 1.61% 32.881us 5.480us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.87% 201.958us 9.87% 201.958us 33.660us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.76% 36.050us 2.20% 45.010us 3.751us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.44% 8.960us 0.44% 8.960us 0.747us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.07% 42.274us 2.07% 42.274us 7.046us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.24% 4.920us 0.24% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.046ms
+Self CUDA time total: 25.150us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.166us 1351.10% 347.166us 347.166us 1
+ hf_kernels_rotary 21.36% 176.235us 99.42% 820.279us 820.279us 0.000us 0.00% 27.039us 27.039us 1
+ _rotary_dba7d1e::apply_rotary 5.20% 42.901us 10.31% 85.044us 14.174us 17.951us 69.86% 17.951us 2.992us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.951us 69.86% 17.951us 2.992us 6
+ aten::clone 2.62% 21.601us 62.49% 515.608us 85.935us 0.000us 0.00% 9.088us 1.515us 6
+ aten::copy_ 4.36% 35.950us 55.96% 461.697us 76.950us 7.744us 30.14% 9.088us 1.515us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 30.14% 7.744us 1.291us 6
+ Activity Buffer Request 27.88% 230.028us 27.88% 230.028us 230.028us 1.344us 5.23% 1.344us 1.344us 1
+ aten::empty_strided 3.92% 32.310us 3.92% 32.310us 5.385us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 23.72% 195.719us 23.72% 195.719us 32.620us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.18% 34.481us 5.26% 43.392us 3.616us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.08% 8.911us 1.08% 8.911us 0.743us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.11% 42.143us 5.11% 42.143us 7.024us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.58% 4.821us 0.58% 4.821us 4.821us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 825.100us
+Self CUDA time total: 25.695us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 348.595us 1078.61% 348.595us 348.595us 1
+ hf_kernels_rotary 21.56% 162.014us 99.35% 746.516us 746.516us 0.000us 0.00% 34.111us 34.111us 1
+ _rotary_dba7d1e::apply_rotary 5.56% 41.814us 11.41% 85.705us 14.284us 21.792us 67.43% 21.792us 3.632us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.792us 67.43% 21.792us 3.632us 6
+ aten::clone 2.84% 21.362us 60.59% 455.236us 75.873us 0.000us 0.00% 12.319us 2.053us 6
+ aten::copy_ 5.05% 37.942us 53.37% 401.033us 66.839us 10.527us 32.57% 12.319us 2.053us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.527us 32.57% 10.527us 1.755us 6
+ Activity Buffer Request 22.09% 165.945us 22.09% 165.945us 165.945us 1.792us 5.54% 1.792us 1.792us 1
+ aten::empty_strided 4.37% 32.841us 4.37% 32.841us 5.474us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 26.24% 197.146us 26.24% 197.146us 32.858us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.61% 34.610us 5.80% 43.561us 3.630us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.19% 8.951us 1.19% 8.951us 0.746us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.84% 43.891us 5.84% 43.891us 7.315us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.65% 4.870us 0.65% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 751.386us
+Self CUDA time total: 32.319us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 353.018us 687.35% 353.018us 353.018us 1
+ hf_kernels_rotary 20.18% 167.279us 99.43% 824.358us 824.358us 0.000us 0.00% 54.175us 54.175us 1
+ _rotary_dba7d1e::apply_rotary 5.18% 42.971us 10.43% 86.461us 14.410us 34.432us 67.04% 34.432us 5.739us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.432us 67.04% 34.432us 5.739us 6
+ aten::clone 2.72% 22.563us 63.67% 527.908us 87.985us 0.000us 0.00% 19.743us 3.290us 6
+ aten::copy_ 4.40% 36.441us 57.12% 473.605us 78.934us 16.927us 32.96% 19.743us 3.290us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.927us 32.96% 16.927us 2.821us 6
+ Activity Buffer Request 29.36% 243.449us 29.36% 243.449us 243.449us 2.816us 5.48% 2.816us 2.816us 1
+ aten::empty_strided 3.83% 31.740us 3.83% 31.740us 5.290us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 23.37% 193.715us 23.37% 193.715us 32.286us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.09% 33.928us 5.15% 42.710us 3.559us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.06% 8.782us 1.06% 8.782us 0.732us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.25% 43.490us 5.25% 43.490us 7.248us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.57% 4.720us 0.57% 4.720us 4.720us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 829.078us
+Self CUDA time total: 51.359us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 380.387us 1176.94% 380.387us 380.387us 1
+ hf_kernels_rotary 9.88% 201.876us 99.77% 2.039ms 2.039ms 0.000us 0.00% 34.144us 34.144us 1
+ _rotary_dba7d1e::apply_rotary 2.25% 45.971us 4.47% 91.374us 15.229us 21.760us 67.33% 21.760us 3.627us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.760us 67.33% 21.760us 3.627us 6
+ aten::clone 1.35% 27.641us 83.24% 1.701ms 283.513us 0.000us 0.00% 12.384us 2.064us 6
+ aten::copy_ 1.82% 37.221us 80.29% 1.641ms 273.476us 10.560us 32.67% 12.384us 2.064us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.560us 32.67% 10.560us 1.760us 6
+ Activity Buffer Request 69.28% 1.416ms 69.28% 1.416ms 1.416ms 1.824us 5.64% 1.824us 1.824us 1
+ aten::empty_strided 1.59% 32.582us 1.59% 32.582us 5.430us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.19% 187.866us 9.19% 187.866us 31.311us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.75% 35.720us 2.18% 44.611us 3.718us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.44% 8.891us 0.44% 8.891us 0.741us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.22% 45.403us 2.22% 45.403us 7.567us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.23% 4.671us 0.23% 4.671us 4.671us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.044ms
+Self CUDA time total: 32.320us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 358.145us 697.76% 358.145us 358.145us 1
+ hf_kernels_rotary 9.30% 187.776us 99.78% 2.015ms 2.015ms 0.000us 0.00% 54.208us 54.208us 1
+ _rotary_dba7d1e::apply_rotary 2.06% 41.530us 4.25% 85.754us 14.292us 34.401us 67.02% 34.401us 5.734us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.401us 67.02% 34.401us 5.734us 6
+ aten::clone 1.47% 29.652us 84.14% 1.699ms 283.188us 0.000us 0.00% 19.807us 3.301us 6
+ aten::copy_ 1.88% 38.042us 81.10% 1.638ms 272.963us 16.927us 32.98% 19.807us 3.301us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.927us 32.98% 16.927us 2.821us 6
+ Activity Buffer Request 70.14% 1.416ms 70.14% 1.416ms 1.416ms 2.880us 5.61% 2.880us 2.880us 1
+ aten::empty_strided 1.57% 31.700us 1.57% 31.700us 5.283us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.08% 183.316us 9.08% 183.316us 30.553us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.65% 33.410us 2.09% 42.241us 3.520us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.44% 8.831us 0.44% 8.831us 0.736us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.19% 44.224us 2.19% 44.224us 7.371us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.22% 4.480us 0.22% 4.480us 4.480us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.019ms
+Self CUDA time total: 51.328us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 361.565us 334.59% 361.565us 361.565us 1
+ hf_kernels_rotary 8.80% 177.873us 99.76% 2.017ms 2.017ms 0.000us 0.00% 126.174us 126.174us 1
+ aten::clone 1.36% 27.530us 84.48% 1.708ms 284.721us 0.000us 0.00% 69.727us 11.621us 6
+ aten::copy_ 1.83% 37.081us 81.46% 1.647ms 274.541us 51.615us 47.76% 69.727us 11.621us 6
+ _rotary_dba7d1e::apply_rotary 2.15% 43.402us 4.34% 87.665us 14.611us 56.447us 52.24% 56.447us 9.408us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 56.447us 52.24% 56.447us 9.408us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.615us 47.76% 51.615us 8.603us 6
+ Activity Buffer Request 70.51% 1.426ms 70.51% 1.426ms 1.426ms 18.112us 16.76% 18.112us 18.112us 1
+ aten::empty_strided 1.66% 33.551us 1.66% 33.551us 5.592us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.12% 184.328us 9.12% 184.328us 30.721us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.73% 34.962us 2.15% 43.472us 3.623us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.42% 8.510us 0.42% 8.510us 0.709us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.19% 44.263us 2.19% 44.263us 7.377us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.24% 4.810us 0.24% 4.810us 4.810us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.022ms
+Self CUDA time total: 108.062us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 374.332us 209.83% 374.332us 374.332us 1
+ hf_kernels_rotary 8.69% 176.335us 99.78% 2.024ms 2.024ms 0.000us 0.00% 202.046us 202.046us 1
+ aten::clone 1.35% 27.382us 84.12% 1.707ms 284.468us 0.000us 0.00% 102.112us 17.019us 6
+ aten::copy_ 1.89% 38.342us 81.18% 1.647ms 274.513us 78.464us 43.98% 102.112us 17.019us 6
+ _rotary_dba7d1e::apply_rotary 2.26% 45.922us 4.48% 90.874us 15.146us 99.934us 56.02% 99.934us 16.656us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 99.934us 56.02% 99.934us 16.656us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.464us 43.98% 78.464us 13.077us 6
+ Activity Buffer Request 70.36% 1.428ms 70.36% 1.428ms 1.428ms 23.648us 13.26% 23.648us 23.648us 1
+ aten::empty_strided 1.59% 32.350us 1.59% 32.350us 5.392us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.93% 181.117us 8.93% 181.117us 30.186us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.68% 34.110us 2.48% 50.391us 4.199us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.80% 16.281us 0.80% 16.281us 1.357us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.22% 44.952us 2.22% 44.952us 7.492us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.22% 4.521us 0.22% 4.521us 4.521us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.029ms
+Self CUDA time total: 178.398us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 350.716us 1341.48% 350.716us 350.716us 1
+ hf_kernels_rotary 8.88% 178.684us 99.76% 2.007ms 2.007ms 0.000us 0.00% 27.264us 27.264us 1
+ _rotary_dba7d1e::apply_rotary 2.16% 43.370us 4.24% 85.224us 14.204us 19.393us 74.18% 19.393us 3.232us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.393us 74.18% 19.393us 3.232us 6
+ aten::clone 1.56% 31.330us 84.58% 1.702ms 283.596us 0.000us 0.00% 7.871us 1.312us 6
+ aten::copy_ 1.80% 36.292us 81.38% 1.637ms 272.881us 6.751us 25.82% 7.871us 1.312us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.751us 25.82% 6.751us 1.125us 6
+ Activity Buffer Request 70.41% 1.417ms 70.41% 1.417ms 1.417ms 1.120us 4.28% 1.120us 1.120us 1
+ aten::empty_strided 1.64% 32.961us 1.64% 32.961us 5.494us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.17% 184.457us 9.17% 184.457us 30.743us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.63% 32.712us 2.06% 41.532us 3.461us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.44% 8.820us 0.44% 8.820us 0.735us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.08% 41.854us 2.08% 41.854us 6.976us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.24% 4.830us 0.24% 4.830us 4.830us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.012ms
+Self CUDA time total: 26.144us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 377.307us 1377.44% 377.307us 377.307us 1
+ hf_kernels_rotary 21.29% 163.294us 99.28% 761.426us 761.426us 0.000us 0.00% 28.704us 28.704us 1
+ _rotary_dba7d1e::apply_rotary 5.68% 43.540us 11.49% 88.163us 14.694us 19.584us 71.50% 19.584us 3.264us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.584us 71.50% 19.584us 3.264us 6
+ aten::clone 3.08% 23.620us 60.95% 467.436us 77.906us 0.000us 0.00% 9.120us 1.520us 6
+ aten::copy_ 5.00% 38.311us 53.59% 411.005us 68.501us 7.808us 28.50% 9.120us 1.520us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 28.50% 7.808us 1.301us 6
+ Activity Buffer Request 21.08% 161.645us 21.08% 161.645us 161.645us 1.312us 4.79% 1.312us 1.312us 1
+ aten::empty_strided 4.28% 32.811us 4.28% 32.811us 5.468us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 27.52% 211.049us 27.52% 211.049us 35.175us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.33% 33.234us 5.55% 42.533us 3.544us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.21% 9.299us 1.21% 9.299us 0.775us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.82% 44.623us 5.82% 44.623us 7.437us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.72% 5.550us 0.72% 5.550us 5.550us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 766.976us
+Self CUDA time total: 27.392us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.153us 1234.28% 349.153us 349.153us 1
+ hf_kernels_rotary 19.50% 158.266us 99.38% 806.788us 806.788us 0.000us 0.00% 29.600us 29.600us 1
+ _rotary_dba7d1e::apply_rotary 5.36% 43.530us 10.78% 87.514us 14.586us 20.544us 72.62% 20.544us 3.424us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.544us 72.62% 20.544us 3.424us 6
+ aten::clone 2.63% 21.380us 63.75% 517.547us 86.258us 0.000us 0.00% 9.056us 1.509us 6
+ aten::copy_ 4.60% 37.352us 57.23% 464.607us 77.434us 7.744us 27.38% 9.056us 1.509us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 27.38% 7.744us 1.291us 6
+ Activity Buffer Request 29.79% 241.838us 29.79% 241.838us 241.838us 1.312us 4.64% 1.312us 1.312us 1
+ aten::empty_strided 3.89% 31.560us 3.89% 31.560us 5.260us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 22.84% 185.417us 22.84% 185.417us 30.903us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.24% 34.459us 5.35% 43.461us 3.622us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.11% 9.002us 1.11% 9.002us 0.750us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.42% 43.984us 5.42% 43.984us 7.331us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.62% 5.020us 0.62% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 811.808us
+Self CUDA time total: 28.288us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 348.027us 976.29% 348.027us 348.027us 1
+ hf_kernels_rotary 20.53% 156.455us 99.34% 757.166us 757.166us 0.000us 0.00% 37.440us 37.440us 1
+ _rotary_dba7d1e::apply_rotary 5.63% 42.881us 11.27% 85.894us 14.316us 25.184us 70.65% 25.184us 4.197us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.184us 70.65% 25.184us 4.197us 6
+ aten::clone 3.00% 22.853us 61.65% 469.877us 78.313us 0.000us 0.00% 12.256us 2.043us 6
+ aten::copy_ 4.74% 36.121us 54.50% 415.394us 69.232us 10.464us 29.35% 12.256us 2.043us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.464us 29.35% 10.464us 1.744us 6
+ Activity Buffer Request 25.88% 197.217us 25.88% 197.217us 197.217us 1.792us 5.03% 1.792us 1.792us 1
+ aten::empty_strided 4.15% 31.630us 4.15% 31.630us 5.272us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 23.89% 182.056us 23.89% 182.056us 30.343us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.53% 34.528us 5.90% 44.940us 3.745us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.37% 10.412us 1.37% 10.412us 0.868us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.64% 43.013us 5.64% 43.013us 7.169us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.66% 5.020us 0.66% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 762.186us
+Self CUDA time total: 35.648us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 346.012us 1220.37% 346.012us 346.012us 1
+ hf_kernels_rotary 19.32% 159.865us 99.40% 822.269us 822.269us 0.000us 0.00% 29.665us 29.665us 1
+ _rotary_dba7d1e::apply_rotary 5.23% 43.230us 10.32% 85.383us 14.231us 20.577us 72.57% 20.577us 3.429us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.577us 72.57% 20.577us 3.429us 6
+ aten::clone 2.67% 22.091us 64.52% 533.759us 88.960us 0.000us 0.00% 9.088us 1.515us 6
+ aten::copy_ 4.35% 36.002us 57.93% 479.208us 79.868us 7.776us 27.43% 9.088us 1.515us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 27.43% 7.776us 1.296us 6
+ Activity Buffer Request 31.47% 260.369us 31.47% 260.369us 260.369us 1.312us 4.63% 1.312us 1.312us 1
+ aten::empty_strided 3.92% 32.460us 3.92% 32.460us 5.410us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 22.10% 182.837us 22.10% 182.837us 30.473us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.12% 34.091us 5.23% 43.262us 3.605us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.11% 9.171us 1.11% 9.171us 0.764us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.10% 42.153us 5.10% 42.153us 7.026us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.60% 4.990us 0.60% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 827.259us
+Self CUDA time total: 28.353us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 379.611us 1063.04% 379.611us 379.611us 1
+ hf_kernels_rotary 17.54% 182.966us 99.53% 1.038ms 1.038ms 0.000us 0.00% 37.470us 37.470us 1
+ _rotary_dba7d1e::apply_rotary 4.31% 44.959us 8.52% 88.913us 14.819us 25.247us 70.70% 25.247us 4.208us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.247us 70.70% 25.247us 4.208us 6
+ aten::clone 2.14% 22.291us 69.13% 721.275us 120.212us 0.000us 0.00% 12.223us 2.037us 6
+ aten::copy_ 3.58% 37.312us 63.91% 666.784us 111.131us 10.463us 29.30% 12.223us 2.037us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.463us 29.30% 10.463us 1.744us 6
+ Activity Buffer Request 42.63% 444.746us 42.63% 444.746us 444.746us 1.760us 4.93% 1.760us 1.760us 1
+ aten::empty_strided 3.09% 32.200us 3.09% 32.200us 5.367us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 17.71% 184.726us 17.71% 184.726us 30.788us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 3.45% 36.000us 4.33% 45.221us 3.768us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.88% 9.221us 0.88% 9.221us 0.768us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 4.21% 43.954us 4.21% 43.954us 7.326us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.47% 4.940us 0.47% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.043ms
+Self CUDA time total: 35.710us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 350.330us 621.69% 350.330us 350.330us 1
+ hf_kernels_rotary 20.69% 166.654us 99.40% 800.657us 800.657us 0.000us 0.00% 59.231us 59.231us 1
+ _rotary_dba7d1e::apply_rotary 5.43% 43.738us 10.71% 86.292us 14.382us 39.327us 69.79% 39.327us 6.554us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.327us 69.79% 39.327us 6.554us 6
+ aten::clone 2.60% 20.920us 62.50% 503.467us 83.911us 0.000us 0.00% 19.904us 3.317us 6
+ aten::copy_ 4.42% 35.631us 55.79% 449.427us 74.904us 17.024us 30.21% 19.904us 3.317us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 30.21% 17.024us 2.837us 6
+ Activity Buffer Request 28.71% 231.299us 28.71% 231.299us 231.299us 2.880us 5.11% 2.880us 2.880us 1
+ aten::empty_strided 4.11% 33.120us 4.11% 33.120us 5.520us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 22.66% 182.497us 22.66% 182.497us 30.416us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.34% 34.964us 5.49% 44.244us 3.687us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.15% 9.280us 1.15% 9.280us 0.773us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.28% 42.554us 5.28% 42.554us 7.092us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.60% 4.850us 0.60% 4.850us 4.850us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 805.507us
+Self CUDA time total: 56.351us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 363.291us 308.26% 363.291us 363.291us 1
+ hf_kernels_rotary 19.60% 166.384us 99.43% 844.179us 844.179us 0.000us 0.00% 134.846us 134.846us 1
+ aten::clone 2.55% 21.670us 64.54% 547.969us 91.328us 0.000us 0.00% 70.143us 11.691us 6
+ aten::copy_ 4.54% 38.561us 58.31% 495.019us 82.503us 53.151us 45.10% 70.143us 11.691us 6
+ _rotary_dba7d1e::apply_rotary 4.97% 42.172us 10.27% 87.155us 14.526us 64.703us 54.90% 64.703us 10.784us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 64.703us 54.90% 64.703us 10.784us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.151us 45.10% 53.151us 8.859us 6
+ Activity Buffer Request 32.22% 273.530us 32.22% 273.530us 273.530us 16.992us 14.42% 16.992us 16.992us 1
+ aten::empty_strided 3.68% 31.280us 3.68% 31.280us 5.213us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 21.55% 182.928us 21.55% 182.928us 30.488us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 3.96% 33.580us 5.03% 42.671us 3.556us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.07% 9.091us 1.07% 9.091us 0.758us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 5.30% 44.983us 5.30% 44.983us 7.497us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.57% 4.820us 0.57% 4.820us 4.820us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 848.999us
+Self CUDA time total: 117.854us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 370.462us 657.41% 370.462us 370.462us 1
+ hf_kernels_rotary 9.39% 189.846us 99.77% 2.018ms 2.018ms 0.000us 0.00% 59.200us 59.200us 1
+ _rotary_dba7d1e::apply_rotary 2.15% 43.502us 4.33% 87.525us 14.588us 39.360us 69.85% 39.360us 6.560us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.360us 69.85% 39.360us 6.560us 6
+ aten::clone 1.41% 28.463us 83.80% 1.695ms 282.475us 0.000us 0.00% 19.840us 3.307us 6
+ aten::copy_ 1.87% 37.890us 80.77% 1.634ms 272.251us 16.992us 30.15% 19.840us 3.307us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 30.15% 16.992us 2.832us 6
+ Activity Buffer Request 69.77% 1.411ms 69.77% 1.411ms 1.411ms 2.848us 5.05% 2.848us 2.848us 1
+ aten::empty_strided 1.63% 32.881us 1.63% 32.881us 5.480us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.13% 184.676us 9.13% 184.676us 30.779us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.76% 35.550us 2.25% 45.480us 3.790us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.49% 9.930us 0.49% 9.930us 0.827us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.18% 44.023us 2.18% 44.023us 7.337us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.23% 4.690us 0.23% 4.690us 4.690us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.022ms
+Self CUDA time total: 56.352us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 359.680us 306.26% 359.680us 359.680us 1
+ hf_kernels_rotary 9.06% 182.622us 99.75% 2.011ms 2.011ms 0.000us 0.00% 134.753us 134.753us 1
+ aten::clone 1.36% 27.350us 84.30% 1.700ms 283.278us 0.000us 0.00% 70.114us 11.686us 6
+ aten::copy_ 1.85% 37.232us 81.34% 1.640ms 273.341us 52.802us 44.96% 70.114us 11.686us 6
+ _rotary_dba7d1e::apply_rotary 2.09% 42.192us 4.26% 85.926us 14.321us 64.639us 55.04% 64.639us 10.773us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 64.639us 55.04% 64.639us 10.773us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.802us 44.96% 52.802us 8.800us 6
+ Activity Buffer Request 70.45% 1.420ms 70.45% 1.420ms 1.420ms 17.312us 14.74% 17.312us 17.312us 1
+ aten::empty_strided 1.60% 32.271us 1.60% 32.271us 5.379us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.05% 182.507us 9.05% 182.507us 30.418us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 1.67% 33.712us 2.12% 42.832us 3.569us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.45% 9.120us 0.45% 9.120us 0.760us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 2.17% 43.734us 2.17% 43.734us 7.289us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.25% 5.130us 0.25% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.016ms
+Self CUDA time total: 117.441us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 363.604us 186.68% 363.604us 363.604us 1
+ hf_kernels_rotary 18.95% 159.454us 99.42% 836.628us 836.628us 0.000us 0.00% 218.425us 218.425us 1
+ _rotary_dba7d1e::apply_rotary 5.11% 42.982us 10.01% 84.264us 14.044us 114.460us 58.76% 114.460us 19.077us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 114.460us 58.76% 114.460us 19.077us 6
+ aten::clone 2.64% 22.190us 65.28% 549.368us 91.561us 0.000us 0.00% 103.965us 17.328us 6
+ aten::copy_ 4.30% 36.168us 58.92% 495.836us 82.639us 80.318us 41.24% 103.965us 17.328us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 80.318us 41.24% 80.318us 13.386us 6
+ Activity Buffer Request 32.31% 271.900us 32.31% 271.900us 271.900us 23.647us 12.14% 23.647us 23.647us 1
+ aten::empty_strided 3.72% 31.342us 3.72% 31.342us 5.224us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 22.31% 187.768us 22.31% 187.768us 31.295us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.01% 33.772us 5.17% 43.542us 3.628us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 1.16% 9.770us 1.16% 9.770us 0.814us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 4.91% 41.282us 4.91% 41.282us 6.880us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 0.58% 4.880us 0.58% 4.880us 4.880us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 841.508us
+Self CUDA time total: 194.778us
+
+
+
+======================================================================
+PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_rotary 13.69% 161.817us 65.35% 772.637us 772.637us 0.000us 0.00% 853.016us 853.016us 1
+ hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 794.618us 101.00% 794.618us 794.618us 1
+ aten::clone 1.91% 22.540us 40.85% 482.956us 80.493us 0.000us 0.00% 580.923us 96.820us 6
+ aten::copy_ 3.05% 36.119us 36.34% 429.636us 71.606us 514.652us 65.42% 580.923us 96.820us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 514.652us 65.42% 514.652us 85.775us 6
+ _rotary_dba7d1e::apply_rotary 3.53% 41.772us 7.15% 84.524us 14.087us 272.093us 34.58% 272.093us 45.349us 6
+void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 272.093us 34.58% 272.093us 45.349us 6
+ Activity Buffer Request 17.75% 209.918us 17.75% 209.918us 209.918us 66.271us 8.42% 66.271us 66.271us 1
+ aten::empty_strided 2.60% 30.780us 2.60% 30.780us 5.130us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.53% 183.599us 15.53% 183.599us 30.600us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.92% 34.511us 3.67% 43.340us 3.612us 0.000us 0.00% 0.000us 0.000us 12
+ aten::as_strided 0.75% 8.829us 0.75% 8.829us 0.736us 0.000us 0.00% 0.000us 0.000us 12
+ cudaLaunchKernel 3.62% 42.752us 3.62% 42.752us 7.125us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 34.65% 409.744us 34.65% 409.744us 409.744us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.182ms
+Self CUDA time total: 786.745us
+
+
+impl wl p50(ms) ok
+hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False
+hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.10 False
+hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False
+hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False
+hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.10 False
+hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 False
+hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 False
+hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 False
+hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 False
+hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 False
+hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09 False
+hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.09 False
+hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09 False
+hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 False
+hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 False
+hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 False
+hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.28 False
+hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.10 False
+hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 False
+hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 False
+hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 False
+hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.10 False
+hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 False
+hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False
+
+
+
+
+Installed 15 packages in 15ms
+
+
+
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]
+Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14.09it/s]
+Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14.09it/s]
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/rotary/impls/index.html b/rotary/impls/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..02ebba5c766f3ff734bd85f035b2321b75e143f7
--- /dev/null
+++ b/rotary/impls/index.html
@@ -0,0 +1,89 @@
+
+
+
+
+
+ Index of /rotary/impls
+
+
+
+
+ Index of /rotary/impls
+
+
+
\ No newline at end of file
diff --git a/rotary/impls/torch_rotary.html b/rotary/impls/torch_rotary.html
new file mode 100644
index 0000000000000000000000000000000000000000..aa154efd76b2499f7b4f91ee8db4a21e33418431
--- /dev/null
+++ b/rotary/impls/torch_rotary.html
@@ -0,0 +1,4756 @@
+
+
+
+
+
+ torch_rotary
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
PyTorch Native - Rotary Position Embeddings
+
GPU Info
+
+
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+
+
+
+
+
+
Wed Oct 29 00:36:23 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
+|-----------------------------------------+------------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+========================+======================|
+| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+| N/A 31C P0 86W / 350W | 0MiB / 46068MiB | 22% Default |
+| | | N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=========================================================================================|
+| No running processes found |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+
+
Rotary Embeddings Benchmark (PyTorch Native)
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def apply_rotary_torch(x1, x2, cos, sin, conj=False):
+ """Reference rotary implementation."""
+ if not conj:
+ out1 = x1 * cos - x2 * sin
+ out2 = x1 * sin + x2 * cos
+ else:
+ out1 = x1 * cos + x2 * sin
+ out2 = -x1 * sin + x2 * cos
+ return out1, out2
+
+
+def torch_rotary(query, key, cos, sin, conj=False):
+ rotary_dim = cos.shape[-1]
+
+ # Clone inputs to avoid modifying them
+ q_out = query.clone()
+ k_out = key.clone()
+
+ # Apply rotation to query
+ q1 = q_out[..., :rotary_dim]
+ q2 = q_out[..., rotary_dim : 2 * rotary_dim]
+ q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
+ q_out[..., :rotary_dim] = q_out_1
+ q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2
+
+ # Apply rotation to key
+ k1 = k_out[..., :rotary_dim]
+ k2 = k_out[..., rotary_dim : 2 * rotary_dim]
+ k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
+ k_out[..., :rotary_dim] = k_out_1
+ k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2
+
+ return q_out, k_out
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ROTARY,
+ impl_name="torch_eager",
+ impl_tags={"family": "pytorch", "backend": "eager"},
+ impl_func=torch_rotary,
+)
+
+
+
+
+
+
+
Running rotary benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.078ms 1207.69% 1.078ms 1.078ms 1
+ torch_eager 14.52% 400.522us 99.68% 2.750ms 2.750ms 0.000us 0.00% 90.462us 90.462us 1
+ aten::mul 6.17% 170.271us 10.64% 293.512us 12.230us 46.944us 52.60% 46.944us 1.956us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.944us 52.60% 46.944us 1.956us 24
+ aten::copy_ 4.22% 116.515us 62.04% 1.711ms 95.079us 28.991us 32.48% 30.207us 1.678us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.335us 25.03% 22.335us 1.861us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.311us 14.91% 13.311us 1.109us 12
+ aten::clone 1.56% 42.898us 60.57% 1.671ms 278.496us 0.000us 0.00% 7.872us 1.312us 6
+ aten::sub 1.56% 43.002us 2.52% 69.413us 11.569us 6.688us 7.49% 6.688us 1.115us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 7.46% 6.656us 1.109us 6
+ aten::add 1.26% 34.801us 2.08% 57.392us 9.565us 6.623us 7.42% 6.623us 1.104us 6
+ Activity Buffer Request 52.61% 1.451ms 52.61% 1.451ms 1.451ms 1.216us 1.36% 1.216us 1.216us 1
+ aten::empty_strided 2.18% 60.243us 2.18% 60.243us 10.040us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 2.73% 75.213us 2.73% 75.213us 12.535us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 3.16% 87.293us 4.15% 114.414us 4.767us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.98% 27.121us 0.98% 27.121us 1.130us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.73% 240.716us 8.73% 240.716us 5.015us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.32% 8.731us 0.32% 8.731us 8.731us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.759ms
+Self CUDA time total: 89.246us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 965.083us 1066.40% 965.083us 965.083us 1
+ torch_eager 12.32% 311.423us 99.78% 2.522ms 2.522ms 0.000us 0.00% 91.619us 91.619us 1
+ aten::mul 6.03% 152.369us 10.74% 271.352us 11.306us 47.843us 52.87% 47.843us 1.993us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.843us 52.87% 47.843us 1.993us 24
+ aten::copy_ 4.27% 107.805us 66.47% 1.680ms 93.342us 29.280us 32.35% 30.400us 1.689us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.528us 24.89% 22.528us 1.877us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.376us 14.78% 13.376us 1.115us 12
+ aten::clone 0.93% 23.570us 63.24% 1.599ms 266.432us 0.000us 0.00% 7.872us 1.312us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 7.46% 6.752us 1.125us 6
+ aten::sub 1.46% 36.933us 2.41% 60.984us 10.164us 6.720us 7.43% 6.720us 1.120us 6
+ aten::add 1.19% 30.203us 2.05% 51.743us 8.624us 6.656us 7.35% 6.656us 1.109us 6
+ Activity Buffer Request 57.43% 1.452ms 57.43% 1.452ms 1.452ms 1.120us 1.24% 1.120us 1.120us 1
+ aten::empty_strided 1.24% 31.422us 1.24% 31.422us 5.237us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 2.19% 55.410us 2.19% 55.410us 9.235us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.81% 71.135us 3.62% 91.386us 3.808us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.80% 20.251us 0.80% 20.251us 0.844us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 9.10% 229.986us 9.10% 229.986us 4.791us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.22% 5.600us 0.22% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.528ms
+Self CUDA time total: 90.499us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 958.300us 1018.96% 958.300us 958.300us 1
+ torch_eager 12.46% 312.732us 99.79% 2.504ms 2.504ms 0.000us 0.00% 95.391us 95.391us 1
+ aten::mul 5.95% 149.403us 10.59% 265.726us 11.072us 48.799us 51.89% 48.799us 2.033us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.799us 51.89% 48.799us 2.033us 24
+ aten::copy_ 4.14% 103.773us 66.47% 1.668ms 92.665us 30.815us 32.77% 32.159us 1.787us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.976us 24.43% 22.976us 1.915us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.433us 15.35% 14.433us 1.203us 12
+ aten::clone 0.91% 22.712us 63.16% 1.585ms 264.144us 0.000us 0.00% 9.183us 1.530us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.839us 8.34% 7.839us 1.306us 6
+ aten::sub 1.38% 34.722us 2.34% 58.713us 9.786us 7.233us 7.69% 7.233us 1.206us 6
+ aten::add 1.22% 30.569us 2.11% 52.831us 8.805us 7.200us 7.66% 7.200us 1.200us 6
+ Activity Buffer Request 57.45% 1.442ms 57.45% 1.442ms 1.442ms 1.344us 1.43% 1.344us 1.344us 1
+ aten::empty_strided 1.24% 31.042us 1.24% 31.042us 5.174us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 2.19% 55.002us 2.19% 55.002us 9.167us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.81% 70.525us 3.67% 92.215us 3.842us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.86% 21.690us 0.86% 21.690us 0.904us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 9.17% 230.176us 9.17% 230.176us 4.795us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.21% 5.390us 0.21% 5.390us 5.390us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.509ms
+Self CUDA time total: 94.047us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 950.973us 939.57% 950.973us 950.973us 1
+ torch_eager 12.66% 301.065us 99.78% 2.372ms 2.372ms 0.000us 0.00% 102.526us 102.526us 1
+ aten::mul 6.27% 149.075us 11.17% 265.677us 11.070us 52.831us 52.20% 52.831us 2.201us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.831us 52.20% 52.831us 2.201us 24
+ aten::copy_ 4.35% 103.343us 65.02% 1.546ms 85.882us 32.383us 31.99% 33.695us 1.872us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.639us 24.34% 24.639us 2.053us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.000us 15.81% 16.000us 1.333us 12
+ aten::clone 0.92% 21.771us 61.57% 1.464ms 243.982us 0.000us 0.00% 9.056us 1.509us 6
+ aten::add 1.30% 30.988us 2.24% 53.211us 8.868us 8.001us 7.91% 8.001us 1.333us 6
+ aten::sub 1.50% 35.711us 2.60% 61.701us 10.284us 7.999us 7.90% 7.999us 1.333us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.65% 7.744us 1.291us 6
+ Activity Buffer Request 45.51% 1.082ms 45.51% 1.082ms 1.082ms 1.312us 1.30% 1.312us 1.312us 1
+ aten::empty_strided 1.34% 31.870us 1.34% 31.870us 5.312us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 12.33% 293.180us 12.33% 293.180us 48.863us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.97% 70.623us 3.83% 91.173us 3.799us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.86% 20.550us 0.86% 20.550us 0.856us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 9.76% 232.079us 9.76% 232.079us 4.835us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.22% 5.310us 0.22% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.378ms
+Self CUDA time total: 101.214us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 979.418us 1041.05% 979.418us 979.418us 1
+ torch_eager 12.27% 347.559us 99.79% 2.828ms 2.828ms 0.000us 0.00% 95.392us 95.392us 1
+ aten::mul 5.36% 151.975us 9.52% 269.888us 11.245us 49.087us 52.18% 49.087us 2.045us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.087us 52.18% 49.087us 2.045us 24
+ aten::copy_ 3.87% 109.552us 68.68% 1.946ms 108.124us 30.817us 32.76% 32.129us 1.785us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.945us 24.39% 22.945us 1.912us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.176us 15.07% 14.176us 1.181us 12
+ aten::clone 0.99% 27.952us 65.99% 1.870ms 311.676us 0.000us 0.00% 9.184us 1.531us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 8.37% 7.872us 1.312us 6
+ aten::add 1.09% 30.843us 1.89% 53.454us 8.909us 7.104us 7.55% 7.104us 1.184us 6
+ aten::sub 1.23% 34.731us 2.14% 60.592us 10.099us 7.072us 7.52% 7.072us 1.179us 6
+ Activity Buffer Request 50.62% 1.434ms 50.62% 1.434ms 1.434ms 1.312us 1.39% 1.312us 1.312us 1
+ aten::empty_strided 1.13% 31.881us 1.13% 31.881us 5.314us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 11.86% 336.003us 11.86% 336.003us 56.000us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.47% 69.892us 3.18% 90.023us 3.751us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.71% 20.131us 0.71% 20.131us 0.839us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.21% 232.618us 8.21% 232.618us 4.846us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.21% 6.050us 0.21% 6.050us 6.050us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.834ms
+Self CUDA time total: 94.080us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 948.701us 937.33% 948.701us 948.701us 1
+ torch_eager 11.33% 313.022us 99.82% 2.758ms 2.758ms 0.000us 0.00% 102.525us 102.525us 1
+ aten::mul 5.41% 149.533us 9.55% 263.868us 10.995us 52.638us 52.01% 52.638us 2.193us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.638us 52.01% 52.638us 2.193us 24
+ aten::copy_ 3.84% 106.183us 69.66% 1.925ms 106.940us 32.512us 32.12% 33.824us 1.879us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.736us 24.44% 24.736us 2.061us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.063us 15.87% 16.063us 1.339us 12
+ aten::clone 0.85% 23.503us 66.72% 1.844ms 307.294us 0.000us 0.00% 9.088us 1.515us 6
+ aten::add 1.12% 31.031us 1.92% 53.131us 8.855us 8.064us 7.97% 8.064us 1.344us 6
+ aten::sub 1.28% 35.373us 2.17% 59.842us 9.974us 7.999us 7.90% 7.999us 1.333us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 7.68% 7.776us 1.296us 6
+ Activity Buffer Request 51.85% 1.433ms 51.85% 1.433ms 1.433ms 1.312us 1.30% 1.312us 1.312us 1
+ aten::empty_strided 1.12% 30.890us 1.12% 30.890us 5.148us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 11.57% 319.641us 11.57% 319.641us 53.273us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.48% 68.444us 3.23% 89.144us 3.714us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.75% 20.700us 0.75% 20.700us 0.863us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.22% 227.175us 8.22% 227.175us 4.733us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.18% 5.010us 0.18% 5.010us 5.010us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.763ms
+Self CUDA time total: 101.213us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 959.141us 795.45% 959.141us 959.141us 1
+ torch_eager 11.32% 313.154us 99.78% 2.759ms 2.759ms 0.000us 0.00% 122.369us 122.369us 1
+ aten::mul 5.48% 151.445us 9.68% 267.778us 11.157us 61.986us 51.41% 61.986us 2.583us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.986us 51.41% 61.986us 2.583us 24
+ aten::copy_ 3.75% 103.760us 69.32% 1.917ms 106.485us 39.329us 32.62% 41.120us 2.284us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.898us 23.97% 28.898us 2.408us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.263us 15.98% 19.263us 1.605us 12
+ aten::clone 0.88% 24.203us 66.44% 1.837ms 306.209us 0.000us 0.00% 12.222us 2.037us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.431us 8.65% 10.431us 1.739us 6
+ aten::add 1.09% 30.212us 1.88% 52.093us 8.682us 9.695us 8.04% 9.695us 1.616us 6
+ aten::sub 1.36% 37.662us 2.24% 62.032us 10.339us 9.568us 7.94% 9.568us 1.595us 6
+ Activity Buffer Request 52.07% 1.440ms 52.07% 1.440ms 1.440ms 1.791us 1.49% 1.791us 1.791us 1
+ aten::empty_strided 1.11% 30.761us 1.11% 30.761us 5.127us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 11.08% 306.470us 11.08% 306.470us 51.078us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.59% 71.623us 3.35% 92.502us 3.854us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.76% 20.879us 0.76% 20.879us 0.870us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.29% 229.176us 8.29% 229.176us 4.774us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.22% 5.960us 0.22% 5.960us 5.960us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.765ms
+Self CUDA time total: 120.578us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 936.986us 546.18% 936.986us 936.986us 1
+ torch_eager 19.74% 302.858us 99.67% 1.529ms 1.529ms 0.000us 0.00% 174.370us 174.370us 1
+ aten::mul 9.62% 147.674us 16.98% 260.548us 10.856us 89.250us 52.02% 89.250us 3.719us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.250us 52.02% 89.250us 3.719us 24
+ aten::copy_ 6.71% 102.945us 46.27% 710.024us 39.446us 57.601us 33.58% 60.418us 3.357us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.543us 23.63% 40.543us 3.379us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.702us 14.40% 24.702us 2.059us 12
+ aten::clone 1.46% 22.434us 41.15% 631.323us 105.220us 0.000us 0.00% 19.875us 3.312us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.058us 9.94% 17.058us 2.843us 6
+ aten::sub 2.30% 35.263us 3.87% 59.363us 9.894us 12.352us 7.20% 12.352us 2.059us 6
+ aten::add 1.99% 30.582us 3.46% 53.142us 8.857us 12.350us 7.20% 12.350us 2.058us 6
+ Activity Buffer Request 16.56% 254.079us 16.56% 254.079us 254.079us 2.817us 1.64% 2.817us 2.817us 1
+ aten::empty_strided 1.94% 29.811us 1.94% 29.811us 4.968us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 18.86% 289.319us 18.86% 289.319us 48.220us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.62% 70.853us 5.94% 91.142us 3.798us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.32% 20.289us 1.32% 20.289us 0.845us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 14.55% 223.215us 14.55% 223.215us 4.650us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.33% 5.040us 0.33% 5.040us 5.040us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.534ms
+Self CUDA time total: 171.553us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 965.148us 800.00% 965.148us 965.148us 1
+ torch_eager 19.51% 299.410us 99.63% 1.529ms 1.529ms 0.000us 0.00% 122.467us 122.467us 1
+ aten::mul 9.83% 150.825us 17.48% 268.249us 11.177us 62.048us 51.43% 62.048us 2.585us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.048us 51.43% 62.048us 2.585us 24
+ aten::copy_ 7.55% 115.928us 45.67% 700.806us 38.934us 39.490us 32.73% 41.314us 2.295us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.961us 24.01% 28.961us 2.413us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.105us 15.84% 19.105us 1.592us 12
+ aten::clone 1.36% 20.940us 39.52% 606.529us 101.088us 0.000us 0.00% 12.353us 2.059us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.529us 8.73% 10.529us 1.755us 6
+ aten::add 2.06% 31.661us 3.57% 54.801us 9.133us 9.568us 7.93% 9.568us 1.595us 6
+ aten::sub 2.41% 36.983us 4.07% 62.503us 10.417us 9.537us 7.91% 9.537us 1.589us 6
+ Activity Buffer Request 16.28% 249.768us 16.28% 249.768us 249.768us 1.824us 1.51% 1.824us 1.824us 1
+ aten::empty_strided 1.98% 30.440us 1.98% 30.440us 5.073us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 17.54% 269.148us 17.54% 269.148us 44.858us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.63% 71.053us 5.99% 91.854us 3.827us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.36% 20.801us 1.36% 20.801us 0.867us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.12% 232.046us 15.12% 232.046us 4.834us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.37% 5.660us 0.37% 5.660us 5.660us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.535ms
+Self CUDA time total: 120.643us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 953.270us 555.27% 953.270us 953.270us 1
+ torch_eager 11.16% 301.267us 99.78% 2.693ms 2.693ms 0.000us 0.00% 174.555us 174.555us 1
+ aten::mul 5.45% 147.123us 9.90% 267.330us 11.139us 88.990us 51.84% 88.990us 3.708us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 88.990us 51.84% 88.990us 3.708us 24
+ aten::copy_ 3.86% 104.254us 69.09% 1.865ms 103.603us 57.726us 33.62% 60.605us 3.367us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.575us 23.63% 40.575us 3.381us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.960us 14.54% 24.960us 2.080us 12
+ aten::clone 0.88% 23.712us 66.26% 1.789ms 298.097us 0.000us 0.00% 20.030us 3.338us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.151us 9.99% 17.151us 2.858us 6
+ aten::add 1.11% 29.833us 1.90% 51.253us 8.542us 12.512us 7.29% 12.512us 2.085us 6
+ aten::sub 1.37% 36.961us 2.28% 61.643us 10.274us 12.448us 7.25% 12.448us 2.075us 6
+ Activity Buffer Request 53.11% 1.433ms 53.11% 1.433ms 1.433ms 2.879us 1.68% 2.879us 2.879us 1
+ aten::empty_strided 1.15% 30.972us 1.15% 30.972us 5.162us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.79% 264.150us 9.79% 264.150us 44.025us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.62% 70.779us 3.41% 92.149us 3.840us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.79% 21.370us 0.79% 21.370us 0.890us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.50% 229.301us 8.50% 229.301us 4.777us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.22% 6.011us 0.22% 6.011us 6.011us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.699ms
+Self CUDA time total: 171.676us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 943.705us 333.64% 943.705us 943.705us 1
+ torch_eager 19.68% 292.650us 99.63% 1.482ms 1.482ms 0.000us 0.00% 301.376us 301.376us 1
+ aten::mul 9.80% 145.836us 17.38% 258.449us 10.769us 132.447us 46.83% 132.447us 5.519us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 132.447us 46.83% 132.447us 5.519us 24
+ aten::copy_ 7.01% 104.213us 45.19% 672.153us 37.342us 109.183us 38.60% 127.711us 7.095us 18
+ aten::clone 1.46% 21.712us 39.66% 589.861us 98.310us 0.000us 0.00% 70.593us 11.766us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.118us 20.19% 57.118us 4.760us 12
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.065us 18.41% 52.065us 8.678us 6
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.218us 14.57% 41.218us 3.435us 12
+ aten::sub 2.40% 35.641us 4.03% 59.963us 9.994us 20.704us 7.32% 20.704us 3.451us 6
+ aten::add 2.14% 31.871us 3.63% 53.951us 8.992us 20.514us 7.25% 20.514us 3.419us 6
+ Activity Buffer Request 16.79% 249.768us 16.79% 249.768us 249.768us 18.528us 6.55% 18.528us 18.528us 1
+ aten::empty_strided 2.08% 30.950us 2.08% 30.950us 5.158us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 17.02% 253.139us 17.02% 253.139us 42.190us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.80% 71.455us 6.19% 92.023us 3.834us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.38% 20.568us 1.38% 20.568us 0.857us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.06% 224.048us 15.06% 224.048us 4.668us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.37% 5.550us 0.37% 5.550us 5.550us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.487ms
+Self CUDA time total: 282.848us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.008ms 178.28% 1.008ms 1.008ms 1
+ torch_eager 20.19% 309.543us 99.64% 1.528ms 1.528ms 0.000us 0.00% 589.177us 589.177us 1
+ aten::copy_ 6.92% 106.132us 42.74% 655.343us 36.408us 274.429us 48.53% 298.108us 16.562us 18
+ aten::mul 10.35% 158.718us 18.57% 284.772us 11.866us 225.374us 39.85% 225.374us 9.391us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 225.374us 39.85% 225.374us 9.391us 24
+ aten::clone 1.37% 21.073us 36.86% 565.269us 94.211us 0.000us 0.00% 207.356us 34.559us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 183.677us 32.48% 183.677us 30.613us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.752us 16.05% 90.752us 7.563us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.695us 11.62% 65.695us 5.475us 12
+ aten::sub 2.38% 36.444us 4.07% 62.445us 10.407us 33.376us 5.90% 33.376us 5.563us 6
+ aten::add 2.04% 31.281us 3.53% 54.151us 9.025us 32.319us 5.72% 32.319us 5.387us 6
+ Activity Buffer Request 15.09% 231.317us 15.09% 231.317us 231.317us 23.679us 4.19% 23.679us 23.679us 1
+ aten::empty_strided 1.92% 29.470us 1.92% 29.470us 4.912us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 16.26% 249.288us 16.26% 249.288us 41.548us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 5.79% 88.836us 7.24% 111.045us 4.627us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.45% 22.209us 1.45% 22.209us 0.925us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.88% 243.531us 15.88% 243.531us 5.074us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.36% 5.560us 0.36% 5.560us 5.560us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.533ms
+Self CUDA time total: 565.498us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 978.269us 1057.44% 978.269us 978.269us 1
+ torch_eager 11.40% 307.808us 99.80% 2.694ms 2.694ms 0.000us 0.00% 93.633us 93.633us 1
+ aten::mul 5.67% 153.175us 10.04% 271.038us 11.293us 49.695us 53.72% 49.695us 2.071us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.695us 53.72% 49.695us 2.071us 24
+ aten::copy_ 3.91% 105.435us 68.36% 1.845ms 102.519us 29.377us 31.75% 30.497us 1.694us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.625us 24.46% 22.625us 1.885us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.441us 14.53% 13.441us 1.120us 12
+ aten::clone 0.87% 23.532us 65.28% 1.762ms 293.710us 0.000us 0.00% 7.872us 1.312us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 7.30% 6.752us 1.125us 6
+ aten::sub 1.66% 44.684us 2.62% 70.744us 11.791us 6.721us 7.26% 6.721us 1.120us 6
+ aten::add 1.10% 29.730us 1.92% 51.961us 8.660us 6.720us 7.26% 6.720us 1.120us 6
+ Activity Buffer Request 52.82% 1.426ms 52.82% 1.426ms 1.426ms 1.120us 1.21% 1.120us 1.120us 1
+ aten::empty_strided 1.12% 30.311us 1.12% 30.311us 5.052us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 9.14% 246.769us 9.14% 246.769us 41.128us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.68% 72.284us 3.46% 93.345us 3.889us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.78% 21.061us 0.78% 21.061us 0.878us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.64% 233.365us 8.64% 233.365us 4.862us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.20% 5.520us 0.20% 5.520us 5.520us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.700ms
+Self CUDA time total: 92.513us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 938.603us 972.86% 938.603us 938.603us 1
+ torch_eager 19.93% 287.519us 99.56% 1.436ms 1.436ms 0.000us 0.00% 97.823us 97.823us 1
+ aten::mul 10.40% 150.056us 18.25% 263.188us 10.966us 51.362us 53.24% 51.362us 2.140us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.362us 53.24% 51.362us 2.140us 24
+ aten::copy_ 7.16% 103.273us 43.48% 627.121us 34.840us 30.911us 32.04% 32.255us 1.792us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.008us 23.85% 23.008us 1.917us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.206us 14.72% 14.206us 1.184us 12
+ aten::clone 1.50% 21.587us 37.74% 544.337us 90.723us 0.000us 0.00% 9.247us 1.541us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.903us 8.19% 7.903us 1.317us 6
+ aten::sub 2.45% 35.381us 4.12% 59.382us 9.897us 7.103us 7.36% 7.103us 1.184us 6
+ aten::add 2.21% 31.862us 3.86% 55.642us 9.274us 7.103us 7.36% 7.103us 1.184us 6
+ Activity Buffer Request 14.93% 215.407us 14.93% 215.407us 215.407us 1.344us 1.39% 1.344us 1.344us 1
+ aten::empty_strided 2.08% 30.053us 2.08% 30.053us 5.009us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 16.77% 241.899us 16.77% 241.899us 40.317us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.91% 70.826us 6.34% 91.477us 3.812us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.43% 20.651us 1.43% 20.651us 0.860us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.77% 227.455us 15.77% 227.455us 4.739us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.44% 6.400us 0.44% 6.400us 6.400us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.442ms
+Self CUDA time total: 96.479us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 982.735us 947.89% 982.735us 982.735us 1
+ torch_eager 20.01% 296.499us 99.62% 1.476ms 1.476ms 0.000us 0.00% 105.019us 105.019us 1
+ aten::mul 11.54% 171.043us 19.63% 290.918us 12.122us 55.326us 53.36% 55.326us 2.305us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.326us 53.36% 55.326us 2.305us 24
+ aten::copy_ 7.11% 105.421us 42.19% 625.268us 34.737us 32.415us 31.27% 33.758us 1.875us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.672us 23.80% 24.672us 2.056us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.935us 15.37% 15.935us 1.328us 12
+ aten::clone 1.46% 21.629us 36.49% 540.678us 90.113us 0.000us 0.00% 9.086us 1.514us 6
+ aten::sub 2.54% 37.603us 4.22% 62.603us 10.434us 7.968us 7.69% 7.968us 1.328us 6
+ aten::add 2.14% 31.750us 3.67% 54.341us 9.057us 7.967us 7.68% 7.967us 1.328us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.743us 7.47% 7.743us 1.291us 6
+ Activity Buffer Request 14.41% 213.507us 14.41% 213.507us 213.507us 1.343us 1.30% 1.343us 1.343us 1
+ aten::empty_strided 2.05% 30.383us 2.05% 30.383us 5.064us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 16.24% 240.608us 16.24% 240.608us 40.101us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.91% 72.718us 6.38% 94.560us 3.940us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.47% 21.842us 1.47% 21.842us 0.910us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.74% 233.198us 15.74% 233.198us 4.858us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.38% 5.681us 0.38% 5.681us 5.681us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.482ms
+Self CUDA time total: 103.676us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 979.903us 792.35% 979.903us 979.903us 1
+ torch_eager 11.44% 307.736us 99.80% 2.685ms 2.685ms 0.000us 0.00% 125.495us 125.495us 1
+ aten::mul 5.76% 155.021us 10.44% 280.767us 11.699us 65.018us 52.57% 65.018us 2.709us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.018us 52.57% 65.018us 2.709us 24
+ aten::copy_ 4.05% 108.834us 68.12% 1.833ms 101.807us 39.389us 31.85% 41.213us 2.290us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.830us 23.31% 28.830us 2.403us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.264us 15.58% 19.264us 1.605us 12
+ aten::clone 0.88% 23.603us 65.09% 1.751ms 291.863us 0.000us 0.00% 12.383us 2.064us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.559us 8.54% 10.559us 1.760us 6
+ aten::sub 1.31% 35.349us 2.21% 59.490us 9.915us 9.633us 7.79% 9.633us 1.606us 6
+ aten::add 1.16% 31.200us 1.98% 53.350us 8.892us 9.631us 7.79% 9.631us 1.605us 6
+ Activity Buffer Request 52.85% 1.422ms 52.85% 1.422ms 1.422ms 1.824us 1.47% 1.824us 1.824us 1
+ aten::empty_strided 1.16% 31.331us 1.16% 31.331us 5.222us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.81% 236.968us 8.81% 236.968us 39.495us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.73% 73.381us 3.57% 95.963us 3.998us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.84% 22.582us 0.84% 22.582us 0.941us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.81% 236.979us 8.81% 236.979us 4.937us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.20% 5.460us 0.20% 5.460us 5.460us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.690ms
+Self CUDA time total: 123.671us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 945.998us 913.29% 945.998us 945.998us 1
+ torch_eager 20.62% 293.766us 99.60% 1.419ms 1.419ms 0.000us 0.00% 104.893us 104.893us 1
+ aten::mul 10.57% 150.564us 18.69% 266.299us 11.096us 55.198us 53.29% 55.198us 2.300us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.198us 53.29% 55.198us 2.300us 24
+ aten::copy_ 7.32% 104.233us 42.25% 601.777us 33.432us 32.416us 31.30% 33.728us 1.874us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.640us 23.79% 24.640us 2.053us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.967us 15.41% 15.967us 1.331us 12
+ aten::clone 1.47% 20.971us 36.37% 518.086us 86.348us 0.000us 0.00% 9.088us 1.515us 6
+ aten::sub 2.48% 35.340us 4.16% 59.262us 9.877us 8.000us 7.72% 8.000us 1.333us 6
+ aten::add 2.24% 31.871us 3.82% 54.371us 9.062us 7.967us 7.69% 7.967us 1.328us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 7.51% 7.776us 1.296us 6
+ Activity Buffer Request 13.80% 196.526us 13.80% 196.526us 196.526us 1.312us 1.27% 1.312us 1.312us 1
+ aten::empty_strided 2.11% 29.991us 2.11% 29.991us 4.999us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 16.46% 234.477us 16.46% 234.477us 39.079us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.98% 70.892us 6.48% 92.342us 3.848us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.51% 21.450us 1.51% 21.450us 0.894us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 16.06% 228.698us 16.06% 228.698us 4.765us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.40% 5.670us 0.40% 5.670us 5.670us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.424ms
+Self CUDA time total: 103.581us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 967.032us 780.87% 967.032us 967.032us 1
+ torch_eager 20.22% 292.458us 99.59% 1.441ms 1.441ms 0.000us 0.00% 125.633us 125.633us 1
+ aten::mul 10.50% 151.918us 18.64% 269.672us 11.236us 65.056us 52.53% 65.056us 2.711us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.056us 52.53% 65.056us 2.711us 24
+ aten::copy_ 8.04% 116.307us 42.64% 616.762us 34.265us 39.457us 31.86% 41.250us 2.292us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.928us 23.36% 28.928us 2.411us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.327us 15.61% 19.327us 1.611us 12
+ aten::clone 1.53% 22.111us 35.99% 520.557us 86.759us 0.000us 0.00% 12.322us 2.054us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.529us 8.50% 10.529us 1.755us 6
+ aten::add 2.26% 32.642us 3.84% 55.612us 9.269us 9.696us 7.83% 9.696us 1.616us 6
+ aten::sub 2.60% 37.653us 4.38% 63.343us 10.557us 9.631us 7.78% 9.631us 1.605us 6
+ Activity Buffer Request 13.64% 197.336us 13.64% 197.336us 197.336us 1.793us 1.45% 1.793us 1.793us 1
+ aten::empty_strided 2.06% 29.750us 2.06% 29.750us 4.958us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 16.31% 235.989us 16.31% 235.989us 39.331us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.84% 69.978us 6.29% 90.981us 3.791us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.45% 21.003us 1.45% 21.003us 0.875us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 16.14% 233.544us 16.14% 233.544us 4.866us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.41% 5.890us 0.41% 5.890us 5.890us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.447ms
+Self CUDA time total: 123.840us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 959.509us 542.22% 959.509us 959.509us 1
+ torch_eager 11.19% 299.303us 99.80% 2.668ms 2.668ms 0.000us 0.00% 179.839us 179.839us 1
+ aten::mul 5.70% 152.426us 10.09% 269.786us 11.241us 94.591us 53.45% 94.591us 3.941us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.591us 53.45% 94.591us 3.941us 24
+ aten::copy_ 4.03% 107.815us 68.77% 1.839ms 102.157us 57.793us 32.66% 60.672us 3.371us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.674us 22.98% 40.674us 3.389us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.576us 13.89% 24.576us 2.048us 12
+ aten::clone 0.89% 23.682us 65.69% 1.756ms 292.710us 0.000us 0.00% 19.998us 3.333us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.119us 9.67% 17.119us 2.853us 6
+ aten::add 1.15% 30.841us 2.01% 53.673us 8.945us 12.288us 6.94% 12.288us 2.048us 6
+ aten::sub 1.30% 34.671us 2.23% 59.581us 9.930us 12.288us 6.94% 12.288us 2.048us 6
+ Activity Buffer Request 53.49% 1.430ms 53.49% 1.430ms 1.430ms 2.879us 1.63% 2.879us 2.879us 1
+ aten::empty_strided 1.18% 31.429us 1.18% 31.429us 5.238us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.73% 233.388us 8.73% 233.388us 38.898us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.65% 70.737us 3.45% 92.144us 3.839us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.80% 21.407us 0.80% 21.407us 0.892us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.70% 232.603us 8.70% 232.603us 4.846us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.20% 5.340us 0.20% 5.340us 5.340us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.674ms
+Self CUDA time total: 176.960us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 971.862us 328.16% 971.862us 971.862us 1
+ torch_eager 12.15% 329.142us 99.82% 2.705ms 2.705ms 0.000us 0.00% 313.305us 313.305us 1
+ aten::mul 5.49% 148.746us 9.75% 264.179us 11.007us 144.477us 48.78% 144.477us 6.020us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 144.477us 48.78% 144.477us 6.020us 24
+ aten::copy_ 3.89% 105.362us 68.19% 1.848ms 102.658us 110.590us 37.34% 127.741us 7.097us 18
+ aten::clone 1.05% 28.441us 65.26% 1.769ms 294.758us 0.000us 0.00% 70.398us 11.733us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.343us 19.36% 57.343us 4.779us 12
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.247us 17.98% 53.247us 8.874us 6
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.087us 13.87% 41.087us 3.424us 12
+ aten::sub 1.30% 35.153us 2.18% 59.153us 9.859us 20.672us 6.98% 20.672us 3.445us 6
+ aten::add 1.16% 31.441us 1.98% 53.651us 8.942us 20.415us 6.89% 20.415us 3.402us 6
+ Activity Buffer Request 53.08% 1.438ms 53.08% 1.438ms 1.438ms 17.151us 5.79% 17.151us 17.151us 1
+ aten::empty_strided 1.17% 31.740us 1.17% 31.740us 5.290us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 8.63% 233.787us 8.63% 233.787us 38.964us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 2.59% 70.073us 3.35% 90.793us 3.783us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.76% 20.720us 0.76% 20.720us 0.863us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 8.56% 231.958us 8.56% 231.958us 4.832us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.18% 5.010us 0.18% 5.010us 5.010us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.710ms
+Self CUDA time total: 296.154us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 945.200us 534.23% 945.200us 945.200us 1
+ torch_eager 20.38% 296.401us 99.59% 1.448ms 1.448ms 0.000us 0.00% 179.808us 179.808us 1
+ aten::mul 10.46% 152.181us 18.12% 263.514us 10.980us 94.525us 53.43% 94.525us 3.939us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.525us 53.43% 94.525us 3.939us 24
+ aten::copy_ 7.17% 104.245us 43.33% 630.244us 35.014us 57.666us 32.59% 60.547us 3.364us 18
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.673us 22.99% 40.673us 3.389us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.736us 13.98% 24.736us 2.061us 12
+ aten::clone 1.49% 21.691us 37.84% 550.400us 91.733us 0.000us 0.00% 19.874us 3.312us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.993us 9.60% 16.993us 2.832us 6
+ aten::sub 2.47% 35.942us 4.11% 59.792us 9.965us 12.384us 7.00% 12.384us 2.064us 6
+ aten::add 2.09% 30.411us 3.88% 56.491us 9.415us 12.352us 6.98% 12.352us 2.059us 6
+ Activity Buffer Request 15.10% 219.557us 15.10% 219.557us 219.557us 2.881us 1.63% 2.881us 2.881us 1
+ aten::empty_strided 2.03% 29.530us 2.03% 29.530us 4.922us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 16.76% 243.759us 16.76% 243.759us 40.627us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.81% 69.906us 6.24% 90.817us 3.784us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.44% 20.911us 1.44% 20.911us 0.871us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.40% 223.946us 15.40% 223.946us 4.666us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.41% 5.961us 0.41% 5.961us 5.961us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.454ms
+Self CUDA time total: 176.927us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 966.807us 326.27% 966.807us 966.807us 1
+ torch_eager 21.10% 301.699us 99.64% 1.425ms 1.425ms 0.000us 0.00% 314.141us 314.141us 1
+ aten::mul 10.74% 153.603us 18.95% 270.927us 11.289us 144.864us 48.89% 144.864us 6.036us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 144.864us 48.89% 144.864us 6.036us 24
+ aten::copy_ 7.47% 106.842us 41.64% 595.420us 33.079us 110.942us 37.44% 128.766us 7.154us 18
+ aten::clone 1.49% 21.294us 35.47% 507.209us 84.535us 0.000us 0.00% 71.614us 11.936us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.152us 19.29% 57.152us 4.763us 12
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.790us 18.15% 53.790us 8.965us 6
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.511us 13.67% 40.511us 3.376us 12
+ aten::sub 2.47% 35.333us 4.25% 60.804us 10.134us 20.288us 6.85% 20.288us 3.381us 6
+ aten::add 2.13% 30.471us 3.66% 52.363us 8.727us 20.223us 6.82% 20.223us 3.371us 6
+ Activity Buffer Request 13.62% 194.727us 13.62% 194.727us 194.727us 17.824us 6.02% 17.824us 17.824us 1
+ aten::empty_strided 2.14% 30.600us 2.14% 30.600us 5.100us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.72% 224.758us 15.72% 224.758us 37.460us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 4.94% 70.633us 6.40% 91.582us 3.816us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.47% 20.949us 1.47% 20.949us 0.873us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 16.35% 233.780us 16.35% 233.780us 4.870us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.36% 5.210us 0.36% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.430ms
+Self CUDA time total: 296.317us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 954.667us 163.51% 954.667us 954.667us 1
+ torch_eager 21.08% 298.350us 99.62% 1.410ms 1.410ms 0.000us 0.00% 607.510us 607.510us 1
+ aten::copy_ 7.23% 102.385us 41.44% 586.482us 32.582us 268.667us 46.02% 292.315us 16.240us 18
+ aten::mul 10.73% 151.847us 18.95% 268.240us 11.177us 249.820us 42.79% 249.820us 10.409us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 249.820us 42.79% 249.820us 10.409us 24
+ aten::clone 1.47% 20.758us 35.72% 505.547us 84.258us 0.000us 0.00% 201.757us 33.626us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 178.109us 30.51% 178.109us 29.685us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.558us 15.51% 90.558us 7.547us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.375us 11.20% 65.375us 5.448us 12
+ aten::sub 2.55% 36.094us 4.25% 60.153us 10.026us 32.800us 5.62% 32.800us 5.467us 6
+ aten::add 2.18% 30.790us 3.76% 53.162us 8.860us 32.575us 5.58% 32.575us 5.429us 6
+ Activity Buffer Request 14.07% 199.186us 14.07% 199.186us 199.186us 23.648us 4.05% 23.648us 23.648us 1
+ aten::empty_strided 2.17% 30.642us 2.17% 30.642us 5.107us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 15.65% 221.418us 15.65% 221.418us 36.903us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 5.01% 70.953us 6.50% 91.982us 3.833us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 1.49% 21.029us 1.49% 21.029us 0.876us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 15.99% 226.317us 15.99% 226.317us 4.715us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 0.38% 5.410us 0.38% 5.410us 5.410us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.415ms
+Self CUDA time total: 583.862us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 13.40% 294.433us 64.71% 1.422ms 1.422ms 0.000us 0.00% 1.833ms 1.833ms 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.806ms 102.20% 1.806ms 1.806ms 1
+ aten::copy_ 4.78% 104.916us 25.99% 570.899us 31.717us 790.968us 44.76% 856.920us 47.607us 18
+ aten::mul 7.09% 155.744us 13.34% 293.130us 12.214us 828.278us 46.87% 828.278us 34.512us 24
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 828.278us 46.87% 828.278us 34.512us 24
+ aten::clone 0.98% 21.583us 22.19% 487.616us 81.269us 0.000us 0.00% 623.577us 103.929us 6
+ Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 557.625us 31.56% 557.625us 92.937us 6
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 233.343us 13.20% 233.343us 19.445us 12
+void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 147.904us 8.37% 147.904us 12.325us 12
+ aten::sub 1.71% 37.532us 2.94% 64.522us 10.754us 89.216us 5.05% 89.216us 14.869us 6
+ Activity Buffer Request 8.13% 178.646us 8.13% 178.646us 178.646us 65.952us 3.73% 65.952us 65.952us 1
+ aten::add 1.39% 30.430us 2.44% 53.591us 8.932us 58.688us 3.32% 58.688us 9.781us 6
+ aten::empty_strided 1.37% 30.060us 1.37% 30.060us 5.010us 0.000us 0.00% 0.000us 0.000us 6
+ cudaMemcpyAsync 10.15% 222.926us 10.15% 222.926us 37.154us 0.000us 0.00% 0.000us 0.000us 6
+ aten::slice 3.32% 73.001us 4.25% 93.471us 3.895us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.93% 20.470us 0.93% 20.470us 0.853us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 11.47% 251.948us 11.47% 251.948us 5.249us 0.000us 0.00% 0.000us 0.000us 48
+ cudaDeviceSynchronize 35.29% 775.316us 35.29% 775.316us 775.316us 0.000us 0.00% 0.000us 0.000us 1
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.197ms
+Self CUDA time total: 1.767ms
+
+
+impl wl p50(ms) ok
+torch_eager cuda_B1_S128_H32_D128_R64 0.23 True
+torch_eager cuda_B1_S128_H32_D64_R32 0.23 True
+torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
+torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
+torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True
+torch_eager cuda_B1_S2048_H32_D64_R32 0.23 True
+torch_eager cuda_B1_S2048_H8_D128_R64 0.23 True
+torch_eager cuda_B1_S2048_H8_D64_R32 0.23 True
+torch_eager cuda_B1_S512_H32_D128_R64 0.23 True
+torch_eager cuda_B1_S512_H32_D64_R32 0.23 True
+torch_eager cuda_B1_S512_H8_D128_R64 0.23 True
+torch_eager cuda_B1_S512_H8_D64_R32 0.23 True
+torch_eager cuda_B2_S128_H32_D128_R64 0.23 True
+torch_eager cuda_B2_S128_H32_D64_R32 0.23 True
+torch_eager cuda_B2_S128_H8_D128_R64 0.23 True
+torch_eager cuda_B2_S128_H8_D64_R32 0.23 True
+torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True
+torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
+torch_eager cuda_B2_S2048_H8_D128_R64 0.23 True
+torch_eager cuda_B2_S2048_H8_D64_R32 0.23 True
+torch_eager cuda_B2_S512_H32_D128_R64 0.23 True
+torch_eager cuda_B2_S512_H32_D64_R32 0.23 True
+torch_eager cuda_B2_S512_H8_D128_R64 0.23 True
+torch_eager cuda_B2_S512_H8_D64_R32 0.23 True
+
+
+
+
+Installed 37 packages in 219ms
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/rotary/index.html b/rotary/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..5ff503336b04c290f15ed24958b96a45568efad3
--- /dev/null
+++ b/rotary/index.html
@@ -0,0 +1,3879 @@
+
+
+
+
+
+ index
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
Rotary Position Embeddings Benchmarks
+
This directory contains benchmarks for Rotary Position Embeddings (RoPE) implementations.
+
Implementations
+
+
Results
+
+
+
+
+
\ No newline at end of file
diff --git a/rotary/results/artifacts/combine/latency.svg b/rotary/results/artifacts/combine/latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..fbdea134ec02a1374a095572b0b66bc987fe1081
--- /dev/null
+++ b/rotary/results/artifacts/combine/latency.svg
@@ -0,0 +1,489 @@
+
+
\ No newline at end of file
diff --git a/rotary/results/cells/combine.py b/rotary/results/cells/combine.py
new file mode 100644
index 0000000000000000000000000000000000000000..92c70bcedd0c600f59230f141fc59b2158a3df4d
--- /dev/null
+++ b/rotary/results/cells/combine.py
@@ -0,0 +1,26 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+ "HF Kernels Rotary": "UVNOTE_FILE_HF_KERNELS_ROTARY_BENCHMARK",
+ "PyTorch Rotary": "UVNOTE_FILE_TORCH_ROTARY_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+ cache_env_map=cache_env_map,
+ output_filename="rotary.jsonl",
+ svg_filename="latency.svg"
+)
\ No newline at end of file
diff --git a/rotary/results/combined_results.html b/rotary/results/combined_results.html
new file mode 100644
index 0000000000000000000000000000000000000000..488032a1b5a5ed4e0a06a61b22547b7136e3e053
--- /dev/null
+++ b/rotary/results/combined_results.html
@@ -0,0 +1,5024 @@
+
+
+
+
+
+ Rotary Position Embeddings Benchmark - Combined Results
+
+
+
+
+
+
+
+
+
+
+
+
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
+
+
+
+
+
Rotary Position Embeddings Benchmarks - Aggregated Results
+
This document combines benchmark results from multiple Rotary Position Embeddings implementations.
+
Combined Summary and Visualization
+
+
+
+
+
+
+
+
+
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+ "HF Kernels Rotary": "UVNOTE_FILE_HF_KERNELS_ROTARY_BENCHMARK",
+ "PyTorch Rotary": "UVNOTE_FILE_TORCH_ROTARY_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+ cache_env_map=cache_env_map,
+ output_filename="rotary.jsonl",
+ svg_filename="latency.svg"
+)
+
+
+
+
+
+
+
+
======================================================================
+LOADING BENCHMARK DATA
+======================================================================
+✓ HF Kernels Rotary : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/49ec9501b131c967277abe3cccb638422565260339bb30f5ea386b0076f2183e
+✓ PyTorch Rotary : /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5
+
+ ✓ Found HF Kernels Rotary
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/49ec9501b131c967277abe3cccb638422565260339bb30f5ea386b0076f2183e/rotary.jsonl
+ ✓ Found PyTorch Rotary
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/rotary/impls/.uvnote/cache/abf801d6445dfa81a8dd7b2e6257930c39c18160a9b97a739858c3b244e16cc5/rotary.jsonl
+
+======================================================================
+Summary: 2 found, 0 skipped, 0 missing
+======================================================================
+
+COMBINED BENCHMARK SUMMARY
+
+impl wl p50(ms) ok
+hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False
+hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.10 False
+hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False
+hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False
+hf_kernels_rotary cuda_B1_S2048_H32_D128_R64 0.10 False
+hf_kernels_rotary cuda_B1_S2048_H32_D64_R32 0.09 False
+hf_kernels_rotary cuda_B1_S2048_H8_D128_R64 0.09 False
+hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 False
+hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 False
+hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 False
+hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09 False
+hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.09 False
+hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09 False
+hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 False
+hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 False
+hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 False
+hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.28 False
+hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.10 False
+hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 False
+hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 False
+hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 False
+hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.10 False
+hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 False
+hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False
+torch_eager cuda_B1_S128_H32_D128_R64 0.23 True
+torch_eager cuda_B1_S128_H32_D64_R32 0.23 True
+torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
+torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
+torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True
+torch_eager cuda_B1_S2048_H32_D64_R32 0.23 True
+torch_eager cuda_B1_S2048_H8_D128_R64 0.23 True
+torch_eager cuda_B1_S2048_H8_D64_R32 0.23 True
+torch_eager cuda_B1_S512_H32_D128_R64 0.23 True
+torch_eager cuda_B1_S512_H32_D64_R32 0.23 True
+torch_eager cuda_B1_S512_H8_D128_R64 0.23 True
+torch_eager cuda_B1_S512_H8_D64_R32 0.23 True
+torch_eager cuda_B2_S128_H32_D128_R64 0.23 True
+torch_eager cuda_B2_S128_H32_D64_R32 0.23 True
+torch_eager cuda_B2_S128_H8_D128_R64 0.23 True
+torch_eager cuda_B2_S128_H8_D64_R32 0.23 True
+torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True
+torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
+torch_eager cuda_B2_S2048_H8_D128_R64 0.23 True
+torch_eager cuda_B2_S2048_H8_D64_R32 0.23 True
+torch_eager cuda_B2_S512_H32_D128_R64 0.23 True
+torch_eager cuda_B2_S512_H32_D64_R32 0.23 True
+torch_eager cuda_B2_S512_H8_D128_R64 0.23 True
+torch_eager cuda_B2_S512_H8_D64_R32 0.23 True
+
+GENERATING COMBINED VISUALIZATION
+
+Loaded 48 records
+✓ Visualization saved as latency.svg
+Saved latency.png
+✓ Visualization saved as latency.svg
+✓ SVG visualization ready!
+
+ANALYSIS COMPLETE
+Total implementations analyzed: 2
+
+Implementations included:
+ ✓ HF Kernels Rotary
+ ✓ PyTorch Rotary
+
+
+
+
+Installed 37 packages in 224ms
+
+
+
+
Artifacts:
+
latency.svg
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/rotary/results/index.html b/rotary/results/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..329ab80bbf228ddf5f35f9df4da9dcb65d51731e
--- /dev/null
+++ b/rotary/results/index.html
@@ -0,0 +1,88 @@
+
+
+
+
+
+ Index of /rotary/results
+
+
+
+
+ Index of /rotary/results
+
+
+
\ No newline at end of file