diff --git a/activation/impls/cells/benchmark.py b/activation/impls/cells/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..dd0e4c48cbb96dbaa4f9070fe2fb723816d7c86b --- /dev/null +++ b/activation/impls/cells/benchmark.py @@ -0,0 +1,64 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch", +# "kernels-benchmark-tools", +# "kernels", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true } +# /// +import torch +import sys +import kernels_benchmark_tools as kbt +from kernels import get_kernel + +# Load the activation kernel +activation = get_kernel("kernels-community/activation") + + +def hf_kernels_swiglu(input_tensor): + """HuggingFace Kernels SwiGLU implementation""" + hidden_dim = input_tensor.shape[-1] // 2 + out_shape = input_tensor.shape[:-1] + (hidden_dim,) + out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device) + return activation.silu_and_mul(out, input_tensor) + + +# Register the implementation +kbt.add( + "hf_kernels_swiglu", + hf_kernels_swiglu, + tags={"family": "hf-kernels", "backend": "triton", "compile": "none"}, +) + +if __name__ == "__main__": + device = "cuda" if torch.cuda.is_available() else "cpu" + + if device == "cpu": + print("HF Kernels SwiGLU requires CUDA - skipping benchmark") + sys.exit(0) + + dtype = "bfloat16" + + # Generate workloads - using a subset for faster testing + wl = list(kbt.activation.llama_workloads(dtype=dtype))[:3] # First 3 workloads + + print(f"Running SwiGLU benchmarks on {device} with {dtype}") + print(f"Testing {len(wl)} workloads") + + # Run benchmark + kbt.run( + wl, + jsonl="activation.jsonl", + reps=5, + warmup=2, + gen=kbt.activation.gen_inputs, + ref=kbt.activation.ref_swiglu, + cmp=kbt.activation.cmp_allclose, + profile_trace=True + ) + + kbt.summarize(["activation.jsonl"]) \ No newline at end of file diff --git a/activation/impls/cells/nv.py b/activation/impls/cells/nv.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5 --- /dev/null +++ b/activation/impls/cells/nv.py @@ -0,0 +1,2 @@ +import subprocess +print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout) \ No newline at end of file diff --git a/activation/impls/compiled_swiglu.html b/activation/impls/compiled_swiglu.html new file mode 100644 index 0000000000000000000000000000000000000000..b4016b939dcf1bed676b6907278e8943c3cfea09 --- /dev/null +++ b/activation/impls/compiled_swiglu.html @@ -0,0 +1,3979 @@ + + + + + + compiled_swiglu + + + + + + + +
+
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

Compiled SwiGLU Activation

+

GPU Info

+
+
+Cell: nv | 0.25s
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Wed Oct 22 08:58:23 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | +| N/A 26C P8 22W / 350W | 0MiB / 46068MiB | 0% Default | +| | | N/A | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| No running processes found | ++-----------------------------------------------------------------------------------------+ + +
+
+
+ +

SwiGLU Benchmark (torch.compile)

+
+
+Cell: benchmark | 0.05s | FAILED
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
+# ///
+import torch
+import sys
+import kernels_benchmark_tools as kbt
+
+
+def torch_swiglu_base(input_tensor):
+    """Base PyTorch SwiGLU implementation"""
+    d = input_tensor.shape[-1] // 2
+    x1 = input_tensor[..., :d]
+    x2 = input_tensor[..., d:]
+    return torch.nn.functional.silu(x1) * x2
+
+
+# Compile the function
+compiled_swiglu = torch.compile(torch_swiglu_base, mode="max-autotune", fullgraph=True, dynamic=False)
+
+
+# Register the implementation
+kbt.add(
+    "compiled_swiglu_max_autotune",
+    compiled_swiglu,
+    tags={"family": "torch", "backend": "compiled", "compile": "max-autotune"},
+)
+
+if __name__ == "__main__":
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = "float32" if device == "cpu" else "bfloat16"
+
+    # Generate workloads - using a subset for faster testing
+    if device == "cuda":
+        wl = list(kbt.activation.llama_workloads(dtype=dtype))[:3]
+    else:
+        wl = list(kbt.activation.cpu_workloads(dtype=dtype))[:3]
+
+    print(f"Running SwiGLU benchmarks on {device} with {dtype}")
+    print(f"Testing {len(wl)} workloads")
+
+    # Run benchmark
+    kbt.run(
+        wl,
+        jsonl="activation.jsonl",
+        reps=5,
+        warmup=2,
+        gen=kbt.activation.gen_inputs,
+        ref=kbt.activation.ref_swiglu,
+        cmp=kbt.activation.cmp_allclose,
+        profile_trace=True
+    )
+
+    kbt.summarize(["activation.jsonl"])
+
+ +
+
+
+
+
× Failed to resolve script requirement + ╰─▶ Distribution not found at: + file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools +
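Note on the failure: every FAILED cell in this run stops at the same uv resolution step: the script header pins kernels-benchmark-tools to a local editable checkout (/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools) that does not exist on the machine that rendered these reports. A minimal sketch of a header that resolves without that checkout, assuming the git source used by earlier revisions of the flash_attn cells in this same diff is still valid:

# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch",
#     "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
# ///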
+
+
+
+ + + \ No newline at end of file diff --git a/activation/impls/hf_kernels_swiglu.html b/activation/impls/hf_kernels_swiglu.html new file mode 100644 index 0000000000000000000000000000000000000000..d6e3ddf4983356e5f664411d363211346fd04999 --- /dev/null +++ b/activation/impls/hf_kernels_swiglu.html @@ -0,0 +1,3982 @@ + + + + + + hf_kernels_swiglu + + + + + + + +
+
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

HF Kernels - SwiGLU Activation

+

GPU Info

+
+
+Cell: nv | 0.25s
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Wed Oct 22 08:58:23 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | +| N/A 26C P8 22W / 350W | 0MiB / 46068MiB | 0% Default | +| | | N/A | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| No running processes found | ++-----------------------------------------------------------------------------------------+ + +
+
+
+ +

SwiGLU Benchmark

+
+
+Cell: benchmark | 0.01s | FAILED
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch",
+#     "kernels-benchmark-tools",
+#     "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
+# ///
+import torch
+import sys
+import kernels_benchmark_tools as kbt
+from kernels import get_kernel
+
+# Load the activation kernel
+activation = get_kernel("kernels-community/activation")
+
+
+def hf_kernels_swiglu(input_tensor):
+    """HuggingFace Kernels SwiGLU implementation"""
+    hidden_dim = input_tensor.shape[-1] // 2
+    out_shape = input_tensor.shape[:-1] + (hidden_dim,)
+    out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
+    return activation.silu_and_mul(out, input_tensor)
+
+
+# Register the implementation
+kbt.add(
+    "hf_kernels_swiglu",
+    hf_kernels_swiglu,
+    tags={"family": "hf-kernels", "backend": "triton", "compile": "none"},
+)
+
+if __name__ == "__main__":
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    if device == "cpu":
+        print("HF Kernels SwiGLU requires CUDA - skipping benchmark")
+        sys.exit(0)
+
+    dtype = "bfloat16"
+
+    # Generate workloads - using a subset for faster testing
+    wl = list(kbt.activation.llama_workloads(dtype=dtype))[:3]  # First 3 workloads
+
+    print(f"Running SwiGLU benchmarks on {device} with {dtype}")
+    print(f"Testing {len(wl)} workloads")
+
+    # Run benchmark
+    kbt.run(
+        wl,
+        jsonl="activation.jsonl",
+        reps=5,
+        warmup=2,
+        gen=kbt.activation.gen_inputs,
+        ref=kbt.activation.ref_swiglu,
+        cmp=kbt.activation.cmp_allclose,
+        profile_trace=True
+    )
+
+    kbt.summarize(["activation.jsonl"])
+
+ +
+
+
+
+
× Failed to resolve script requirement + ╰─▶ Distribution not found at: + file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools +
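Beyond the dependency failure, the cell never reaches the kernel call itself. A hedged spot check of the silu_and_mul call pattern used above, assuming CUDA, hub access, and the kernels package are available (the bf16 tolerance is a guess):

import torch
import torch.nn.functional as F
from kernels import get_kernel

if torch.cuda.is_available():
    activation = get_kernel("kernels-community/activation")
    x = torch.randn(1, 16, 2 * 128, device="cuda", dtype=torch.bfloat16)
    d = x.shape[-1] // 2
    out = torch.empty(x.shape[:-1] + (d,), dtype=x.dtype, device=x.device)
    got = activation.silu_and_mul(out, x)      # kernel writes into the preallocated buffer
    ref = F.silu(x[..., :d]) * x[..., d:]      # eager SwiGLU reference
    torch.testing.assert_close(got, ref, rtol=2e-2, atol=2e-2)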
+
+
+
+ + + \ No newline at end of file diff --git a/activation/impls/index.html b/activation/impls/index.html new file mode 100644 index 0000000000000000000000000000000000000000..d355ca2e6f8c88b50aaa0c1e632e89d4e30291de --- /dev/null +++ b/activation/impls/index.html @@ -0,0 +1,90 @@ + + + + + + Index of /activation/impls + + + +
+

Index of /activation/impls

+ + + \ No newline at end of file diff --git a/activation/impls/torch_swiglu.html b/activation/impls/torch_swiglu.html new file mode 100644 index 0000000000000000000000000000000000000000..574bc4e8aa99fb749507209b1030b8e491afe8d8 --- /dev/null +++ b/activation/impls/torch_swiglu.html @@ -0,0 +1,3978 @@ + + + + + + torch_swiglu + + + + + + + +
+
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

PyTorch Native - SwiGLU Activation

+

GPU Info

+
+
+Cell: nv | 0.25s
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Wed Oct 22 08:58:23 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | +| N/A 26C P8 22W / 350W | 0MiB / 46068MiB | 0% Default | +| | | N/A | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| No running processes found | ++-----------------------------------------------------------------------------------------+ + +
+
+
+ +

SwiGLU Benchmark (PyTorch Native)

+
+
+Cell: benchmark | 0.02s | FAILED
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
+# ///
+import torch
+import sys
+import kernels_benchmark_tools as kbt
+
+
+def torch_swiglu(input_tensor):
+    """PyTorch native SwiGLU implementation"""
+    # Split input into two halves
+    d = input_tensor.shape[-1] // 2
+    x1 = input_tensor[..., :d]   # First half
+    x2 = input_tensor[..., d:]   # Second half
+
+    # SwiGLU: silu(x1) * x2
+    return torch.nn.functional.silu(x1) * x2
+
+
+# Register the implementation
+kbt.add(
+    "torch_swiglu",
+    torch_swiglu,
+    tags={"family": "torch", "backend": "native", "compile": "none"},
+)
+
+if __name__ == "__main__":
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = "float32" if device == "cpu" else "bfloat16"
+
+    # Generate workloads - using a subset for faster testing
+    if device == "cuda":
+        wl = list(kbt.activation.llama_workloads(dtype=dtype))[:3]
+    else:
+        wl = list(kbt.activation.cpu_workloads(dtype=dtype))[:3]
+
+    print(f"Running SwiGLU benchmarks on {device} with {dtype}")
+    print(f"Testing {len(wl)} workloads")
+
+    # Run benchmark
+    kbt.run(
+        wl,
+        jsonl="activation.jsonl",
+        reps=5,
+        warmup=2,
+        gen=kbt.activation.gen_inputs,
+        ref=kbt.activation.ref_swiglu,
+        cmp=kbt.activation.cmp_allclose,
+        profile_trace=True
+    )
+
+    kbt.summarize(["activation.jsonl"])
+
+ +
+
+
+
+
× Failed to resolve script requirement + ╰─▶ Distribution not found at: + file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools +
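The slicing formulation above is the same SwiGLU math the compiled and HF-kernel cells target. A minimal CPU-only sanity check of the shape contract and of its equivalence to the chunked form (shapes below are illustrative):

import torch
import torch.nn.functional as F

def swiglu_ref(x: torch.Tensor) -> torch.Tensor:
    x1, x2 = x.chunk(2, dim=-1)             # split [..., 2D] into two [..., D] halves
    return F.silu(x1) * x2

x = torch.randn(1, 16, 2 * 64)              # small illustrative workload
y = swiglu_ref(x)
assert y.shape == (1, 16, 64)               # output keeps half of the last dimension

d = x.shape[-1] // 2
y_slice = F.silu(x[..., :d]) * x[..., d:]   # slicing form used in the cell above
torch.testing.assert_close(y, y_slice)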
+
+
+
+ + + \ No newline at end of file diff --git a/activation/index.html b/activation/index.html new file mode 100644 index 0000000000000000000000000000000000000000..ddb801226b9a0aa8d81788bef013e946eb8554ed --- /dev/null +++ b/activation/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /activation + + + +
+

Index of /activation

+ + + \ No newline at end of file diff --git a/activation/results/index.html b/activation/results/index.html new file mode 100644 index 0000000000000000000000000000000000000000..5c60fe94ab1a86a4d9f299448a7d8a5b85027447 --- /dev/null +++ b/activation/results/index.html @@ -0,0 +1,88 @@ + + + + + + Index of /activation/results + + + +
+

Index of /activation/results

+ + + \ No newline at end of file diff --git a/flash_attn/impls/cells/benchmark.py b/flash_attn/impls/cells/benchmark.py index c558501ad2f4abcbaec8236272134bb8e8b0cfc4..461f666ef33998017b756244a540c25d4793e29a 100644 --- a/flash_attn/impls/cells/benchmark.py +++ b/flash_attn/impls/cells/benchmark.py @@ -4,46 +4,42 @@ # "numpy", # "torch", # "kernels-benchmark-tools", -# "kernels", +# "xformers", # ] # # [tool.uv.sources] -# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" } +# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true } # /// import torch import sys import os import kernels_benchmark_tools as kbt -from kernels import get_kernel +import xformers.ops as xops -hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn", revision="v0.0.2") - -def hf_flash_attention(query, key, value): - """HuggingFace Kernels Flash Attention""" - return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0] +def xformers_attention(q, k, v): + """xFormers memory efficient attention""" + # xFormers expects [batch, seq_len, heads, head_dim] + return xops.memory_efficient_attention(q, k, v) kbt.add( - "hf_kernels_flash_attn", - hf_flash_attention, - tags={"family": "hf-kernels", "backend": "flash-attn", "compile": "none"}, + "xformers_meff", + xformers_attention, + tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"}, ) if __name__ == "__main__": device = "cuda" if torch.cuda.is_available() else "cpu" - - if device == "cpu": - print("HF Kernels Flash Attention requires CUDA - skipping benchmark") - sys.exit(0) - - dtype = "bfloat16" + dtype = "float32" if device == "cpu" else "bfloat16" # Flux-like workloads - base = 1024 - flux_sizes = [128, 256, 320, 384, 448, 512] - heads = 24 - head_dim = 128 + base = 1024 if device == "cuda" else 512 + flux_sizes = ( + [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256] + ) + heads = 24 if device == "cuda" else 8 + head_dim = 128 if device == "cuda" else 64 wl = [] for L in flux_sizes: @@ -68,5 +64,6 @@ if __name__ == "__main__": gen=kbt.attn.gen_qkv, ref=kbt.attn.ref_math, cmp=kbt.attn.cmp_allclose, + profile_trace=True ) kbt.summarize(["attn.jsonl"]) \ No newline at end of file diff --git a/flash_attn/impls/cells/benchmark_default.py b/flash_attn/impls/cells/benchmark_default.py index cc2fd06ac69ffe1f5bc88d1821b17447dc90c846..020dca60cc336dc64f2484e50cdb941926408fac 100644 --- a/flash_attn/impls/cells/benchmark_default.py +++ b/flash_attn/impls/cells/benchmark_default.py @@ -7,7 +7,7 @@ # ] # # [tool.uv.sources] -# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" } +# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true } # /// import torch import sys @@ -66,5 +66,6 @@ if __name__ == "__main__": gen=kbt.attn.gen_qkv, ref=kbt.attn.ref_math, cmp=kbt.attn.cmp_allclose, + profile_trace=True ) kbt.summarize(["attn_default.jsonl"]) \ No newline at end of file diff --git a/flash_attn/impls/compiled_variants.html b/flash_attn/impls/compiled_variants.html index 6a8ad1c87fe3de178f48216a9a342d6d09eda581..d6a42971b613c93a42b4998ff389e66cebe43f8c 100644 --- a/flash_attn/impls/compiled_variants.html +++ b/flash_attn/impls/compiled_variants.html @@ -3829,7 +3829,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3837,20 +3837,20 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

Torch Compile Variants!

This file benchmarks Flash Attention with different torch.compile modes.

Flash Attention with torch.compile(mode="default")

-
+
▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark_default | 45.83s +Cell: benchmark_default | 0.02s | FAILED | Raw GitHub
-
+
# /// script
 # requires-python = ">=3.10"
@@ -3861,7 +3861,7 @@ Cell: benchmark_default | 45.83s
 # ]
 #
 # [tool.uv.sources]
-# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
 # ///
 import torch
 import sys
@@ -3920,6 +3920,7 @@ Cell: benchmark_default | 45.83s
         gen=kbt.attn.gen_qkv,
         ref=kbt.attn.ref_math,
         cmp=kbt.attn.cmp_allclose,
+        profile_trace=True
     )
     kbt.summarize(["attn_default.jsonl"])
 
@@ -3928,235 +3929,14 @@ Cell: benchmark_default | 45.83s
-
impl wl p50(ms) ok -torch_flash_compiled_default flux_L128 0.36 True -torch_flash_compiled_default flux_L256 0.50 True -torch_flash_compiled_default flux_L320 0.54 True -torch_flash_compiled_default flux_L384 0.59 True -torch_flash_compiled_default flux_L448 0.61 True -torch_flash_compiled_default flux_L512 0.64 True -
-
-
▶ UV Install Logs
- -
-
-

Artifacts:

-attn_default.jsonl +
× Failed to resolve script requirement + ╰─▶ Distribution not found at: + file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools

Flash Attention with torch.compile(mode="max-autotune")

-
-
- -▼ code -▼ output - ▶ uv-logs - | -Cell: benchmark_max_autotune | 48.72s - | - -Raw -GitHub -
-
-
-
# /// script
-# requires-python = ">=3.10"
-# dependencies = [
-#     "numpy",
-#     "torch",
-#     "kernels-benchmark-tools",
-# ]
-#
-# [tool.uv.sources]
-# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
-# ///
-import torch
-import sys
-import os
-import kernels_benchmark_tools as kbt
-
-
-def torch_flash_base(q, k, v):
-    qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
-    with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
-        o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
-    return o.transpose(1, 2).contiguous()
-
-
-# Compile with max-autotune mode
-compiled_flash_max_autotune = torch.compile(torch_flash_base, mode="max-autotune", fullgraph=True, dynamic=False)
-
-kbt.add(
-    "torch_flash_compiled_max_autotune",
-    compiled_flash_max_autotune,
-    tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
-)
-
-if __name__ == "__main__":
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    dtype = "float32" if device == "cpu" else "bfloat16"
-
-    # Flux-like workloads
-    base = 1024 if device == "cuda" else 512
-    flux_sizes = (
-        [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
-    )
-    heads = 24 if device == "cuda" else 8
-    head_dim = 128 if device == "cuda" else 64
-
-    wl = []
-    for L in flux_sizes:
-        wl.append(
-            {
-                "name": f"flux_L{L}",
-                "batch": 1,
-                "seq_len": base + L,
-                "heads": heads,
-                "head_dim": head_dim,
-                "dtype": dtype,
-                "device": device,
-                "seed": 0,
-            }
-        )
-
-    kbt.run(
-        wl,
-        jsonl="attn_max_autotune.jsonl",
-        reps=5,
-        warmup=2,
-        gen=kbt.attn.gen_qkv,
-        ref=kbt.attn.ref_math,
-        cmp=kbt.attn.cmp_allclose,
-    )
-    kbt.summarize(["attn_max_autotune.jsonl"])
-
- -
-
-
-
-
impl wl p50(ms) ok -torch_flash_compiled_max_autotune flux_L128 0.38 True -torch_flash_compiled_max_autotune flux_L256 0.55 True -torch_flash_compiled_max_autotune flux_L320 0.61 True -torch_flash_compiled_max_autotune flux_L384 0.66 True -torch_flash_compiled_max_autotune flux_L448 0.70 True -torch_flash_compiled_max_autotune flux_L512 0.76 True -
-
-
▶ UV Install Logs
- -
-
-

Artifacts:

-attn_max_autotune.jsonl -
-
-
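The surviving default-mode cell above and the max-autotune cell removed in this diff wrap the same SDPA flash-backend function in torch.compile; only the mode string differs. A hedged standalone sketch of that wrapper, checking that the compiled graph stays numerically close to eager (assumes a CUDA device; shapes mirror the flux_L128 workload):

import torch
import torch.nn.functional as F
from torch.nn.attention import sdpa_kernel, SDPBackend

def torch_flash_base(q, k, v):
    # benchmark layout is [batch, seq, heads, head_dim]; SDPA expects [batch, heads, seq, head_dim]
    qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
    with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
        o = F.scaled_dot_product_attention(qt, kt, vt)
    return o.transpose(1, 2).contiguous()

compiled = torch.compile(torch_flash_base, mode="default", fullgraph=True, dynamic=False)

if torch.cuda.is_available():
    q = torch.randn(1, 1024 + 128, 24, 128, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)
    torch.testing.assert_close(compiled(q, k, v), torch_flash_base(q, k, v), rtol=2e-2, atol=2e-2)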
diff --git a/flash_attn/impls/flash_attention.html b/flash_attn/impls/flash_attention.html index afbfa0ff38a4b327c16742ee1fb6af147876deb1..a0770a6914dc81a46eeae62c6660b4b5e3cd5790 100644 --- a/flash_attn/impls/flash_attention.html +++ b/flash_attn/impls/flash_attention.html @@ -3829,7 +3829,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
@@ -3843,7 +3843,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 4.06s +Cell: nv | 0.23s | Raw @@ -3860,34 +3860,22 @@ Cell: nv | 4.06s
-
Thu Oct 2 16:12:42 2025 +
Wed Oct 22 08:58:24 2025 +-----------------------------------------------------------------------------------------+ -| NVIDIA-SMI 560.35.05 Driver Version: 560.35.05 CUDA Version: 12.6 | +| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | |-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| -| 0 NVIDIA L4 Off | 00000000:38:00.0 Off | 0 | -| N/A 41C P0 27W / 72W | 1MiB / 23034MiB | 0% Default | -| | | N/A | -+-----------------------------------------+------------------------+----------------------+ -| 1 NVIDIA L4 Off | 00000000:3A:00.0 Off | 0 | -| N/A 41C P0 27W / 72W | 1MiB / 23034MiB | 2% Default | -| | | N/A | -+-----------------------------------------+------------------------+----------------------+ -| 2 NVIDIA L4 Off | 00000000:3C:00.0 Off | 0 | -| N/A 44C P0 29W / 72W | 1MiB / 23034MiB | 2% Default | -| | | N/A | -+-----------------------------------------+------------------------+----------------------+ -| 3 NVIDIA L4 Off | 00000000:3E:00.0 Off | 0 | -| N/A 42C P0 29W / 72W | 1MiB / 23034MiB | 2% Default | +| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | +| N/A 26C P8 22W / 350W | 0MiB / 46068MiB | 0% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ +-----------------------------------------------------------------------------------------+ | Processes: | -| GPU GI CI PID Type Process name GPU Memory | +| GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=========================================================================================| | No running processes found | @@ -3898,20 +3886,20 @@ Cell: nv | 4.06s

Flash Attention Benchmark

-
+
▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 38.14s +Cell: benchmark | 0.01s | FAILED | Raw GitHub
-
+
# /// script
 # requires-python = ">=3.10"
@@ -3922,7 +3910,7 @@ Cell: benchmark | 38.14s
 # ]
 #
 # [tool.uv.sources]
-# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
 # ///
 import torch
 import sys
@@ -3977,6 +3965,7 @@ Cell: benchmark | 38.14s
         gen=kbt.attn.gen_qkv,
         ref=kbt.attn.ref_math,
         cmp=kbt.attn.cmp_allclose,
+        profile_trace=True
     )
     kbt.summarize(["attn.jsonl"])
 
@@ -3985,71 +3974,9 @@ Cell: benchmark | 38.14s
-
impl wl p50(ms) ok -torch_flash_ma flux_L128 0.41 True -torch_flash_ma flux_L256 0.52 True -torch_flash_ma flux_L320 0.55 True -torch_flash_ma flux_L384 0.59 True -torch_flash_ma flux_L448 0.64 True -torch_flash_ma flux_L512 0.68 True -
-
-
▶ UV Install Logs
- -
-
-

Artifacts:

-attn.jsonl +
× Failed to resolve script requirement + ╰─▶ Distribution not found at: + file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
diff --git a/flash_attn/impls/hf_kernels_flash_attn.html b/flash_attn/impls/hf_kernels_flash_attn.html index a73274cdbcce6cc4d5e87b9056f24bf359798266..41d33910252a30c59119c5959d3ae78e6f7774bc 100644 --- a/flash_attn/impls/hf_kernels_flash_attn.html +++ b/flash_attn/impls/hf_kernels_flash_attn.html @@ -3829,40 +3829,40 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35

HF Kernels - Flash Attention

HuggingFace Kernels Flash Attention Benchmark

-
+
▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 40.14s +Cell: benchmark | 0.01s | FAILED | Raw GitHub 🤗 HF
-
+
# /// script
 # requires-python = ">=3.10"
 # dependencies = [
 #     "numpy",
-#     "torch",
+#     "torch==2.8.0",
 #     "kernels-benchmark-tools",
 #     "kernels",
 # ]
 #
 # [tool.uv.sources]
-# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
 # ///
 import torch
 import sys
@@ -3870,7 +3870,7 @@ Cell: benchmark | 40.14s
 import kernels_benchmark_tools as kbt
 from kernels import get_kernel
 
-hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn", revision="v0.0.2")
+hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn")
 
 
 def hf_flash_attention(query, key, value):
@@ -3922,6 +3922,7 @@ Cell: benchmark | 40.14s
         gen=kbt.attn.gen_qkv,
         ref=kbt.attn.ref_math,
         cmp=kbt.attn.cmp_allclose,
+        profile_trace=True
     )
     kbt.summarize(["attn.jsonl"])
 
@@ -3930,77 +3931,9 @@ Cell: benchmark | 40.14s
-
impl wl p50(ms) ok -hf_kernels_flash_attn flux_L128 0.25 True -hf_kernels_flash_attn flux_L256 0.32 True -hf_kernels_flash_attn flux_L320 0.34 True -hf_kernels_flash_attn flux_L384 0.35 True -hf_kernels_flash_attn flux_L448 0.38 True -hf_kernels_flash_attn flux_L512 0.42 True -
-
-
▶ UV Install Logs
- -
-
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s] -Fetching 20 files: 5%|▌ | 1/20 [00:00<00:05, 3.64it/s] -Fetching 20 files: 10%|█ | 2/20 [00:02<00:22, 1.24s/it] -Fetching 20 files: 100%|██████████| 20/20 [00:02<00:00, 9.14it/s]
-
-

Artifacts:

-attn.jsonl +
× Failed to resolve script requirement + ╰─▶ Distribution not found at: + file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
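For context, the call pattern this cell benchmarks: get_kernel pulls the hub build and the attention output is taken as element 0 of the fwd result. A hedged spot check against SDPA, assuming CUDA and hub access, and assuming the benchmark's [batch, seq, heads, head_dim] layout (tolerances are bf16 guesses):

import torch
import torch.nn.functional as F
from kernels import get_kernel

if torch.cuda.is_available():
    flash = get_kernel("kernels-community/flash-attn")
    q = torch.randn(1, 1152, 24, 128, device="cuda", dtype=torch.bfloat16)   # assumed [batch, seq, heads, head_dim]
    k, v = torch.randn_like(q), torch.randn_like(q)
    out = flash.fwd(q, k, v, is_causal=False)[0]
    ref = F.scaled_dot_product_attention(
        q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
    ).transpose(1, 2)
    torch.testing.assert_close(out, ref, rtol=2e-2, atol=2e-2)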
diff --git a/flash_attn/impls/hf_kernels_flash_attn3.html b/flash_attn/impls/hf_kernels_flash_attn3.html index 73152c9d7d2b2ae02a61e5976e614b15ab93f5e7..efe2b1e9a2d98522716dd5a6d54246f2bda7df60 100644 --- a/flash_attn/impls/hf_kernels_flash_attn3.html +++ b/flash_attn/impls/hf_kernels_flash_attn3.html @@ -3829,28 +3829,28 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35

HF Kernels - Flash Attention 3

HuggingFace Kernels Flash Attention 3 Benchmark

-
+
▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 40.68s +Cell: benchmark | 0.05s | FAILED | Raw GitHub 🤗 HF
-
+
# /// script
 # requires-python = ">=3.10"
@@ -3862,7 +3862,7 @@ Cell: benchmark | 40.68s
 # ]
 #
 # [tool.uv.sources]
-# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
 # ///
 import torch
 import sys
@@ -3921,6 +3921,7 @@ Cell: benchmark | 40.68s
         gen=kbt.attn.gen_qkv,
         ref=kbt.attn.ref_math,
         cmp=kbt.attn.cmp_allclose,
+        profile_trace=True
     )
     kbt.summarize(["attn.jsonl"])
 
@@ -3929,77 +3930,9 @@ Cell: benchmark | 40.68s
-
impl wl p50(ms) ok -hf_kernels_flash_attn3 flux_L128 0.28 True -hf_kernels_flash_attn3 flux_L256 0.34 True -hf_kernels_flash_attn3 flux_L320 0.36 True -hf_kernels_flash_attn3 flux_L384 0.37 True -hf_kernels_flash_attn3 flux_L448 0.40 True -hf_kernels_flash_attn3 flux_L512 0.43 True -
-
-
▶ UV Install Logs
- -
-
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] -Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 3.56it/s] -Fetching 4 files: 50%|█████ | 2/4 [00:02<00:02, 1.32s/it] -Fetching 4 files: 100%|██████████| 4/4 [00:02<00:00, 1.72it/s]
-
-

Artifacts:

-attn.jsonl +
× Failed to resolve script requirement + ╰─▶ Distribution not found at: + file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
diff --git a/flash_attn/impls/mem_efficient_attention.html b/flash_attn/impls/mem_efficient_attention.html index d312b68c7234c936d844e0e594ee7dcd9cfc80f7..5f096db1621b7899513f8ab39db491fce973f695 100644 --- a/flash_attn/impls/mem_efficient_attention.html +++ b/flash_attn/impls/mem_efficient_attention.html @@ -3829,27 +3829,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35

Memory Efficient Attention Implementation

Memory Efficient SDPA Benchmark

-
+
▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 39.23s +Cell: benchmark | 0.01s | FAILED | Raw GitHub
-
+
# /// script
 # requires-python = ">=3.10"
@@ -3860,7 +3860,7 @@ Cell: benchmark | 39.23s
 # ]
 #
 # [tool.uv.sources]
-# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
 # ///
 import torch
 import sys
@@ -3917,6 +3917,7 @@ Cell: benchmark | 39.23s
         gen=kbt.attn.gen_qkv,
         ref=kbt.attn.ref_math,
         cmp=kbt.attn.cmp_allclose,
+        profile_trace=True
     )
     kbt.summarize(["attn.jsonl"])
 
@@ -3925,71 +3926,9 @@ Cell: benchmark | 39.23s
-
impl wl p50(ms) ok -torch_mem_eff flux_L128 0.48 True -torch_mem_eff flux_L256 0.63 True -torch_mem_eff flux_L320 0.70 True -torch_mem_eff flux_L384 0.83 True -torch_mem_eff flux_L448 0.95 True -torch_mem_eff flux_L512 1.00 True -
-
-
▶ UV Install Logs
- -
-
-

Artifacts:

-attn.jsonl +
× Failed to resolve script requirement + ╰─▶ Distribution not found at: + file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
diff --git a/flash_attn/impls/sage_attention.html b/flash_attn/impls/sage_attention.html index 28506e5e6e014e92b72b012a13eeb752eb738612..1eb83cd2eeadcc2c1a8a7ec9aeff36a99a645ce7 100644 --- a/flash_attn/impls/sage_attention.html +++ b/flash_attn/impls/sage_attention.html @@ -3829,27 +3829,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35

SageAttention Implementation

SageAttention Benchmark (INT8 Quantized)

-
+
▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 41.27s +Cell: benchmark | 0.05s | FAILED | Raw GitHub
-
+
# /// script
 # requires-python = ">=3.10"
@@ -3862,7 +3862,7 @@ Cell: benchmark | 41.27s
 # ]
 #
 # [tool.uv.sources]
-# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
 # ///
 import torch
 import sys
@@ -3928,6 +3928,7 @@ Cell: benchmark | 41.27s
         gen=kbt.attn.gen_qkv,
         ref=kbt.attn.ref_math,
         cmp=kbt.attn.cmp_allclose,
+        profile_trace=True
     )
     kbt.summarize(["attn.jsonl"])
 
@@ -3936,84 +3937,9 @@ Cell: benchmark | 41.27s
-
impl wl p50(ms) ok -sage_int8_fp16 flux_L128 FAIL False - Error: module 'sage_attention_a8eb63760f50ebd' has no attribute 'fwd' -sage_int8_fp16 flux_L256 FAIL False - Error: module 'sage_attention_a8eb63760f50ebd' has no attribute 'fwd' -sage_int8_fp16 flux_L320 FAIL False - Error: module 'sage_attention_a8eb63760f50ebd' has no attribute 'fwd' -sage_int8_fp16 flux_L384 FAIL False - Error: module 'sage_attention_a8eb63760f50ebd' has no attribute 'fwd' -sage_int8_fp16 flux_L448 FAIL False - Error: module 'sage_attention_a8eb63760f50ebd' has no attribute 'fwd' -sage_int8_fp16 flux_L512 FAIL False - Error: module 'sage_attention_a8eb63760f50ebd' has no attribute 'fwd' -
-
-
▶ UV Install Logs
- -
-
Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s] -Fetching 11 files: 9%|▉ | 1/11 [00:00<00:05, 1.85it/s] -Fetching 11 files: 45%|████▌ | 5/11 [00:00<00:00, 6.46it/s] -Fetching 11 files: 73%|███████▎ | 8/11 [00:01<00:00, 10.07it/s] -Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 10.94it/s]
-
-

Artifacts:

-attn.jsonl +
× Failed to resolve script requirement + ╰─▶ Distribution not found at: + file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
diff --git a/flash_attn/impls/xformers.html b/flash_attn/impls/xformers.html index 373894c5797700bdd37003fd887d9b68e8139e31..f41ebeee1d5a268d65fb78f5d792b6e9a5ff7e36 100644 --- a/flash_attn/impls/xformers.html +++ b/flash_attn/impls/xformers.html @@ -3829,27 +3829,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31 + Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35

xFormers Memory Efficient Attention

xFormers Benchmark

-
+
▼ code ▼ output - ▶ uv-logs + ▶ uv-logs | -Cell: benchmark | 41.87s +Cell: benchmark | 0.01s | FAILED | Raw GitHub
-
+
# /// script
 # requires-python = ">=3.10"
@@ -3861,7 +3861,7 @@ Cell: benchmark | 41.87s
 # ]
 #
 # [tool.uv.sources]
-# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
 # ///
 import torch
 import sys
@@ -3917,6 +3917,7 @@ Cell: benchmark | 41.87s
         gen=kbt.attn.gen_qkv,
         ref=kbt.attn.ref_math,
         cmp=kbt.attn.cmp_allclose,
+        profile_trace=True
     )
     kbt.summarize(["attn.jsonl"])
 
@@ -3925,73 +3926,9 @@ Cell: benchmark | 41.87s
-
impl wl p50(ms) ok -xformers_meff flux_L128 0.35 True -xformers_meff flux_L256 0.41 True -xformers_meff flux_L320 0.43 True -xformers_meff flux_L384 0.44 True -xformers_meff flux_L448 0.48 True -xformers_meff flux_L512 0.50 True -
-
-
▶ UV Install Logs
- -
-
-

Artifacts:

-attn.jsonl +
× Failed to resolve script requirement + ╰─▶ Distribution not found at: + file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
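The xformers cell added in this diff feeds q, k, v to memory_efficient_attention directly, since xFormers already uses the benchmark's [batch, seq_len, heads, head_dim] layout, whereas SDPA wants heads ahead of the sequence dimension. A hedged cross-check of the two layouts, assuming xformers and a CUDA device are available:

import torch
import torch.nn.functional as F
import xformers.ops as xops

if torch.cuda.is_available():
    q = torch.randn(1, 1152, 24, 128, device="cuda", dtype=torch.bfloat16)   # xFormers layout
    k, v = torch.randn_like(q), torch.randn_like(q)
    o_xf = xops.memory_efficient_attention(q, k, v)
    o_sdpa = F.scaled_dot_product_attention(
        q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
    ).transpose(1, 2)
    torch.testing.assert_close(o_xf, o_sdpa, rtol=2e-2, atol=2e-2)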
diff --git a/index.html b/index.html index df44040e2dd9e1e4a0fc2d5ee08453d4b9953f11..2e332fbb4625d6616600ed587901091f0cce28ce 100644 --- a/index.html +++ b/index.html @@ -79,7 +79,9 @@

Index of /

\ No newline at end of file diff --git a/layer_norm/impls/cells/benchmark.py b/layer_norm/impls/cells/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..7d5061e985592977dbe3cf1ff2ad0cf1d1de7d6c --- /dev/null +++ b/layer_norm/impls/cells/benchmark.py @@ -0,0 +1,62 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "numpy", +# "torch", +# "kernels", +# "kernels-benchmark-tools", +# ] +# +# [tool.uv.sources] +# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true } +# /// +import torch +from kernels import get_kernel +import kernels_benchmark_tools as kbt + +layer_norm_kernel = get_kernel("kernels-community/layer-norm") + +def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5): + B, S, D = x.shape + # The kernel expects [N, D] input; support beta (bias) if provided. + out = layer_norm_kernel.dropout_add_ln_fwd( + input=x.view(-1, D), + gamma=weight, + beta=bias, + rowscale=None, + colscale=None, + x0_subset=None, + z_subset=None, + dropout_p=0.0, + epsilon=eps, + rowscale_const=1.0, + z_numrows=S, + gen=None, + residual_in_fp32=False, + is_rms_norm=False, + )[0].view(B, S, D) + return out + +kbt.add( + "hf_kernels_layer_norm", + hf_kernels_layer_norm, + tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, +) + +if __name__ == "__main__": + device = "cuda" if torch.cuda.is_available() else "cpu" + dtype = "float32" if device == "cpu" else "bfloat16" + + wl = list(kbt.layer_norm.llama_workloads(dtype)) if device == "cuda" else list(kbt.layer_norm.cpu_workloads(dtype)) + + kbt.run( + wl, + jsonl="ln.jsonl", + reps=5, + warmup=2, + gen=kbt.layer_norm.gen_inputs, + ref=kbt.layer_norm.ref_layer_norm, + cmp=kbt.layer_norm.cmp_allclose, + profile_trace=False, + ) + kbt.summarize(["ln.jsonl"]) \ No newline at end of file diff --git a/layer_norm/impls/cells/nv.py b/layer_norm/impls/cells/nv.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b2d5c1f9f3850dc1d3092240fd46853e7ed0c5 --- /dev/null +++ b/layer_norm/impls/cells/nv.py @@ -0,0 +1,2 @@ +import subprocess +print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout) \ No newline at end of file diff --git a/layer_norm/impls/hf_kernels_layer_norm.html b/layer_norm/impls/hf_kernels_layer_norm.html new file mode 100644 index 0000000000000000000000000000000000000000..24b5162eaf6392c44c5cf9fe82684349d8e6302a --- /dev/null +++ b/layer_norm/impls/hf_kernels_layer_norm.html @@ -0,0 +1,3932 @@ + + + + + + hf_kernels_layer_norm + + + + + + + +
+
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

on_github: huggingface/kernels-uvnotes

+

HF Kernels LayerNorm Implementation

+

Based on the kernels-community/layer-norm kernel, called through dropout_add_ln_fwd with dropout disabled.

+

LayerNorm Benchmark (HF Kernels)

+
+
+Cell: benchmark | 0.05s | FAILED
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch",
+#     "kernels",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
+# ///
+import torch
+from kernels import get_kernel
+import kernels_benchmark_tools as kbt
+
+layer_norm_kernel = get_kernel("kernels-community/layer-norm")
+
+def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
+    B, S, D = x.shape
+    # The kernel expects [N, D] input; support beta (bias) if provided.
+    out = layer_norm_kernel.dropout_add_ln_fwd(
+        input=x.view(-1, D),
+        gamma=weight,
+        beta=bias,
+        rowscale=None,
+        colscale=None,
+        x0_subset=None,
+        z_subset=None,
+        dropout_p=0.0,
+        epsilon=eps,
+        rowscale_const=1.0,
+        z_numrows=S,
+        gen=None,
+        residual_in_fp32=False,
+        is_rms_norm=False,
+    )[0].view(B, S, D)
+    return out
+
+kbt.add(
+    "hf_kernels_layer_norm",
+    hf_kernels_layer_norm,
+    tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
+)
+
+if __name__ == "__main__":
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = "float32" if device == "cpu" else "bfloat16"
+
+    wl = list(kbt.layer_norm.llama_workloads(dtype)) if device == "cuda" else list(kbt.layer_norm.cpu_workloads(dtype))
+
+    kbt.run(
+        wl,
+        jsonl="ln.jsonl",
+        reps=5,
+        warmup=2,
+        gen=kbt.layer_norm.gen_inputs,
+        ref=kbt.layer_norm.ref_layer_norm,
+        cmp=kbt.layer_norm.cmp_allclose,
+        profile_trace=False,
+    )
+    kbt.summarize(["ln.jsonl"])
+
+ +
+
+
+
+
× Failed to resolve script requirement + ╰─▶ Distribution not found at: + file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools +
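The wrapper above flattens [B, S, D] to [N, D] before calling the fused kernel and reshapes the result back. A minimal CPU-only sketch of why that round trip is safe, using torch.nn.functional.layer_norm as a stand-in for the kernel (shapes are illustrative):

import torch
import torch.nn.functional as F

B, S, D = 2, 8, 64
x = torch.randn(B, S, D)
weight, bias = torch.ones(D), torch.zeros(D)
eps = 1e-5

flat = F.layer_norm(x.view(-1, D), (D,), weight, bias, eps).view(B, S, D)   # kernel-style [N, D] path
full = F.layer_norm(x, (D,), weight, bias, eps)                             # direct [B, S, D] path
torch.testing.assert_close(flat, full)   # LayerNorm normalizes each row over D, so leading dims are interchangeable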
+
+
+
+ + + \ No newline at end of file diff --git a/layer_norm/impls/index.html b/layer_norm/impls/index.html new file mode 100644 index 0000000000000000000000000000000000000000..51ba6dd6789d67e2ffa1e3f02dea720dbda17216 --- /dev/null +++ b/layer_norm/impls/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /layer_norm/impls + + + +
+

Index of /layer_norm/impls

+ + + \ No newline at end of file diff --git a/layer_norm/impls/torch_layer_norm.html b/layer_norm/impls/torch_layer_norm.html new file mode 100644 index 0000000000000000000000000000000000000000..dd538e5f7886597ab68a10269589bb2a90331b2a --- /dev/null +++ b/layer_norm/impls/torch_layer_norm.html @@ -0,0 +1,3958 @@ + + + + + + torch_layer_norm + + + + + + + +
+
+
Generated on:
+
+ Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35 +
+
+ +
+

on_github: huggingface/kernels-uvnotes

+

Torch LayerNorm Implementation

+

GPU Info

+
+
+Cell: nv | 0.22s
+
+
+
import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Wed Oct 22 08:58:23 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | +| N/A 26C P8 22W / 350W | 0MiB / 46068MiB | 0% Default | +| | | N/A | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| No running processes found | ++-----------------------------------------------------------------------------------------+ + +
+
+
+ +

LayerNorm Benchmark (PyTorch)

+
+
+Cell: benchmark | 0.01s | FAILED
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
+# ///
+import torch
+import kernels_benchmark_tools as kbt
+
+
+def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
+    return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)
+
+kbt.add(
+    "torch_layer_norm",
+    torch_layer_norm,
+    tags={"family": "torch", "op": "layer_norm"},
+)
+
+if __name__ == "__main__":
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = "float32" if device == "cpu" else "bfloat16"
+
+    wl = list(kbt.layer_norm.llama_workloads(dtype)) if device == "cuda" else list(kbt.layer_norm.cpu_workloads(dtype))
+
+    kbt.run(
+        wl,
+        jsonl="ln.jsonl",
+        reps=5,
+        warmup=2,
+        gen=kbt.layer_norm.gen_inputs,
+        ref=kbt.layer_norm.ref_layer_norm,
+        cmp=kbt.layer_norm.cmp_allclose,
+        profile_trace=False,
+    )
+    kbt.summarize(["ln.jsonl"])
+
+ +
+
+
+
+
× Failed to resolve script requirement + ╰─▶ Distribution not found at: + file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools +
+
+
+
+ + + \ No newline at end of file diff --git a/layer_norm/index.html b/layer_norm/index.html new file mode 100644 index 0000000000000000000000000000000000000000..12f60968be235270e079aa5c48545ec9a928579b --- /dev/null +++ b/layer_norm/index.html @@ -0,0 +1,89 @@ + + + + + + Index of /layer_norm + + + +
+

Index of /layer_norm

+ + + \ No newline at end of file diff --git a/layer_norm/results/index.html b/layer_norm/results/index.html new file mode 100644 index 0000000000000000000000000000000000000000..5b6bcefdc3dcaa949d66002abc2672c3de221470 --- /dev/null +++ b/layer_norm/results/index.html @@ -0,0 +1,88 @@ + + + + + + Index of /layer_norm/results + + + +
+

Index of /layer_norm/results

+ + + \ No newline at end of file