Spaces:
No application file
No application file
Upload folder using huggingface_hub
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- activation/impls/artifacts/benchmark/activation.jsonl +9 -0
- activation/impls/cells/benchmark.py +34 -0
- activation/impls/cells/nv.py +2 -0
- activation/impls/hf_kernels_swiglu.html +0 -0
- activation/impls/index.html +89 -0
- activation/impls/torch_swiglu.html +0 -0
- activation/index.html +89 -0
- activation/results/artifacts/combine/latency.svg +318 -0
- activation/results/cells/combine.py +27 -0
- activation/results/combined_results.html +0 -0
- activation/results/index.html +88 -0
- causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -0
- causal_conv1d/impls/cells/benchmark.py +31 -0
- causal_conv1d/impls/cells/nv.py +2 -0
- causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
- causal_conv1d/impls/index.html +89 -0
- causal_conv1d/impls/torch_causal_conv1d.html +0 -0
- causal_conv1d/index.html +89 -0
- causal_conv1d/results/artifacts/combine/latency.svg +530 -0
- causal_conv1d/results/cells/combine.py +26 -0
- causal_conv1d/results/combined_results.html +0 -0
- causal_conv1d/results/index.html +88 -0
- flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -0
- flash_attn/impls/cells/benchmark.py +30 -0
- flash_attn/impls/cells/nv.py +3 -0
- flash_attn/impls/flash_attention.html +0 -0
- flash_attn/impls/hf_kernels_flash_attn.html +0 -0
- flash_attn/impls/hf_kernels_flash_attn3.html +0 -0
- flash_attn/impls/index.html +93 -0
- flash_attn/impls/mem_efficient_attention.html +0 -0
- flash_attn/impls/sage_attention.html +0 -0
- flash_attn/impls/xformers.html +0 -0
- flash_attn/index.html +89 -0
- flash_attn/results/artifacts/combine/latency.svg +355 -0
- flash_attn/results/cells/combine.py +30 -0
- flash_attn/results/combined_results.html +0 -0
- flash_attn/results/index.html +88 -0
- index.html +0 -0
- layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -0
- layer_norm/impls/cells/benchmark.py +49 -0
- layer_norm/impls/cells/nv.py +2 -0
- layer_norm/impls/hf_kernels_layer_norm.html +0 -0
- layer_norm/impls/index.html +89 -0
- layer_norm/impls/torch_layer_norm.html +0 -0
- layer_norm/index.html +89 -0
- layer_norm/results/artifacts/combine/latency.svg +230 -0
- layer_norm/results/cells/combine.py +26 -0
- layer_norm/results/combined_results.html +0 -0
- layer_norm/results/index.html +88 -0
- rotary/impls/artifacts/benchmark/rotary.jsonl +24 -0
activation/impls/artifacts/benchmark/activation.jsonl
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.022950000015953265, "p50": 0.023951000002853107, "p90": 0.0245499999778076, "mean": 0.02414040001212925, "iqr": 0.0010899999551838846, "raw_times": [0.02579100004140855, 0.0245499999778076, 0.023951000002853107, 0.022950000015953265, 0.023460000022623717], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031180999997104664, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02659000000448941, "p50": 0.03026100000624865, "p90": 0.03163099995617813, "mean": 0.03016299999671901, "iqr": 0.001709999935428641, "raw_times": [0.02659000000448941, 0.03026100000624865, 0.02992100002074949, 0.03163099995617813, 0.032411999995929364], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03256100001181039, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02795999995441889, "p50": 0.0293610000312583, "p90": 0.02937200002861573, "mean": 0.029306999988421012, "iqr": 9.100006082007894e-05, "raw_times": [0.02795999995441889, 0.03056099996001649, 0.0293610000312583, 0.02928099996779565, 0.02937200002861573], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03265100002636245, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02837199997429707, "p50": 0.029151000035199104, "p90": 0.0292910000325719, "mean": 0.028971200003979902, "iqr": 0.0007500000265281415, "raw_times": [0.02854100000604376, 0.0292910000325719, 0.029500999971787678, 0.029151000035199104, 0.02837199997429707], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03205100000513994, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0284509999914917, "p50": 0.02926099995192999, "p90": 0.029411000014079036, "mean": 0.029144599977826147, "iqr": 0.0005010000450056395, "raw_times": [0.028909999969073397, 0.029689999962556612, 0.029411000014079036, 0.0284509999914917, 0.02926099995192999], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031930999966789386, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027061000025696558, "p50": 0.028121000013925368, "p90": 0.02836999999544787, "mean": 0.027967000005446607, "iqr": 0.0005990000317979138, "raw_times": [0.027770999963649956, 0.028512000028513285, 0.028121000013925368, 0.02836999999544787, 0.027061000025696558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030291000030047144, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 7 |
+
{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02748099996097153, "p50": 0.029001000029893476, "p90": 0.030041000002256624, "mean": 0.029116999996858794, "iqr": 0.0011299999869152089, "raw_times": [0.02748099996097153, 0.030150999975830928, 0.030041000002256624, 0.029001000029893476, 0.028911000015341415], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031200999956126907, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 8 |
+
{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028581000037775084, "p50": 0.028771000017968618, "p90": 0.02886099997567726, "mean": 0.028774800000519463, "iqr": 0.00020999999605919584, "raw_times": [0.028581000037775084, 0.02900999999155829, 0.028771000017968618, 0.028650999979618064, 0.02886099997567726], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03162100000508872, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
| 9 |
+
{"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028431000032469456, "p50": 0.029390999998213374, "p90": 0.029580999978406908, "mean": 0.029274800010625768, "iqr": 0.00035999994452140527, "raw_times": [0.028431000032469456, 0.029221000033885502, 0.0297500000101536, 0.029390999998213374, 0.029580999978406908], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030401000003621448, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
|
activation/impls/cells/benchmark.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "numpy",
|
| 5 |
+
# "torch==2.8.0",
|
| 6 |
+
# "kernels-benchmark-tools",
|
| 7 |
+
# "kernels",
|
| 8 |
+
# ]
|
| 9 |
+
#
|
| 10 |
+
# [tool.uv.sources]
|
| 11 |
+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
|
| 12 |
+
# ///
|
| 13 |
+
import torch
|
| 14 |
+
import sys
|
| 15 |
+
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
+
from kernels import get_kernel
|
| 17 |
+
|
| 18 |
+
# Load the activation kernel
|
| 19 |
+
activation = get_kernel("kernels-community/activation")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def hf_kernels_swiglu(input_tensor):
|
| 23 |
+
hidden_dim = input_tensor.shape[-1] // 2
|
| 24 |
+
out_shape = input_tensor.shape[:-1] + (hidden_dim,)
|
| 25 |
+
out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
|
| 26 |
+
return activation.silu_and_mul(out, input_tensor)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
run_benchmark(
|
| 30 |
+
kernel_type=KernelTypeEnum.ACTIVATION,
|
| 31 |
+
impl_name="hf_kernels_swiglu",
|
| 32 |
+
impl_tags={"family": "hf-kernels", "backend": "cuda"},
|
| 33 |
+
impl_func=hf_kernels_swiglu,
|
| 34 |
+
)
|
activation/impls/cells/nv.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
|
activation/impls/hf_kernels_swiglu.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
activation/impls/index.html
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /activation/impls</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /activation/impls</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='hf_kernels_swiglu.html' class='file'>hf_kernels_swiglu.html</a></li>
|
| 86 |
+
<li><a href='torch_swiglu.html' class='file'>torch_swiglu.html</a></li>
|
| 87 |
+
</ul>
|
| 88 |
+
</body>
|
| 89 |
+
</html>
|
activation/impls/torch_swiglu.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
activation/index.html
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /activation</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /activation</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='impls/index.html' class='dir'>impls/</a></li>
|
| 86 |
+
<li><a href='results/index.html' class='dir'>results/</a></li>
|
| 87 |
+
</ul>
|
| 88 |
+
</body>
|
| 89 |
+
</html>
|
activation/results/artifacts/combine/latency.svg
ADDED
|
|
activation/results/cells/combine.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "numpy",
|
| 5 |
+
# "torch==2.8.0",
|
| 6 |
+
# "kernels-benchmark-tools",
|
| 7 |
+
# "matplotlib",
|
| 8 |
+
# ]
|
| 9 |
+
#
|
| 10 |
+
# [tool.uv.sources]
|
| 11 |
+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
|
| 12 |
+
# ///
|
| 13 |
+
from kernels_benchmark_tools.core.visuals import generate_combined_results
|
| 14 |
+
|
| 15 |
+
# Map display names to uvnote environment variables
|
| 16 |
+
cache_env_map = {
|
| 17 |
+
"HF Kernels SwiGLU": "UVNOTE_FILE_HF_KERNELS_SWIGLU_BENCHMARK",
|
| 18 |
+
"PyTorch SwiGLU": "UVNOTE_FILE_TORCH_SWIGLU_BENCHMARK",
|
| 19 |
+
# "Compiled SwiGLU": "UVNOTE_FILE_COMPILED_SWIGLU_BENCHMARK",
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
# Generate combined results with visualization
|
| 23 |
+
generate_combined_results(
|
| 24 |
+
cache_env_map=cache_env_map,
|
| 25 |
+
output_filename="activation.jsonl",
|
| 26 |
+
svg_filename="latency.svg"
|
| 27 |
+
)
|
activation/results/combined_results.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
activation/results/index.html
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /activation/results</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /activation/results</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='combined_results.html' class='file'>combined_results.html</a></li>
|
| 86 |
+
</ul>
|
| 87 |
+
</body>
|
| 88 |
+
</html>
|
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.046111000017390325, "p50": 0.046270999973785365, "p90": 0.04740100001754399, "mean": 0.04670720001058726, "iqr": 0.001160000010713702, "raw_times": [0.047512000037386315, 0.04740100001754399, 0.04624100000683029, 0.046270999973785365, 0.046111000017390325], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05871199999774035, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05225199998903918, "p50": 0.053462000039417035, "p90": 0.053592000028857, "mean": 0.05365380001194353, "iqr": 0.0002100000529026147, "raw_times": [0.053462000039417035, 0.055581000026450056, 0.053592000028857, 0.053381999975954386, 0.05225199998903918], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0581319999923835, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05121200001667603, "p50": 0.05470199999990655, "p90": 0.05482099999198908, "mean": 0.05431980000594194, "iqr": 0.0013289999856169743, "raw_times": [0.05121200001667603, 0.057372000014765945, 0.05470199999990655, 0.05482099999198908, 0.05349200000637211], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056541999981618574, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05210199998373355, "p50": 0.05333199999313365, "p90": 0.05396199998131124, "mean": 0.05322599998862643, "iqr": 0.0016399999935856613, "raw_times": [0.05210199998373355, 0.05333199999313365, 0.05396199998131124, 0.052321999987725576, 0.05441199999722812], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09094299997514099, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05103099999814731, "p50": 0.05309199997327596, "p90": 0.053381999975954386, "mean": 0.05291379998197954, "iqr": 0.0004199999921183917, "raw_times": [0.053381999975954386, 0.052961999983835994, 0.05103099999814731, 0.05309199997327596, 0.054101999978684034], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05603199997494812, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051181999992877536, "p50": 0.05189199998767435, "p90": 0.05201199996918149, "mean": 0.052023999978700886, "iqr": 0.0004999999987376214, "raw_times": [0.05151199997044387, 0.05352199997332718, 0.05189199998767435, 0.05201199996918149, 0.051181999992877536], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055981999992127385, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 7 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05042200001525998, "p50": 0.052002000018092076, "p90": 0.05382199998393844, "mean": 0.05366420000427752, "iqr": 0.00333999997792489, "raw_times": [0.05048200000601355, 0.05042200001525998, 0.052002000018092076, 0.06159299999808354, 0.05382199998393844], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05433199999060889, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 8 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0522220000220841, "p50": 0.053632000003744906, "p90": 0.05870200004665094, "mean": 0.056078200009324064, "iqr": 0.005690000079994206, "raw_times": [0.0522220000220841, 0.06282300000748364, 0.053632000003744906, 0.05301199996665673, 0.05870200004665094], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.055741999972269696, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 9 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05032100000335049, "p50": 0.050921000024573004, "p90": 0.05318199998782802, "mean": 0.05303959999309882, "iqr": 0.0023800000121809717, "raw_times": [0.05080199997564705, 0.050921000024573004, 0.05032100000335049, 0.059971999974095525, 0.05318199998782802], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0550720000092042, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 10 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05211199999166638, "p50": 0.05235200001152407, "p90": 0.053132000005007285, "mean": 0.05707820000679931, "iqr": 0.0008700000080352766, "raw_times": [0.05235200001152407, 0.05226199999697201, 0.053132000005007285, 0.07553300002882679, 0.05211199999166638], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05610199997363452, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 11 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0512720000074296, "p50": 0.0524320000181433, "p90": 0.05278200001157529, "mean": 0.05529400000341411, "iqr": 0.000919999990856013, "raw_times": [0.05278200001157529, 0.0524320000181433, 0.0512720000074296, 0.0681219999592031, 0.05186200002071928], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05547199998545693, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 12 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05112100001269937, "p50": 0.051342000006115995, "p90": 0.05172099997707846, "mean": 0.053885599993463984, "iqr": 0.00040899999476096127, "raw_times": [0.05112100001269937, 0.06393199998910859, 0.05172099997707846, 0.0513119999823175, 0.051342000006115995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055091999968226446, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 13 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050531999988834286, "p50": 0.05176199999823439, "p90": 0.051821999988987955, "mean": 0.05163600000059887, "iqr": 0.0003099999617006688, "raw_times": [0.050531999988834286, 0.05176199999823439, 0.052551999999650434, 0.051821999988987955, 0.051512000027287286], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055182000039621926, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 14 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05124200004047452, "p50": 0.05148200000348879, "p90": 0.05251200002476253, "mean": 0.051918000008299714, "iqr": 0.0011100000278929656, "raw_times": [0.05251200002476253, 0.05295199997590316, 0.05148200000348879, 0.05140199999686956, 0.05124200004047452], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05506200000127137, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 15 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05025200005093211, "p50": 0.05105200000343757, "p90": 0.05146199998762313, "mean": 0.05136380001431462, "iqr": 0.0005399999736255268, "raw_times": [0.05146199998762313, 0.053131000015582686, 0.050922000013997604, 0.05025200005093211, 0.05105200000343757], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0684330000240152, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 16 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051181999992877536, "p50": 0.052152000023397704, "p90": 0.05241200000227764, "mean": 0.05240600000888662, "iqr": 0.00034999999343199306, "raw_times": [0.052152000023397704, 0.05422200001703459, 0.05241200000227764, 0.051181999992877536, 0.052062000008845644], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05490099999860831, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 17 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05016099999011203, "p50": 0.05225199998903918, "p90": 0.05251199996791911, "mean": 0.05182779999586273, "iqr": 0.001349999934063817, "raw_times": [0.05016099999011203, 0.053051999998388055, 0.05116200003385529, 0.05251199996791911, 0.05225199998903918], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05627199999480581, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 18 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05154100000481776, "p50": 0.0524320000181433, "p90": 0.05299099996136647, "mean": 0.05266959998380116, "iqr": 0.0006189999908201571, "raw_times": [0.05154100000481776, 0.054011999964131974, 0.05299099996136647, 0.0524320000181433, 0.05237199997054631], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05572200001324745, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 19 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05098199994790775, "p50": 0.05128100002593783, "p90": 0.052071999959935056, "mean": 0.05161159999715892, "iqr": 0.0008409999168179638, "raw_times": [0.05098199994790775, 0.052071999959935056, 0.05128100002593783, 0.05123100004311709, 0.052492000008896866], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055401999986770534, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 20 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050202000011267955, "p50": 0.05295199997590316, "p90": 0.05307200001425372, "mean": 0.052619999996750266, "iqr": 0.00046000002384971594, "raw_times": [0.050202000011267955, 0.05307200001425372, 0.054261999991922494, 0.05295199997590316, 0.052611999990404], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05440200004613871, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 21 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05220100001679384, "p50": 0.052891999985149596, "p90": 0.05323199997064876, "mean": 0.05431980000594194, "iqr": 0.0007509999591093219, "raw_times": [0.05220100001679384, 0.052891999985149596, 0.05323199997064876, 0.06079300004557808, 0.052481000011539436], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0552820000052634, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 22 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05108200002723606, "p50": 0.05157200001804085, "p90": 0.053041000001030625, "mean": 0.051985800007514626, "iqr": 0.0018490000002202578, "raw_times": [0.05157200001804085, 0.05108200002723606, 0.053041000001030625, 0.05119200000081037, 0.053041999990455224], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05657200000541707, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 23 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05095099999152808, "p50": 0.0515919999770631, "p90": 0.05208099997844329, "mean": 0.05173159999003474, "iqr": 0.0006789999815737247, "raw_times": [0.0515919999770631, 0.05208099997844329, 0.052632000006269664, 0.05095099999152808, 0.05140199999686956], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056392000033156364, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
| 24 |
+
{"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05110099999683371, "p50": 0.051662000032592914, "p90": 0.051741999982368725, "mean": 0.05161380000799909, "iqr": 0.00010999997357430402, "raw_times": [0.05163200000879442, 0.05110099999683371, 0.051741999982368725, 0.051662000032592914, 0.05193200001940568], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05588200002648591, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
|
causal_conv1d/impls/cells/benchmark.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "numpy",
|
| 5 |
+
# "torch==2.8.0",
|
| 6 |
+
# "kernels-benchmark-tools",
|
| 7 |
+
# "kernels",
|
| 8 |
+
# ]
|
| 9 |
+
#
|
| 10 |
+
# [tool.uv.sources]
|
| 11 |
+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
|
| 12 |
+
# ///
|
| 13 |
+
import torch
|
| 14 |
+
import sys
|
| 15 |
+
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
+
from kernels import get_kernel
|
| 17 |
+
|
| 18 |
+
# Load the causal conv1d kernel
|
| 19 |
+
causal_conv1d = get_kernel("kernels-community/causal-conv1d")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def hf_kernels_causal_conv1d(input_tensor, weight, bias):
|
| 23 |
+
return causal_conv1d.causal_conv1d_fn(input_tensor, weight, bias)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
run_benchmark(
|
| 27 |
+
kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
|
| 28 |
+
impl_name="hf_kernels_causal_conv1d",
|
| 29 |
+
impl_tags={"family": "hf-kernels", "backend": "cuda"},
|
| 30 |
+
impl_func=hf_kernels_causal_conv1d,
|
| 31 |
+
)
|
causal_conv1d/impls/cells/nv.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
|
causal_conv1d/impls/hf_kernels_causal_conv1d.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
causal_conv1d/impls/index.html
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /causal_conv1d/impls</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /causal_conv1d/impls</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='hf_kernels_causal_conv1d.html' class='file'>hf_kernels_causal_conv1d.html</a></li>
|
| 86 |
+
<li><a href='torch_causal_conv1d.html' class='file'>torch_causal_conv1d.html</a></li>
|
| 87 |
+
</ul>
|
| 88 |
+
</body>
|
| 89 |
+
</html>
|
causal_conv1d/impls/torch_causal_conv1d.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
causal_conv1d/index.html
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /causal_conv1d</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /causal_conv1d</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='impls/index.html' class='dir'>impls/</a></li>
|
| 86 |
+
<li><a href='results/index.html' class='dir'>results/</a></li>
|
| 87 |
+
</ul>
|
| 88 |
+
</body>
|
| 89 |
+
</html>
|
causal_conv1d/results/artifacts/combine/latency.svg
ADDED
|
|
causal_conv1d/results/cells/combine.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "numpy",
|
| 5 |
+
# "torch==2.8.0",
|
| 6 |
+
# "kernels-benchmark-tools",
|
| 7 |
+
# "matplotlib",
|
| 8 |
+
# ]
|
| 9 |
+
#
|
| 10 |
+
# [tool.uv.sources]
|
| 11 |
+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
|
| 12 |
+
# ///
|
| 13 |
+
from kernels_benchmark_tools.core.visuals import generate_combined_results
|
| 14 |
+
|
| 15 |
+
# Map display names to uvnote environment variables
|
| 16 |
+
cache_env_map = {
|
| 17 |
+
"HF Kernels Causal Conv1D": "UVNOTE_FILE_HF_KERNELS_CAUSAL_CONV1D_BENCHMARK",
|
| 18 |
+
"PyTorch Causal Conv1D": "UVNOTE_FILE_TORCH_CAUSAL_CONV1D_BENCHMARK",
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
# Generate combined results with visualization
|
| 22 |
+
generate_combined_results(
|
| 23 |
+
cache_env_map=cache_env_map,
|
| 24 |
+
output_filename="causal_conv1d.jsonl",
|
| 25 |
+
svg_filename="latency.svg"
|
| 26 |
+
)
|
causal_conv1d/results/combined_results.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
causal_conv1d/results/index.html
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /causal_conv1d/results</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /causal_conv1d/results</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='combined_results.html' class='file'>combined_results.html</a></li>
|
| 86 |
+
</ul>
|
| 87 |
+
</body>
|
| 88 |
+
</html>
|
flash_attn/impls/artifacts/benchmark/attention.jsonl
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9715130000245153, "p50": 0.9773340000265307, "p90": 0.9788430000412518, "mean": 0.976309200018477, "iqr": 0.005310000005920301, "raw_times": [0.9735330000353315, 0.9773340000265307, 0.9803229999647556, 0.9788430000412518, 0.9715130000245153], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9926440000072034, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0154749999742307, "p50": 1.0199449999959143, "p90": 1.0278160000325443, "mean": 1.0223952000046665, "iqr": 0.010921000011876458, "raw_times": [1.0278160000325443, 1.0168950000206678, 1.0318449999999757, 1.0154749999742307, 1.0199449999959143], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0225849999869752, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0612160000391668, "p50": 1.0721770000259312, "p90": 1.075397000022349, "mean": 1.0706886000093618, "iqr": 0.009251000051335723, "raw_times": [1.0612160000391668, 1.0721770000259312, 1.0661459999710132, 1.075397000022349, 1.078506999988349], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0771669999485312, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.075485999990633, "p50": 1.0823069999901236, "p90": 1.084176999995634, "mean": 1.0827727999981107, "iqr": 0.0021099999685247894, "raw_times": [1.075485999990633, 1.0820670000271093, 1.0823069999901236, 1.0898269999870536, 1.084176999995634], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1057869999717695, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2330920000067636, "p50": 1.237381999999343, "p90": 1.239422999958606, "mean": 1.2375224000038543, "iqr": 0.002220999931523693, "raw_times": [1.2405130000274767, 1.2372020000270822, 1.2330920000067636, 1.237381999999343, 1.239422999958606], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.22687200001792, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2296720000222194, "p50": 1.230811999960224, "p90": 1.236231999996562, "mean": 1.2357499999893662, "iqr": 0.005929999986165058, "raw_times": [1.236231999996562, 1.2517319999574283, 1.230811999960224, 1.230302000010397, 1.2296720000222194], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2250920000269616, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
flash_attn/impls/cells/benchmark.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "numpy",
|
| 5 |
+
# "torch==2.8.0",
|
| 6 |
+
# "kernels-benchmark-tools",
|
| 7 |
+
# "xformers",
|
| 8 |
+
# ]
|
| 9 |
+
#
|
| 10 |
+
# [tool.uv.sources]
|
| 11 |
+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
|
| 12 |
+
# ///
|
| 13 |
+
import torch
|
| 14 |
+
import sys
|
| 15 |
+
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
+
import xformers.ops as xops
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def xformers_attention(q, k, v):
    """Compute attention with xFormers' memory-efficient kernel.

    The tensors are handed to ``xops.memory_efficient_attention`` unchanged and
    its result is returned as-is.
    """
    # No layout conversion is done here: xFormers expects
    # [batch, seq_len, heads, head_dim].
    attn_out = xops.memory_efficient_attention(q, k, v)
    return attn_out
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# Run the attention benchmark for the xFormers implementation defined above.
run_benchmark(
    impl_func=xformers_attention,
    impl_name="xformers_meff",
    impl_tags={
        "family": "xformers",
        "backend": "memory_efficient",
        "compile": "none",
    },
    kernel_type=KernelTypeEnum.ATTENTION,
)
|
flash_attn/impls/cells/nv.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
|
| 3 |
+
# Print whatever `nvidia-smi` writes to stdout; stderr is captured but not shown.
print(
    subprocess.run(
        ["nvidia-smi"],
        text=True,
        capture_output=True,
    ).stdout
)
|
flash_attn/impls/flash_attention.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
flash_attn/impls/hf_kernels_flash_attn.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
flash_attn/impls/hf_kernels_flash_attn3.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
flash_attn/impls/index.html
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /flash_attn/impls</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /flash_attn/impls</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='flash_attention.html' class='file'>flash_attention.html</a></li>
|
| 86 |
+
<li><a href='hf_kernels_flash_attn.html' class='file'>hf_kernels_flash_attn.html</a></li>
|
| 87 |
+
<li><a href='hf_kernels_flash_attn3.html' class='file'>hf_kernels_flash_attn3.html</a></li>
|
| 88 |
+
<li><a href='mem_efficient_attention.html' class='file'>mem_efficient_attention.html</a></li>
|
| 89 |
+
<li><a href='sage_attention.html' class='file'>sage_attention.html</a></li>
|
| 90 |
+
<li><a href='xformers.html' class='file'>xformers.html</a></li>
|
| 91 |
+
</ul>
|
| 92 |
+
</body>
|
| 93 |
+
</html>
|
flash_attn/impls/mem_efficient_attention.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
flash_attn/impls/sage_attention.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
flash_attn/impls/xformers.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
flash_attn/index.html
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /flash_attn</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /flash_attn</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='impls/index.html' class='dir'>impls/</a></li>
|
| 86 |
+
<li><a href='results/index.html' class='dir'>results/</a></li>
|
| 87 |
+
</ul>
|
| 88 |
+
</body>
|
| 89 |
+
</html>
|
flash_attn/results/artifacts/combine/latency.svg
ADDED
|
|
flash_attn/results/cells/combine.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "numpy",
|
| 5 |
+
# "torch==2.8.0",
|
| 6 |
+
# "kernels-benchmark-tools",
|
| 7 |
+
# "matplotlib",
|
| 8 |
+
# ]
|
| 9 |
+
#
|
| 10 |
+
# [tool.uv.sources]
|
| 11 |
+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
|
| 12 |
+
# ///
|
| 13 |
+
from kernels_benchmark_tools.core.visuals import generate_combined_results
|
| 14 |
+
|
| 15 |
+
# Display name of each implementation -> the uvnote environment variable
# associated with it. Entry order is kept exactly as listed.
cache_env_map = dict(
    [
        ("Flash (PyTorch SDPA)", "UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK"),
        ("MemEff (PyTorch SDPA)", "UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK"),
        ("xFormers", "UVNOTE_FILE_XFORMERS_BENCHMARK"),
        ("HF Kernels Flash Attn", "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK"),
        ("HF Kernels Flash Attn3", "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK"),
        ("SageAttention", "UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK"),
    ]
)
|
| 24 |
+
|
| 25 |
+
# Produce the combined attention results ("attention.jsonl") together with
# the "latency.svg" visualization.
generate_combined_results(
    svg_filename="latency.svg",
    output_filename="attention.jsonl",
    cache_env_map=cache_env_map,
)
|
flash_attn/results/combined_results.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
flash_attn/results/index.html
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /flash_attn/results</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /flash_attn/results</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='combined_results.html' class='file'>combined_results.html</a></li>
|
| 86 |
+
</ul>
|
| 87 |
+
</body>
|
| 88 |
+
</html>
|
index.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8265980000032869, "p50": 0.8294890000115629, "p90": 0.8318879999933415, "mean": 0.8305783999958294, "iqr": 0.0024899999857552757, "raw_times": [0.8318879999933415, 0.8294890000115629, 0.8293980000075862, 0.8355189999633694, 0.8265980000032869], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8372490000283506, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6484859999650325, "p50": 1.6553460000068299, "p90": 1.6562569999791776, "mean": 1.654196599986335, "iqr": 0.004349999983332964, "raw_times": [1.6589869999847906, 1.6484859999650325, 1.6553460000068299, 1.6519069999958447, 1.6562569999791776], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6548570000054497, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6374860000496483, "p50": 1.6479959999742277, "p90": 1.650296000036633, "mean": 1.6462442000261035, "iqr": 0.007159000006140559, "raw_times": [1.6479959999742277, 1.6374860000496483, 1.6523060000395162, 1.6431370000304923, 1.650296000036633], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.658577000000605, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.2406110000238186, "p50": 3.2579909999981282, "p90": 3.259831999969265, "mean": 3.2558895999954984, "iqr": 0.00626999997166422, "raw_times": [3.259831999969265, 3.2579909999981282, 3.2674519999886797, 3.2535619999976007, 3.2406110000238186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2579709999822626, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
|
layer_norm/impls/cells/benchmark.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "numpy",
|
| 5 |
+
# "torch==2.8.0",
|
| 6 |
+
# "kernels",
|
| 7 |
+
# "kernels-benchmark-tools",
|
| 8 |
+
# ]
|
| 9 |
+
#
|
| 10 |
+
# [tool.uv.sources]
|
| 11 |
+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
|
| 12 |
+
# ///
|
| 13 |
+
import torch
|
| 14 |
+
import sys
|
| 15 |
+
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
+
from kernels import get_kernel
|
| 17 |
+
|
| 18 |
+
# Resolve the layer-norm kernel once at import time; the benchmarked function
# calls into this module-level handle.
layer_norm_kernel = get_kernel(
    "kernels-community/layer-norm",
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
    """Layer-norm forward pass through ``layer_norm_kernel.dropout_add_ln_fwd``.

    Args:
        x: 3-D input whose shape is unpacked as ``(B, S, D)``. It may be
            non-contiguous (e.g. a transposed or sliced tensor).
        weight: Forwarded to the kernel as ``gamma``.
        bias: Forwarded to the kernel as ``beta``.
        eps: Forwarded to the kernel as ``epsilon``.

    Returns:
        The first element of the kernel's result, viewed back as ``(B, S, D)``.
    """
    B, S, D = x.shape
    # The kernel expects [N, D] input. `reshape` is used instead of `view` so
    # that non-contiguous inputs no longer raise; for contiguous inputs it
    # still returns a zero-copy view, so that path is unchanged.
    flat_x = x.reshape(-1, D)
    out = layer_norm_kernel.dropout_add_ln_fwd(
        input=flat_x,
        gamma=weight,
        beta=bias,  # support beta (bias) if provided
        rowscale=None,
        colscale=None,
        x0_subset=None,
        z_subset=None,
        dropout_p=0.0,
        epsilon=eps,
        rowscale_const=1.0,
        z_numrows=S,
        gen=None,
        residual_in_fp32=False,
        is_rms_norm=False,
    )[0].view(B, S, D)
    return out
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Run the layer-norm benchmark for the HF kernels implementation defined above.
run_benchmark(
    impl_func=hf_kernels_layer_norm,
    impl_name="hf_kernels_layer_norm",
    impl_tags={
        "family": "hf-kernels",
        "repo": "kernels-community/layer-norm",
        "op": "layer_norm",
    },
    kernel_type=KernelTypeEnum.LAYER_NORM,
)
|
layer_norm/impls/cells/nv.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
# Print whatever `nvidia-smi` writes to stdout; stderr is captured but not shown.
print(
    subprocess.run(
        ["nvidia-smi"],
        text=True,
        capture_output=True,
    ).stdout
)
|
layer_norm/impls/hf_kernels_layer_norm.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
layer_norm/impls/index.html
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /layer_norm/impls</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /layer_norm/impls</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='hf_kernels_layer_norm.html' class='file'>hf_kernels_layer_norm.html</a></li>
|
| 86 |
+
<li><a href='torch_layer_norm.html' class='file'>torch_layer_norm.html</a></li>
|
| 87 |
+
</ul>
|
| 88 |
+
</body>
|
| 89 |
+
</html>
|
layer_norm/impls/torch_layer_norm.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
layer_norm/index.html
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /layer_norm</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /layer_norm</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='impls/index.html' class='dir'>impls/</a></li>
|
| 86 |
+
<li><a href='results/index.html' class='dir'>results/</a></li>
|
| 87 |
+
</ul>
|
| 88 |
+
</body>
|
| 89 |
+
</html>
|
layer_norm/results/artifacts/combine/latency.svg
ADDED
|
|
layer_norm/results/cells/combine.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "numpy",
|
| 5 |
+
# "torch==2.8.0",
|
| 6 |
+
# "kernels-benchmark-tools",
|
| 7 |
+
# "matplotlib",
|
| 8 |
+
# ]
|
| 9 |
+
#
|
| 10 |
+
# [tool.uv.sources]
|
| 11 |
+
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
|
| 12 |
+
# ///
|
| 13 |
+
from kernels_benchmark_tools.core.visuals import generate_combined_results
|
| 14 |
+
|
| 15 |
+
# Map display names to uvnote environment variables
|
| 16 |
+
cache_env_map = {
|
| 17 |
+
"PyTorch LayerNorm": "UVNOTE_FILE_TORCH_LAYER_NORM_BENCHMARK",
|
| 18 |
+
"HF Kernels LayerNorm": "UVNOTE_FILE_HF_KERNELS_LAYER_NORM_BENCHMARK",
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
# Generate combined results with visualization
|
| 22 |
+
generate_combined_results(
|
| 23 |
+
cache_env_map=cache_env_map,
|
| 24 |
+
output_filename="layer_norm.jsonl",
|
| 25 |
+
svg_filename="latency.svg"
|
| 26 |
+
)
|
layer_norm/results/combined_results.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
layer_norm/results/index.html
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /layer_norm/results</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /layer_norm/results</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='combined_results.html' class='file'>combined_results.html</a></li>
|
| 86 |
+
</ul>
|
| 87 |
+
</body>
|
| 88 |
+
</html>
|
rotary/impls/artifacts/benchmark/rotary.jsonl
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07538300002352116, "p50": 0.07777199999736695, "p90": 0.07795200002647107, "mean": 0.07717860000866494, "iqr": 0.0014790000477660215, "raw_times": [0.07777199999736695, 0.07647299997870505, 0.07795200002647107, 0.07831300001726049, 0.07538300002352116], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0837029999729566, "peak_bytes": 1720320, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.00153350830078125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09504299998752685, "p50": 0.09633299998768052, "p90": 0.09746300003143915, "mean": 0.0966769999877215, "iqr": 0.0013000000649299182, "raw_times": [0.09504299998752685, 0.09633299998768052, 0.09838299996545175, 0.09616299996650923, 0.09746300003143915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09918300003164404, "peak_bytes": 3440640, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.00154876708984375, "mse_q": 1.5854835510253906e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0929430000269349, "p50": 0.09560399996644264, "p90": 0.09620299999824056, "mean": 0.09600920000139013, "iqr": 0.0026899999738816405, "raw_times": [0.09620299999824056, 0.09560399996644264, 0.10178299999097362, 0.09351300002435892, 0.0929430000269349], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10062299998025992, "peak_bytes": 6832128, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09350300001642609, "p50": 0.09415400000989393, "p90": 0.09585299994796515, "mean": 0.09842139999136634, "iqr": 0.001959999963219161, "raw_times": [0.09350300001642609, 0.09585299994796515, 0.09415400000989393, 0.11470399999780057, 0.09389299998474598], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09742299999970783, "peak_bytes": 13664256, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.0015411376953125, "mse_q": 1.5854835510253906e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09248300000308518, "p50": 0.09347299999262759, "p90": 0.09500300001263895, "mean": 0.09405499998820233, "iqr": 0.0018000000636675395, "raw_times": [0.09248300000308518, 0.09500300001263895, 0.0961129999836885, 0.09347299999262759, 0.09320299994897141], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09855400003289105, "peak_bytes": 6881280, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09233299999777955, "p50": 0.09477300000071409, "p90": 0.09477400004698211, "mean": 0.09424540002100912, "iqr": 0.0021910000214120373, "raw_times": [0.09233299999777955, 0.09477400004698211, 0.09477300000071409, 0.09676400003399976, 0.09258300002557007], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09677399998508918, "peak_bytes": 13762560, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 7 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09216400002287628, "p50": 0.09306300000844203, "p90": 0.09349300000849325, "mean": 0.09324520001428027, "iqr": 0.0005399999736255268, "raw_times": [0.09216400002287628, 0.09306300000844203, 0.09455299999672206, 0.09349300000849325, 0.09295300003486773], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10914400002093316, "peak_bytes": 27328512, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.00153350830078125, "mse_q": 1.5854835510253906e-05, "mse_k": 1.5854835510253906e-05, "ref": "rotary_torch"}, "err": null}
|
| 8 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09248299994624176, "p50": 0.09334300000318763, "p90": 0.09355399998867142, "mean": 0.0935691999870869, "iqr": 0.00066100000140068, "raw_times": [0.09355399998867142, 0.09557300001006297, 0.09334300000318763, 0.09248299994624176, 0.09289299998727074], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09584299999687573, "peak_bytes": 54657024, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.00154876708984375, "mse_q": 1.621246337890625e-05, "mse_k": 1.621246337890625e-05, "ref": "rotary_torch"}, "err": null}
|
| 9 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09247299999515235, "p50": 0.09385300000985808, "p90": 0.09445400002050519, "mean": 0.09405140001490508, "iqr": 0.001121000025250396, "raw_times": [0.09247299999515235, 0.09445400002050519, 0.0933329999952548, 0.09385300000985808, 0.09614400005375501], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09844400000247333, "peak_bytes": 27525120, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 10 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09372300002041811, "p50": 0.094173000036335, "p90": 0.09575299998232367, "mean": 0.09506720000445057, "iqr": 0.0020299999619055598, "raw_times": [0.09796399996275795, 0.09575299998232367, 0.094173000036335, 0.09372300002041811, 0.09372300002041811], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09865399999853253, "peak_bytes": 55050240, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 11 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09140299999899071, "p50": 0.092913999992561, "p90": 0.09422299996231231, "mean": 0.09330119999049202, "iqr": 0.0015199999552351073, "raw_times": [0.09140299999899071, 0.09526299999151888, 0.092913999992561, 0.09422299996231231, 0.09270300000707721], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09514300001001175, "peak_bytes": 109314048, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 12 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09479400000600435, "p50": 0.09623299996519563, "p90": 0.09679300001153024, "mean": 0.09610519999796452, "iqr": 0.000919999990856013, "raw_times": [0.09587300002067423, 0.09679300001153024, 0.09479400000600435, 0.09623299996519563, 0.09683299998641814], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09740300004068558, "peak_bytes": 218628096, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 13 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09216300003345168, "p50": 0.09397300004820863, "p90": 0.09462299999540846, "mean": 0.09381320001011773, "iqr": 0.0016889999869817984, "raw_times": [0.09293400000842666, 0.09537299996509319, 0.09397300004820863, 0.09216300003345168, 0.09462299999540846], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10023300001194002, "peak_bytes": 68698112, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 14 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0913630000241028, "p50": 0.0930929999753971, "p90": 0.09448299999803567, "mean": 0.09361499999158696, "iqr": 0.0023500000452258973, "raw_times": [0.0913630000241028, 0.09700300000758943, 0.09213299995280977, 0.09448299999803567, 0.0930929999753971], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09703300003138793, "peak_bytes": 6848512, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.5974044799804688e-05, "ref": "rotary_torch"}, "err": null}
|
| 15 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0902330000371876, "p50": 0.09208300002683245, "p90": 0.0927039999965018, "mean": 0.0920254000220666, "iqr": 0.0007599999776175537, "raw_times": [0.0902330000371876, 0.09194400001888425, 0.09208300002683245, 0.09316300003092692, 0.0927039999965018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09501400000999638, "peak_bytes": 13647872, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 16 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09339300004285178, "p50": 0.09388300003365657, "p90": 0.09438299997555077, "mean": 0.09392300001991316, "iqr": 0.0009499999578110874, "raw_times": [0.09388300003365657, 0.09452300002976699, 0.09438299997555077, 0.09339300004285178, 0.09343300001773969], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09746399996402033, "peak_bytes": 27295744, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.0015411376953125, "mse_q": 1.621246337890625e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 17 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09369299999661962, "p50": 0.09495300002981821, "p90": 0.09641299999429975, "mean": 0.09557120000636132, "iqr": 0.001839999981712026, "raw_times": [0.09457300001258773, 0.09495300002981821, 0.09641299999429975, 0.09369299999661962, 0.0982239999984813], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09584299999687573, "peak_bytes": 13697024, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 18 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09207300001889962, "p50": 0.09441299999934927, "p90": 0.09493300001395255, "mean": 0.09826719999637135, "iqr": 0.0009000000318337698, "raw_times": [0.09207300001889962, 0.11588399996753651, 0.09441299999934927, 0.09493300001395255, 0.09403299998211878], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09803300002886317, "peak_bytes": 27394048, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.00153350830078125, "mse_q": 1.5854835510253906e-05, "mse_k": 1.5974044799804688e-05, "ref": "rotary_torch"}, "err": null}
|
| 19 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09320300000581483, "p50": 0.09509299997034759, "p90": 0.0968430000511944, "mean": 0.0957752000090295, "iqr": 0.0027100000465907215, "raw_times": [0.0968430000511944, 0.09413300000460367, 0.09509299997034759, 0.09960400001318703, 0.09320300000581483], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09855399997604763, "peak_bytes": 54591488, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 20 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0926630000321893, "p50": 0.09438299997555077, "p90": 0.09443299995837151, "mean": 0.09837319998950989, "iqr": 0.0016799999684735667, "raw_times": [0.09275299998989794, 0.09438299997555077, 0.09443299995837151, 0.0926630000321893, 0.1176339999915399], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09754300003805838, "peak_bytes": 109182976, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 21 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09100299996589456, "p50": 0.09359299997413473, "p90": 0.09518299998489965, "mean": 0.09356119999210932, "iqr": 0.0025699999355310865, "raw_times": [0.09100299996589456, 0.09518299998489965, 0.09261300004936857, 0.09541399998624911, 0.09359299997413473], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.11789399997041983, "peak_bytes": 54788096, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.125, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 22 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09348399998998502, "p50": 0.09433299999273004, "p90": 0.09580299996514441, "mean": 0.09473540000044522, "iqr": 0.0016299999288094114, "raw_times": [0.09433299999273004, 0.09580299996514441, 0.09588400001803166, 0.09348399998998502, 0.094173000036335], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09657300000753821, "peak_bytes": 109576192, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 23 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0974529999666629, "p50": 0.09860399995886837, "p90": 0.09875400002101742, "mean": 0.09851759998582565, "iqr": 0.0008510000384376326, "raw_times": [0.09790299998257979, 0.0974529999666629, 0.09860399995886837, 0.09875400002101742, 0.0998739999999998], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10540400000991212, "peak_bytes": 218365952, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|
| 24 |
+
{"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2809499999898435, "p50": 0.28135000002293964, "p90": 0.2840199999809556, "mean": 0.28239179999900443, "iqr": 0.0029809999659846653, "raw_times": [0.2809499999898435, 0.28459999998631247, 0.2840199999809556, 0.28103900001497095, 0.28135000002293964], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.28416999998626125, "peak_bytes": 436731904, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.125, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
|