Upload folder using huggingface_hub
Browse files- activation/impls/cells/benchmark.py +64 -0
- activation/impls/cells/nv.py +2 -0
- activation/impls/compiled_swiglu.html +0 -0
- activation/impls/hf_kernels_swiglu.html +0 -0
- activation/impls/index.html +90 -0
- activation/impls/torch_swiglu.html +0 -0
- activation/index.html +89 -0
- activation/results/index.html +88 -0
- flash_attn/impls/cells/benchmark.py +18 -21
- flash_attn/impls/cells/benchmark_default.py +2 -1
- flash_attn/impls/compiled_variants.html +10 -230
- flash_attn/impls/flash_attention.html +16 -89
- flash_attn/impls/hf_kernels_flash_attn.html +12 -79
- flash_attn/impls/hf_kernels_flash_attn3.html +10 -77
- flash_attn/impls/mem_efficient_attention.html +10 -71
- flash_attn/impls/sage_attention.html +10 -84
- flash_attn/impls/xformers.html +10 -73
- index.html +2 -0
- layer_norm/impls/cells/benchmark.py +62 -0
- layer_norm/impls/cells/nv.py +2 -0
- layer_norm/impls/hf_kernels_layer_norm.html +0 -0
- layer_norm/impls/index.html +89 -0
- layer_norm/impls/torch_layer_norm.html +0 -0
- layer_norm/index.html +89 -0
- layer_norm/results/index.html +88 -0
activation/impls/cells/benchmark.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "numpy",
|
| 5 |
+
# "torch",
|
| 6 |
+
# "kernels-benchmark-tools",
|
| 7 |
+
# "kernels",
|
| 8 |
+
# ]
|
| 9 |
+
#
|
| 10 |
+
# [tool.uv.sources]
|
| 11 |
+
# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
|
| 12 |
+
# ///
|
| 13 |
+
import torch
|
| 14 |
+
import sys
|
| 15 |
+
import kernels_benchmark_tools as kbt
|
| 16 |
+
from kernels import get_kernel
|
| 17 |
+
|
| 18 |
+
# Load the activation kernel
|
| 19 |
+
activation = get_kernel("kernels-community/activation")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def hf_kernels_swiglu(input_tensor):
|
| 23 |
+
"""HuggingFace Kernels SwiGLU implementation"""
|
| 24 |
+
hidden_dim = input_tensor.shape[-1] // 2
|
| 25 |
+
out_shape = input_tensor.shape[:-1] + (hidden_dim,)
|
| 26 |
+
out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
|
| 27 |
+
return activation.silu_and_mul(out, input_tensor)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Register the implementation
|
| 31 |
+
kbt.add(
|
| 32 |
+
"hf_kernels_swiglu",
|
| 33 |
+
hf_kernels_swiglu,
|
| 34 |
+
tags={"family": "hf-kernels", "backend": "triton", "compile": "none"},
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
if __name__ == "__main__":
|
| 38 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 39 |
+
|
| 40 |
+
if device == "cpu":
|
| 41 |
+
print("HF Kernels SwiGLU requires CUDA - skipping benchmark")
|
| 42 |
+
sys.exit(0)
|
| 43 |
+
|
| 44 |
+
dtype = "bfloat16"
|
| 45 |
+
|
| 46 |
+
# Generate workloads - using a subset for faster testing
|
| 47 |
+
wl = list(kbt.activation.llama_workloads(dtype=dtype))[:3] # First 3 workloads
|
| 48 |
+
|
| 49 |
+
print(f"Running SwiGLU benchmarks on {device} with {dtype}")
|
| 50 |
+
print(f"Testing {len(wl)} workloads")
|
| 51 |
+
|
| 52 |
+
# Run benchmark
|
| 53 |
+
kbt.run(
|
| 54 |
+
wl,
|
| 55 |
+
jsonl="activation.jsonl",
|
| 56 |
+
reps=5,
|
| 57 |
+
warmup=2,
|
| 58 |
+
gen=kbt.activation.gen_inputs,
|
| 59 |
+
ref=kbt.activation.ref_swiglu,
|
| 60 |
+
cmp=kbt.activation.cmp_allclose,
|
| 61 |
+
profile_trace=True
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
kbt.summarize(["activation.jsonl"])
|
activation/impls/cells/nv.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
|
activation/impls/compiled_swiglu.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
activation/impls/hf_kernels_swiglu.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
activation/impls/index.html
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /activation/impls</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /activation/impls</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='compiled_swiglu.html' class='file'>compiled_swiglu.html</a></li>
|
| 86 |
+
<li><a href='hf_kernels_swiglu.html' class='file'>hf_kernels_swiglu.html</a></li>
|
| 87 |
+
<li><a href='torch_swiglu.html' class='file'>torch_swiglu.html</a></li>
|
| 88 |
+
</ul>
|
| 89 |
+
</body>
|
| 90 |
+
</html>
|
activation/impls/torch_swiglu.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
activation/index.html
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /activation</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /activation</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='impls/index.html' class='dir'>impls/</a></li>
|
| 86 |
+
<li><a href='results/index.html' class='dir'>results/</a></li>
|
| 87 |
+
</ul>
|
| 88 |
+
</body>
|
| 89 |
+
</html>
|
activation/results/index.html
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /activation/results</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /activation/results</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='combined_results.html' class='file'>combined_results.html</a></li>
|
| 86 |
+
</ul>
|
| 87 |
+
</body>
|
| 88 |
+
</html>
|
flash_attn/impls/cells/benchmark.py
CHANGED
|
@@ -4,46 +4,42 @@
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
-
# "
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
| 11 |
-
# kernels-benchmark-tools = {
|
| 12 |
# ///
|
| 13 |
import torch
|
| 14 |
import sys
|
| 15 |
import os
|
| 16 |
import kernels_benchmark_tools as kbt
|
| 17 |
-
|
| 18 |
|
| 19 |
-
hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn", revision="v0.0.2")
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
return
|
| 25 |
|
| 26 |
|
| 27 |
kbt.add(
|
| 28 |
-
"
|
| 29 |
-
|
| 30 |
-
tags={"family": "
|
| 31 |
)
|
| 32 |
|
| 33 |
if __name__ == "__main__":
|
| 34 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 35 |
-
|
| 36 |
-
if device == "cpu":
|
| 37 |
-
print("HF Kernels Flash Attention requires CUDA - skipping benchmark")
|
| 38 |
-
sys.exit(0)
|
| 39 |
-
|
| 40 |
-
dtype = "bfloat16"
|
| 41 |
|
| 42 |
# Flux-like workloads
|
| 43 |
-
base = 1024
|
| 44 |
-
flux_sizes =
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
| 47 |
|
| 48 |
wl = []
|
| 49 |
for L in flux_sizes:
|
|
@@ -68,5 +64,6 @@ if __name__ == "__main__":
|
|
| 68 |
gen=kbt.attn.gen_qkv,
|
| 69 |
ref=kbt.attn.ref_math,
|
| 70 |
cmp=kbt.attn.cmp_allclose,
|
|
|
|
| 71 |
)
|
| 72 |
kbt.summarize(["attn.jsonl"])
|
|
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
+
# "xformers",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
| 11 |
+
# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
|
| 12 |
# ///
|
| 13 |
import torch
|
| 14 |
import sys
|
| 15 |
import os
|
| 16 |
import kernels_benchmark_tools as kbt
|
| 17 |
+
import xformers.ops as xops
|
| 18 |
|
|
|
|
| 19 |
|
| 20 |
+
def xformers_attention(q, k, v):
|
| 21 |
+
"""xFormers memory efficient attention"""
|
| 22 |
+
# xFormers expects [batch, seq_len, heads, head_dim]
|
| 23 |
+
return xops.memory_efficient_attention(q, k, v)
|
| 24 |
|
| 25 |
|
| 26 |
kbt.add(
|
| 27 |
+
"xformers_meff",
|
| 28 |
+
xformers_attention,
|
| 29 |
+
tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
|
| 30 |
)
|
| 31 |
|
| 32 |
if __name__ == "__main__":
|
| 33 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 34 |
+
dtype = "float32" if device == "cpu" else "bfloat16"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
# Flux-like workloads
|
| 37 |
+
base = 1024 if device == "cuda" else 512
|
| 38 |
+
flux_sizes = (
|
| 39 |
+
[128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
|
| 40 |
+
)
|
| 41 |
+
heads = 24 if device == "cuda" else 8
|
| 42 |
+
head_dim = 128 if device == "cuda" else 64
|
| 43 |
|
| 44 |
wl = []
|
| 45 |
for L in flux_sizes:
|
|
|
|
| 64 |
gen=kbt.attn.gen_qkv,
|
| 65 |
ref=kbt.attn.ref_math,
|
| 66 |
cmp=kbt.attn.cmp_allclose,
|
| 67 |
+
profile_trace=True
|
| 68 |
)
|
| 69 |
kbt.summarize(["attn.jsonl"])
|
flash_attn/impls/cells/benchmark_default.py
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
# ]
|
| 8 |
#
|
| 9 |
# [tool.uv.sources]
|
| 10 |
-
# kernels-benchmark-tools = {
|
| 11 |
# ///
|
| 12 |
import torch
|
| 13 |
import sys
|
|
@@ -66,5 +66,6 @@ if __name__ == "__main__":
|
|
| 66 |
gen=kbt.attn.gen_qkv,
|
| 67 |
ref=kbt.attn.ref_math,
|
| 68 |
cmp=kbt.attn.cmp_allclose,
|
|
|
|
| 69 |
)
|
| 70 |
kbt.summarize(["attn_default.jsonl"])
|
|
|
|
| 7 |
# ]
|
| 8 |
#
|
| 9 |
# [tool.uv.sources]
|
| 10 |
+
# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
|
| 11 |
# ///
|
| 12 |
import torch
|
| 13 |
import sys
|
|
|
|
| 66 |
gen=kbt.attn.gen_qkv,
|
| 67 |
ref=kbt.attn.ref_math,
|
| 68 |
cmp=kbt.attn.cmp_allclose,
|
| 69 |
+
profile_trace=True
|
| 70 |
)
|
| 71 |
kbt.summarize(["attn_default.jsonl"])
|
flash_attn/impls/compiled_variants.html
CHANGED
|
@@ -3829,7 +3829,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
-
Linux x86_64 | Linux-5.
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
|
@@ -3837,20 +3837,20 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3837 |
<h1>Torch Compile Variants!</h1>
|
| 3838 |
<p>This file benchmarks Flash Attention with different torch.compile modes.</p>
|
| 3839 |
<h2>Flash Attention with torch.compile(mode="default")</h2>
|
| 3840 |
-
<div class="cell" id="cell-benchmark_default">
|
| 3841 |
<div class="cell-header">
|
| 3842 |
<span class="collapse-indicators">
|
| 3843 |
<span onclick="toggleCode('benchmark_default')" style="cursor: pointer;">▼ code</span>
|
| 3844 |
<span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
|
| 3845 |
-
<span id="uv-indicator-benchmark_default"
|
| 3846 |
</span> |
|
| 3847 |
-
Cell: benchmark_default |
|
| 3848 |
| <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
|
| 3849 |
<button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
|
| 3850 |
<a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
|
| 3851 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/compiled_variants.md" target="_blank" class="github-btn">GitHub</a>
|
| 3852 |
</div>
|
| 3853 |
-
<div id="code-benchmark_default" class="cell-code" data-lines="
|
| 3854 |
<div class="code-wrap">
|
| 3855 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3856 |
<span class="c1"># requires-python = ">=3.10"</span>
|
|
@@ -3861,7 +3861,7 @@ Cell: benchmark_default | 45.83s
|
|
| 3861 |
<span class="c1"># ]</span>
|
| 3862 |
<span class="c1">#</span>
|
| 3863 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3864 |
-
<span class="c1"># kernels-benchmark-tools = {
|
| 3865 |
<span class="c1"># ///</span>
|
| 3866 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3867 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
@@ -3920,6 +3920,7 @@ Cell: benchmark_default | 45.83s
|
|
| 3920 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3921 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3922 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
|
|
|
| 3923 |
<span class="p">)</span>
|
| 3924 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn_default.jsonl"</span><span class="p">])</span>
|
| 3925 |
</pre></div>
|
|
@@ -3928,235 +3929,14 @@ Cell: benchmark_default | 45.83s
|
|
| 3928 |
</div>
|
| 3929 |
</div>
|
| 3930 |
<div id="output-benchmark_default" class="cell-output">
|
| 3931 |
-
<div class="cell-
|
| 3932 |
-
|
| 3933 |
-
|
| 3934 |
-
torch_flash_compiled_default flux_L320 0.54 True
|
| 3935 |
-
torch_flash_compiled_default flux_L384 0.59 True
|
| 3936 |
-
torch_flash_compiled_default flux_L448 0.61 True
|
| 3937 |
-
torch_flash_compiled_default flux_L512 0.64 True
|
| 3938 |
-
</div>
|
| 3939 |
-
<div class="uv-install-logs" id="uv-logs-benchmark_default">
|
| 3940 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3941 |
-
<div class="uv-logs-content" style="display: none;">
|
| 3942 |
-
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3943 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3944 |
-
Downloading matplotlib (8.3MiB)
|
| 3945 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3946 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3947 |
-
Downloading kiwisolver (1.4MiB)
|
| 3948 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3949 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3950 |
-
Downloading fonttools (4.7MiB)
|
| 3951 |
-
Downloading triton (148.4MiB)
|
| 3952 |
-
Downloading numpy (15.9MiB)
|
| 3953 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3954 |
-
Downloading torch (846.8MiB)
|
| 3955 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3956 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3957 |
-
Downloading setuptools (1.1MiB)
|
| 3958 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3959 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3960 |
-
Downloading networkx (1.9MiB)
|
| 3961 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3962 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3963 |
-
Downloading sympy (6.0MiB)
|
| 3964 |
-
Downloading pillow (6.3MiB)
|
| 3965 |
-
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3966 |
-
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3967 |
-
Downloading nvidia-cufile-cu12
|
| 3968 |
-
Downloading kiwisolver
|
| 3969 |
-
Downloading setuptools
|
| 3970 |
-
Downloading fonttools
|
| 3971 |
-
Downloading networkx
|
| 3972 |
-
Downloading pillow
|
| 3973 |
-
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3974 |
-
Downloading matplotlib
|
| 3975 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 3976 |
-
Downloading numpy
|
| 3977 |
-
Downloading sympy
|
| 3978 |
-
Downloading nvidia-nvjitlink-cu12
|
| 3979 |
-
Downloading nvidia-curand-cu12
|
| 3980 |
-
Downloading nvidia-cuda-nvrtc-cu12
|
| 3981 |
-
Downloading triton
|
| 3982 |
-
Downloading nvidia-cufft-cu12
|
| 3983 |
-
Downloading nvidia-cusolver-cu12
|
| 3984 |
-
Downloading nvidia-cusparse-cu12
|
| 3985 |
-
Downloading nvidia-cusparselt-cu12
|
| 3986 |
-
Downloading nvidia-nccl-cu12
|
| 3987 |
-
Downloading nvidia-cublas-cu12
|
| 3988 |
-
Downloading nvidia-cudnn-cu12
|
| 3989 |
-
Downloading torch
|
| 3990 |
-
Installed 37 packages in 203ms
|
| 3991 |
-
</div>
|
| 3992 |
-
</div>
|
| 3993 |
-
<div class="cell-artifacts">
|
| 3994 |
-
<h4>Artifacts:</h4>
|
| 3995 |
-
<a href="artifacts/benchmark_default/attn_default.jsonl" class="artifact" target="_blank">attn_default.jsonl</a>
|
| 3996 |
</div>
|
| 3997 |
</div>
|
| 3998 |
</div>
|
| 3999 |
|
| 4000 |
<h2>Flash Attention with torch.compile(mode="max-autotune")</h2>
|
| 4001 |
-
<div class="cell" id="cell-benchmark_max_autotune">
|
| 4002 |
-
<div class="cell-header">
|
| 4003 |
-
<span class="collapse-indicators">
|
| 4004 |
-
<span onclick="toggleCode('benchmark_max_autotune')" style="cursor: pointer;">▼ code</span>
|
| 4005 |
-
<span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
|
| 4006 |
-
<span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4007 |
-
</span> |
|
| 4008 |
-
Cell: benchmark_max_autotune | 48.72s
|
| 4009 |
-
| <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
|
| 4010 |
-
<button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
|
| 4011 |
-
<a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
|
| 4012 |
-
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/compiled_variants.md" target="_blank" class="github-btn">GitHub</a>
|
| 4013 |
-
</div>
|
| 4014 |
-
<div id="code-benchmark_max_autotune" class="cell-code" data-lines="70">
|
| 4015 |
-
<div class="code-wrap">
|
| 4016 |
-
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 4017 |
-
<span class="c1"># requires-python = ">=3.10"</span>
|
| 4018 |
-
<span class="c1"># dependencies = [</span>
|
| 4019 |
-
<span class="c1"># "numpy",</span>
|
| 4020 |
-
<span class="c1"># "torch",</span>
|
| 4021 |
-
<span class="c1"># "kernels-benchmark-tools",</span>
|
| 4022 |
-
<span class="c1"># ]</span>
|
| 4023 |
-
<span class="c1">#</span>
|
| 4024 |
-
<span class="c1"># [tool.uv.sources]</span>
|
| 4025 |
-
<span class="c1"># kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }</span>
|
| 4026 |
-
<span class="c1"># ///</span>
|
| 4027 |
-
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 4028 |
-
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
| 4029 |
-
<span class="kn">import</span><span class="w"> </span><span class="nn">os</span>
|
| 4030 |
-
<span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
|
| 4031 |
-
|
| 4032 |
-
|
| 4033 |
-
<span class="k">def</span><span class="w"> </span><span class="nf">torch_flash_base</span><span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">):</span>
|
| 4034 |
-
<span class="n">qt</span><span class="p">,</span> <span class="n">kt</span><span class="p">,</span> <span class="n">vt</span> <span class="o">=</span> <span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">contiguous</span><span class="p">()</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">))</span>
|
| 4035 |
-
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">sdpa_kernel</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">SDPBackend</span><span class="o">.</span><span class="n">FLASH_ATTENTION</span><span class="p">):</span>
|
| 4036 |
-
<span class="n">o</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">functional</span><span class="o">.</span><span class="n">scaled_dot_product_attention</span><span class="p">(</span><span class="n">qt</span><span class="p">,</span> <span class="n">kt</span><span class="p">,</span> <span class="n">vt</span><span class="p">)</span>
|
| 4037 |
-
<span class="k">return</span> <span class="n">o</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">contiguous</span><span class="p">()</span>
|
| 4038 |
-
|
| 4039 |
-
|
| 4040 |
-
<span class="c1"># Compile with max-autotune mode</span>
|
| 4041 |
-
<span class="n">compiled_flash_max_autotune</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">torch_flash_base</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">"max-autotune"</span><span class="p">,</span> <span class="n">fullgraph</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">dynamic</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
|
| 4042 |
-
|
| 4043 |
-
<span class="n">kbt</span><span class="o">.</span><span class="n">add</span><span class="p">(</span>
|
| 4044 |
-
<span class="s2">"torch_flash_compiled_max_autotune"</span><span class="p">,</span>
|
| 4045 |
-
<span class="n">compiled_flash_max_autotune</span><span class="p">,</span>
|
| 4046 |
-
<span class="n">tags</span><span class="o">=</span><span class="p">{</span><span class="s2">"family"</span><span class="p">:</span> <span class="s2">"torch-sdpa"</span><span class="p">,</span> <span class="s2">"backend"</span><span class="p">:</span> <span class="s2">"FLASH"</span><span class="p">,</span> <span class="s2">"compile"</span><span class="p">:</span> <span class="s2">"max-autotune"</span><span class="p">},</span>
|
| 4047 |
-
<span class="p">)</span>
|
| 4048 |
-
|
| 4049 |
-
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span>
|
| 4050 |
-
<span class="n">device</span> <span class="o">=</span> <span class="s2">"cuda"</span> <span class="k">if</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">is_available</span><span class="p">()</span> <span class="k">else</span> <span class="s2">"cpu"</span>
|
| 4051 |
-
<span class="n">dtype</span> <span class="o">=</span> <span class="s2">"float32"</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">"cpu"</span> <span class="k">else</span> <span class="s2">"bfloat16"</span>
|
| 4052 |
-
|
| 4053 |
-
<span class="c1"># Flux-like workloads</span>
|
| 4054 |
-
<span class="n">base</span> <span class="o">=</span> <span class="mi">1024</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">"cuda"</span> <span class="k">else</span> <span class="mi">512</span>
|
| 4055 |
-
<span class="n">flux_sizes</span> <span class="o">=</span> <span class="p">(</span>
|
| 4056 |
-
<span class="p">[</span><span class="mi">128</span><span class="p">,</span> <span class="mi">256</span><span class="p">,</span> <span class="mi">320</span><span class="p">,</span> <span class="mi">384</span><span class="p">,</span> <span class="mi">448</span><span class="p">,</span> <span class="mi">512</span><span class="p">]</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">"cuda"</span> <span class="k">else</span> <span class="p">[</span><span class="mi">64</span><span class="p">,</span> <span class="mi">128</span><span class="p">,</span> <span class="mi">192</span><span class="p">,</span> <span class="mi">256</span><span class="p">]</span>
|
| 4057 |
-
<span class="p">)</span>
|
| 4058 |
-
<span class="n">heads</span> <span class="o">=</span> <span class="mi">24</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">"cuda"</span> <span class="k">else</span> <span class="mi">8</span>
|
| 4059 |
-
<span class="n">head_dim</span> <span class="o">=</span> <span class="mi">128</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">"cuda"</span> <span class="k">else</span> <span class="mi">64</span>
|
| 4060 |
-
|
| 4061 |
-
<span class="n">wl</span> <span class="o">=</span> <span class="p">[]</span>
|
| 4062 |
-
<span class="k">for</span> <span class="n">L</span> <span class="ow">in</span> <span class="n">flux_sizes</span><span class="p">:</span>
|
| 4063 |
-
<span class="n">wl</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
|
| 4064 |
-
<span class="p">{</span>
|
| 4065 |
-
<span class="s2">"name"</span><span class="p">:</span> <span class="sa">f</span><span class="s2">"flux_L</span><span class="si">{</span><span class="n">L</span><span class="si">}</span><span class="s2">"</span><span class="p">,</span>
|
| 4066 |
-
<span class="s2">"batch"</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span>
|
| 4067 |
-
<span class="s2">"seq_len"</span><span class="p">:</span> <span class="n">base</span> <span class="o">+</span> <span class="n">L</span><span class="p">,</span>
|
| 4068 |
-
<span class="s2">"heads"</span><span class="p">:</span> <span class="n">heads</span><span class="p">,</span>
|
| 4069 |
-
<span class="s2">"head_dim"</span><span class="p">:</span> <span class="n">head_dim</span><span class="p">,</span>
|
| 4070 |
-
<span class="s2">"dtype"</span><span class="p">:</span> <span class="n">dtype</span><span class="p">,</span>
|
| 4071 |
-
<span class="s2">"device"</span><span class="p">:</span> <span class="n">device</span><span class="p">,</span>
|
| 4072 |
-
<span class="s2">"seed"</span><span class="p">:</span> <span class="mi">0</span><span class="p">,</span>
|
| 4073 |
-
<span class="p">}</span>
|
| 4074 |
-
<span class="p">)</span>
|
| 4075 |
-
|
| 4076 |
-
<span class="n">kbt</span><span class="o">.</span><span class="n">run</span><span class="p">(</span>
|
| 4077 |
-
<span class="n">wl</span><span class="p">,</span>
|
| 4078 |
-
<span class="n">jsonl</span><span class="o">=</span><span class="s2">"attn_max_autotune.jsonl"</span><span class="p">,</span>
|
| 4079 |
-
<span class="n">reps</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
|
| 4080 |
-
<span class="n">warmup</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
|
| 4081 |
-
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 4082 |
-
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 4083 |
-
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
| 4084 |
-
<span class="p">)</span>
|
| 4085 |
-
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn_max_autotune.jsonl"</span><span class="p">])</span>
|
| 4086 |
-
</pre></div>
|
| 4087 |
-
|
| 4088 |
-
<div class="code-line-highlight" id="line-highlight-benchmark_max_autotune"></div>
|
| 4089 |
-
</div>
|
| 4090 |
-
</div>
|
| 4091 |
-
<div id="output-benchmark_max_autotune" class="cell-output">
|
| 4092 |
-
<div class="cell-stdout">impl wl p50(ms) ok
|
| 4093 |
-
torch_flash_compiled_max_autotune flux_L128 0.38 True
|
| 4094 |
-
torch_flash_compiled_max_autotune flux_L256 0.55 True
|
| 4095 |
-
torch_flash_compiled_max_autotune flux_L320 0.61 True
|
| 4096 |
-
torch_flash_compiled_max_autotune flux_L384 0.66 True
|
| 4097 |
-
torch_flash_compiled_max_autotune flux_L448 0.70 True
|
| 4098 |
-
torch_flash_compiled_max_autotune flux_L512 0.76 True
|
| 4099 |
-
</div>
|
| 4100 |
-
<div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
|
| 4101 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4102 |
-
<div class="uv-logs-content" style="display: none;">
|
| 4103 |
-
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 4104 |
-
Downloading setuptools (1.1MiB)
|
| 4105 |
-
Downloading pillow (6.3MiB)
|
| 4106 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4107 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4108 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4109 |
-
Downloading networkx (1.9MiB)
|
| 4110 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4111 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4112 |
-
Downloading kiwisolver (1.4MiB)
|
| 4113 |
-
Downloading fonttools (4.7MiB)
|
| 4114 |
-
Downloading numpy (15.9MiB)
|
| 4115 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4116 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4117 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4118 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4119 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4120 |
-
Downloading matplotlib (8.3MiB)
|
| 4121 |
-
Downloading torch (846.8MiB)
|
| 4122 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4123 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4124 |
-
Downloading triton (148.4MiB)
|
| 4125 |
-
Downloading sympy (6.0MiB)
|
| 4126 |
-
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 4127 |
-
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 4128 |
-
Downloading nvidia-cufile-cu12
|
| 4129 |
-
Downloading kiwisolver
|
| 4130 |
-
Downloading setuptools
|
| 4131 |
-
Downloading fonttools
|
| 4132 |
-
Downloading networkx
|
| 4133 |
-
Downloading pillow
|
| 4134 |
-
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 4135 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 4136 |
-
Downloading matplotlib
|
| 4137 |
-
Downloading numpy
|
| 4138 |
-
Downloading sympy
|
| 4139 |
-
Downloading nvidia-nvjitlink-cu12
|
| 4140 |
-
Downloading nvidia-curand-cu12
|
| 4141 |
-
Downloading nvidia-cuda-nvrtc-cu12
|
| 4142 |
-
Downloading triton
|
| 4143 |
-
Downloading nvidia-cufft-cu12
|
| 4144 |
-
Downloading nvidia-cusolver-cu12
|
| 4145 |
-
Downloading nvidia-cusparse-cu12
|
| 4146 |
-
Downloading nvidia-cusparselt-cu12
|
| 4147 |
-
Downloading nvidia-nccl-cu12
|
| 4148 |
-
Downloading nvidia-cudnn-cu12
|
| 4149 |
-
Downloading nvidia-cublas-cu12
|
| 4150 |
-
Downloading torch
|
| 4151 |
-
Installed 37 packages in 208ms
|
| 4152 |
-
</div>
|
| 4153 |
-
</div>
|
| 4154 |
-
<div class="cell-artifacts">
|
| 4155 |
-
<h4>Artifacts:</h4>
|
| 4156 |
-
<a href="artifacts/benchmark_max_autotune/attn_max_autotune.jsonl" class="artifact" target="_blank">attn_max_autotune.jsonl</a>
|
| 4157 |
-
</div>
|
| 4158 |
-
</div>
|
| 4159 |
-
</div>
|
| 4160 |
</div>
|
| 4161 |
|
| 4162 |
</body>
|
|
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
+
Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
|
|
|
| 3837 |
<h1>Torch Compile Variants!</h1>
|
| 3838 |
<p>This file benchmarks Flash Attention with different torch.compile modes.</p>
|
| 3839 |
<h2>Flash Attention with torch.compile(mode="default")</h2>
|
| 3840 |
+
<div class="cell cell-failed" id="cell-benchmark_default">
|
| 3841 |
<div class="cell-header">
|
| 3842 |
<span class="collapse-indicators">
|
| 3843 |
<span onclick="toggleCode('benchmark_default')" style="cursor: pointer;">▼ code</span>
|
| 3844 |
<span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
|
| 3845 |
+
<span id="uv-indicator-benchmark_default" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3846 |
</span> |
|
| 3847 |
+
Cell: benchmark_default | 0.02s | FAILED
|
| 3848 |
| <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
|
| 3849 |
<button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
|
| 3850 |
<a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
|
| 3851 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/compiled_variants.md" target="_blank" class="github-btn">GitHub</a>
|
| 3852 |
</div>
|
| 3853 |
+
<div id="code-benchmark_default" class="cell-code" data-lines="71">
|
| 3854 |
<div class="code-wrap">
|
| 3855 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3856 |
<span class="c1"># requires-python = ">=3.10"</span>
|
|
|
|
| 3861 |
<span class="c1"># ]</span>
|
| 3862 |
<span class="c1">#</span>
|
| 3863 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3864 |
+
<span class="c1"># kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }</span>
|
| 3865 |
<span class="c1"># ///</span>
|
| 3866 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3867 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
|
|
| 3920 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3921 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3922 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
| 3923 |
+
<span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
|
| 3924 |
<span class="p">)</span>
|
| 3925 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn_default.jsonl"</span><span class="p">])</span>
|
| 3926 |
</pre></div>
|
|
|
|
| 3929 |
</div>
|
| 3930 |
</div>
|
| 3931 |
<div id="output-benchmark_default" class="cell-output">
|
| 3932 |
+
<div class="cell-stderr"> × Failed to resolve script requirement
|
| 3933 |
+
╰─▶ Distribution not found at:
|
| 3934 |
+
file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3935 |
</div>
|
| 3936 |
</div>
|
| 3937 |
</div>
|
| 3938 |
|
| 3939 |
<h2>Flash Attention with torch.compile(mode="max-autotune")</h2>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3940 |
</div>
|
| 3941 |
|
| 3942 |
</body>
|
flash_attn/impls/flash_attention.html
CHANGED
|
@@ -3829,7 +3829,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
-
Linux x86_64 | Linux-5.
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
|
@@ -3843,7 +3843,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3843 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: nv |
|
| 3847 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3849 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3860,34 +3860,22 @@ Cell: nv | 4.06s
|
|
| 3860 |
</div>
|
| 3861 |
</div>
|
| 3862 |
<div id="output-nv" class="cell-output">
|
| 3863 |
-
<div class="cell-stdout">
|
| 3864 |
+-----------------------------------------------------------------------------------------+
|
| 3865 |
-
| NVIDIA-SMI
|
| 3866 |
|-----------------------------------------+------------------------+----------------------+
|
| 3867 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3868 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3869 |
| | | MIG M. |
|
| 3870 |
|=========================================+========================+======================|
|
| 3871 |
-
| 0 NVIDIA
|
| 3872 |
-
| N/A
|
| 3873 |
-
| | | N/A |
|
| 3874 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3875 |
-
| 1 NVIDIA L4 Off | 00000000:3A:00.0 Off | 0 |
|
| 3876 |
-
| N/A 41C P0 27W / 72W | 1MiB / 23034MiB | 2% Default |
|
| 3877 |
-
| | | N/A |
|
| 3878 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3879 |
-
| 2 NVIDIA L4 Off | 00000000:3C:00.0 Off | 0 |
|
| 3880 |
-
| N/A 44C P0 29W / 72W | 1MiB / 23034MiB | 2% Default |
|
| 3881 |
-
| | | N/A |
|
| 3882 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3883 |
-
| 3 NVIDIA L4 Off | 00000000:3E:00.0 Off | 0 |
|
| 3884 |
-
| N/A 42C P0 29W / 72W | 1MiB / 23034MiB | 2% Default |
|
| 3885 |
| | | N/A |
|
| 3886 |
+-----------------------------------------+------------------------+----------------------+
|
| 3887 |
|
| 3888 |
+-----------------------------------------------------------------------------------------+
|
| 3889 |
| Processes: |
|
| 3890 |
-
| GPU GI CI
|
| 3891 |
| ID ID Usage |
|
| 3892 |
|=========================================================================================|
|
| 3893 |
| No running processes found |
|
|
@@ -3898,20 +3886,20 @@ Cell: nv | 4.06s
|
|
| 3898 |
</div>
|
| 3899 |
|
| 3900 |
<h2>Flash Attention Benchmark</h2>
|
| 3901 |
-
<div class="cell" id="cell-benchmark">
|
| 3902 |
<div class="cell-header">
|
| 3903 |
<span class="collapse-indicators">
|
| 3904 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3905 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3906 |
-
<span id="uv-indicator-benchmark"
|
| 3907 |
</span> |
|
| 3908 |
-
Cell: benchmark |
|
| 3909 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3910 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3911 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3912 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/flash_attention.md" target="_blank" class="github-btn">GitHub</a>
|
| 3913 |
</div>
|
| 3914 |
-
<div id="code-benchmark" class="cell-code" data-lines="
|
| 3915 |
<div class="code-wrap">
|
| 3916 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3917 |
<span class="c1"># requires-python = ">=3.10"</span>
|
|
@@ -3922,7 +3910,7 @@ Cell: benchmark | 38.14s
|
|
| 3922 |
<span class="c1"># ]</span>
|
| 3923 |
<span class="c1">#</span>
|
| 3924 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3925 |
-
<span class="c1"># kernels-benchmark-tools = {
|
| 3926 |
<span class="c1"># ///</span>
|
| 3927 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3928 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
@@ -3977,6 +3965,7 @@ Cell: benchmark | 38.14s
|
|
| 3977 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3978 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3979 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
|
|
|
| 3980 |
<span class="p">)</span>
|
| 3981 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn.jsonl"</span><span class="p">])</span>
|
| 3982 |
</pre></div>
|
|
@@ -3985,71 +3974,9 @@ Cell: benchmark | 38.14s
|
|
| 3985 |
</div>
|
| 3986 |
</div>
|
| 3987 |
<div id="output-benchmark" class="cell-output">
|
| 3988 |
-
<div class="cell-
|
| 3989 |
-
|
| 3990 |
-
|
| 3991 |
-
torch_flash_ma flux_L320 0.55 True
|
| 3992 |
-
torch_flash_ma flux_L384 0.59 True
|
| 3993 |
-
torch_flash_ma flux_L448 0.64 True
|
| 3994 |
-
torch_flash_ma flux_L512 0.68 True
|
| 3995 |
-
</div>
|
| 3996 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3997 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3998 |
-
<div class="uv-logs-content" style="display: none;">
|
| 3999 |
-
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 4000 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4001 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4002 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4003 |
-
Downloading fonttools (4.7MiB)
|
| 4004 |
-
Downloading matplotlib (8.3MiB)
|
| 4005 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4006 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4007 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4008 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4009 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4010 |
-
Downloading torch (846.8MiB)
|
| 4011 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4012 |
-
Downloading pillow (6.3MiB)
|
| 4013 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4014 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4015 |
-
Downloading sympy (6.0MiB)
|
| 4016 |
-
Downloading setuptools (1.1MiB)
|
| 4017 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4018 |
-
Downloading networkx (1.9MiB)
|
| 4019 |
-
Downloading triton (148.4MiB)
|
| 4020 |
-
Downloading kiwisolver (1.4MiB)
|
| 4021 |
-
Downloading numpy (15.9MiB)
|
| 4022 |
-
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 4023 |
-
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 4024 |
-
Downloading nvidia-cufile-cu12
|
| 4025 |
-
Downloading kiwisolver
|
| 4026 |
-
Downloading setuptools
|
| 4027 |
-
Downloading fonttools
|
| 4028 |
-
Downloading networkx
|
| 4029 |
-
Downloading pillow
|
| 4030 |
-
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 4031 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 4032 |
-
Downloading matplotlib
|
| 4033 |
-
Downloading numpy
|
| 4034 |
-
Downloading sympy
|
| 4035 |
-
Downloading nvidia-nvjitlink-cu12
|
| 4036 |
-
Downloading nvidia-curand-cu12
|
| 4037 |
-
Downloading nvidia-cuda-nvrtc-cu12
|
| 4038 |
-
Downloading triton
|
| 4039 |
-
Downloading nvidia-cufft-cu12
|
| 4040 |
-
Downloading nvidia-cusolver-cu12
|
| 4041 |
-
Downloading nvidia-cusparse-cu12
|
| 4042 |
-
Downloading nvidia-cusparselt-cu12
|
| 4043 |
-
Downloading nvidia-nccl-cu12
|
| 4044 |
-
Downloading nvidia-cublas-cu12
|
| 4045 |
-
Downloading nvidia-cudnn-cu12
|
| 4046 |
-
Downloading torch
|
| 4047 |
-
Installed 37 packages in 224ms
|
| 4048 |
-
</div>
|
| 4049 |
-
</div>
|
| 4050 |
-
<div class="cell-artifacts">
|
| 4051 |
-
<h4>Artifacts:</h4>
|
| 4052 |
-
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
| 4053 |
</div>
|
| 4054 |
</div>
|
| 4055 |
</div>
|
|
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
+
Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
|
|
|
| 3843 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
+
Cell: nv | 0.23s
|
| 3847 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3849 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3860 |
</div>
|
| 3861 |
</div>
|
| 3862 |
<div id="output-nv" class="cell-output">
|
| 3863 |
+
<div class="cell-stdout">Wed Oct 22 08:58:24 2025
|
| 3864 |
+-----------------------------------------------------------------------------------------+
|
| 3865 |
+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3866 |
|-----------------------------------------+------------------------+----------------------+
|
| 3867 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3868 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3869 |
| | | MIG M. |
|
| 3870 |
|=========================================+========================+======================|
|
| 3871 |
+
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3872 |
+
| N/A 26C P8 22W / 350W | 0MiB / 46068MiB | 0% Default |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3873 |
| | | N/A |
|
| 3874 |
+-----------------------------------------+------------------------+----------------------+
|
| 3875 |
|
| 3876 |
+-----------------------------------------------------------------------------------------+
|
| 3877 |
| Processes: |
|
| 3878 |
+
| GPU GI CI PID Type Process name GPU Memory |
|
| 3879 |
| ID ID Usage |
|
| 3880 |
|=========================================================================================|
|
| 3881 |
| No running processes found |
|
|
|
|
| 3886 |
</div>
|
| 3887 |
|
| 3888 |
<h2>Flash Attention Benchmark</h2>
|
| 3889 |
+
<div class="cell cell-failed" id="cell-benchmark">
|
| 3890 |
<div class="cell-header">
|
| 3891 |
<span class="collapse-indicators">
|
| 3892 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3893 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3894 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3895 |
</span> |
|
| 3896 |
+
Cell: benchmark | 0.01s | FAILED
|
| 3897 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3898 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3899 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3900 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/flash_attention.md" target="_blank" class="github-btn">GitHub</a>
|
| 3901 |
</div>
|
| 3902 |
+
<div id="code-benchmark" class="cell-code" data-lines="67">
|
| 3903 |
<div class="code-wrap">
|
| 3904 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3905 |
<span class="c1"># requires-python = ">=3.10"</span>
|
|
|
|
| 3910 |
<span class="c1"># ]</span>
|
| 3911 |
<span class="c1">#</span>
|
| 3912 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3913 |
+
<span class="c1"># kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }</span>
|
| 3914 |
<span class="c1"># ///</span>
|
| 3915 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3916 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
|
|
| 3965 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3966 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3967 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
| 3968 |
+
<span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
|
| 3969 |
<span class="p">)</span>
|
| 3970 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn.jsonl"</span><span class="p">])</span>
|
| 3971 |
</pre></div>
|
|
|
|
| 3974 |
</div>
|
| 3975 |
</div>
|
| 3976 |
<div id="output-benchmark" class="cell-output">
|
| 3977 |
+
<div class="cell-stderr"> × Failed to resolve script requirement
|
| 3978 |
+
╰─▶ Distribution not found at:
|
| 3979 |
+
file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3980 |
</div>
|
| 3981 |
</div>
|
| 3982 |
</div>
|
flash_attn/impls/hf_kernels_flash_attn.html
CHANGED
|
@@ -3829,40 +3829,40 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
-
Linux x86_64 | Linux-5.
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
| 3836 |
<div class="main-content">
|
| 3837 |
<h1>HF Kernels - Flash Attention</h1>
|
| 3838 |
<h2>HuggingFace Kernels Flash Attention Benchmark</h2>
|
| 3839 |
-
<div class="cell" id="cell-benchmark">
|
| 3840 |
<div class="cell-header">
|
| 3841 |
<span class="collapse-indicators">
|
| 3842 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3843 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
-
<span id="uv-indicator-benchmark"
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: benchmark |
|
| 3847 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3849 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3850 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/hf_kernels_flash_attn.md" target="_blank" class="github-btn">GitHub</a>
|
| 3851 |
<a href="https://huggingface.co/kernels-community/flash-attn2" target="_blank" class="hf-btn">🤗 HF</a>
|
| 3852 |
</div>
|
| 3853 |
-
<div id="code-benchmark" class="cell-code" data-lines="
|
| 3854 |
<div class="code-wrap">
|
| 3855 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3856 |
<span class="c1"># requires-python = ">=3.10"</span>
|
| 3857 |
<span class="c1"># dependencies = [</span>
|
| 3858 |
<span class="c1"># "numpy",</span>
|
| 3859 |
-
<span class="c1"># "torch",</span>
|
| 3860 |
<span class="c1"># "kernels-benchmark-tools",</span>
|
| 3861 |
<span class="c1"># "kernels",</span>
|
| 3862 |
<span class="c1"># ]</span>
|
| 3863 |
<span class="c1">#</span>
|
| 3864 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3865 |
-
<span class="c1"># kernels-benchmark-tools = {
|
| 3866 |
<span class="c1"># ///</span>
|
| 3867 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3868 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
@@ -3870,7 +3870,7 @@ Cell: benchmark | 40.14s
|
|
| 3870 |
<span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
|
| 3871 |
<span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>
|
| 3872 |
|
| 3873 |
-
<span class="n">hf_kernels_flash_attn</span> <span class="o">=</span> <span class="n">get_kernel</span><span class="p">(</span><span class="s2">"kernels-community/flash-attn"</span><span class="p"
|
| 3874 |
|
| 3875 |
|
| 3876 |
<span class="k">def</span><span class="w"> </span><span class="nf">hf_flash_attention</span><span class="p">(</span><span class="n">query</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
|
|
@@ -3922,6 +3922,7 @@ Cell: benchmark | 40.14s
|
|
| 3922 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3923 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3924 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
|
|
|
| 3925 |
<span class="p">)</span>
|
| 3926 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn.jsonl"</span><span class="p">])</span>
|
| 3927 |
</pre></div>
|
|
@@ -3930,77 +3931,9 @@ Cell: benchmark | 40.14s
|
|
| 3930 |
</div>
|
| 3931 |
</div>
|
| 3932 |
<div id="output-benchmark" class="cell-output">
|
| 3933 |
-
<div class="cell-
|
| 3934 |
-
|
| 3935 |
-
|
| 3936 |
-
hf_kernels_flash_attn flux_L320 0.34 True
|
| 3937 |
-
hf_kernels_flash_attn flux_L384 0.35 True
|
| 3938 |
-
hf_kernels_flash_attn flux_L448 0.38 True
|
| 3939 |
-
hf_kernels_flash_attn flux_L512 0.42 True
|
| 3940 |
-
</div>
|
| 3941 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3942 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3943 |
-
<div class="uv-logs-content" style="display: none;">
|
| 3944 |
-
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3945 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3946 |
-
Downloading kiwisolver (1.4MiB)
|
| 3947 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3948 |
-
Downloading matplotlib (8.3MiB)
|
| 3949 |
-
Downloading fonttools (4.7MiB)
|
| 3950 |
-
Downloading setuptools (1.1MiB)
|
| 3951 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3952 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3953 |
-
Downloading sympy (6.0MiB)
|
| 3954 |
-
Downloading hf-xet (3.0MiB)
|
| 3955 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3956 |
-
Downloading pillow (6.3MiB)
|
| 3957 |
-
Downloading networkx (1.9MiB)
|
| 3958 |
-
Downloading numpy (15.9MiB)
|
| 3959 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3960 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3961 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3962 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3963 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3964 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3965 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3966 |
-
Downloading torch (846.8MiB)
|
| 3967 |
-
Downloading triton (148.4MiB)
|
| 3968 |
-
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3969 |
-
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3970 |
-
Downloading nvidia-cufile-cu12
|
| 3971 |
-
Downloading kiwisolver
|
| 3972 |
-
Downloading hf-xet
|
| 3973 |
-
Downloading setuptools
|
| 3974 |
-
Downloading networkx
|
| 3975 |
-
Downloading fonttools
|
| 3976 |
-
Downloading pillow
|
| 3977 |
-
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3978 |
-
Downloading matplotlib
|
| 3979 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 3980 |
-
Downloading numpy
|
| 3981 |
-
Downloading sympy
|
| 3982 |
-
Downloading nvidia-nvjitlink-cu12
|
| 3983 |
-
Downloading nvidia-curand-cu12
|
| 3984 |
-
Downloading nvidia-cuda-nvrtc-cu12
|
| 3985 |
-
Downloading triton
|
| 3986 |
-
Downloading nvidia-cufft-cu12
|
| 3987 |
-
Downloading nvidia-cusolver-cu12
|
| 3988 |
-
Downloading nvidia-cusparselt-cu12
|
| 3989 |
-
Downloading nvidia-cusparse-cu12
|
| 3990 |
-
Downloading nvidia-nccl-cu12
|
| 3991 |
-
Downloading nvidia-cublas-cu12
|
| 3992 |
-
Downloading nvidia-cudnn-cu12
|
| 3993 |
-
Downloading torch
|
| 3994 |
-
Installed 47 packages in 255ms
|
| 3995 |
-
</div>
|
| 3996 |
-
</div>
|
| 3997 |
-
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 3998 |
-
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:05, 3.64it/s]
|
| 3999 |
-
Fetching 20 files: 10%|█ | 2/20 [00:02<00:22, 1.24s/it]
|
| 4000 |
-
Fetching 20 files: 100%|██████████| 20/20 [00:02<00:00, 9.14it/s]</div>
|
| 4001 |
-
<div class="cell-artifacts">
|
| 4002 |
-
<h4>Artifacts:</h4>
|
| 4003 |
-
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
| 4004 |
</div>
|
| 4005 |
</div>
|
| 4006 |
</div>
|
|
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
+
Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
| 3836 |
<div class="main-content">
|
| 3837 |
<h1>HF Kernels - Flash Attention</h1>
|
| 3838 |
<h2>HuggingFace Kernels Flash Attention Benchmark</h2>
|
| 3839 |
+
<div class="cell cell-failed" id="cell-benchmark">
|
| 3840 |
<div class="cell-header">
|
| 3841 |
<span class="collapse-indicators">
|
| 3842 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3843 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
+
Cell: benchmark | 0.01s | FAILED
|
| 3847 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3849 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3850 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/hf_kernels_flash_attn.md" target="_blank" class="github-btn">GitHub</a>
|
| 3851 |
<a href="https://huggingface.co/kernels-community/flash-attn2" target="_blank" class="hf-btn">🤗 HF</a>
|
| 3852 |
</div>
|
| 3853 |
+
<div id="code-benchmark" class="cell-code" data-lines="73">
|
| 3854 |
<div class="code-wrap">
|
| 3855 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3856 |
<span class="c1"># requires-python = ">=3.10"</span>
|
| 3857 |
<span class="c1"># dependencies = [</span>
|
| 3858 |
<span class="c1"># "numpy",</span>
|
| 3859 |
+
<span class="c1"># "torch==2.8.0",</span>
|
| 3860 |
<span class="c1"># "kernels-benchmark-tools",</span>
|
| 3861 |
<span class="c1"># "kernels",</span>
|
| 3862 |
<span class="c1"># ]</span>
|
| 3863 |
<span class="c1">#</span>
|
| 3864 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3865 |
+
<span class="c1"># kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }</span>
|
| 3866 |
<span class="c1"># ///</span>
|
| 3867 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3868 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
|
|
| 3870 |
<span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
|
| 3871 |
<span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>
|
| 3872 |
|
| 3873 |
+
<span class="n">hf_kernels_flash_attn</span> <span class="o">=</span> <span class="n">get_kernel</span><span class="p">(</span><span class="s2">"kernels-community/flash-attn"</span><span class="p">)</span>
|
| 3874 |
|
| 3875 |
|
| 3876 |
<span class="k">def</span><span class="w"> </span><span class="nf">hf_flash_attention</span><span class="p">(</span><span class="n">query</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
|
|
|
|
| 3922 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3923 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3924 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
| 3925 |
+
<span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
|
| 3926 |
<span class="p">)</span>
|
| 3927 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn.jsonl"</span><span class="p">])</span>
|
| 3928 |
</pre></div>
|
|
|
|
| 3931 |
</div>
|
| 3932 |
</div>
|
| 3933 |
<div id="output-benchmark" class="cell-output">
|
| 3934 |
+
<div class="cell-stderr"> × Failed to resolve script requirement
|
| 3935 |
+
╰─▶ Distribution not found at:
|
| 3936 |
+
file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3937 |
</div>
|
| 3938 |
</div>
|
| 3939 |
</div>
|
flash_attn/impls/hf_kernels_flash_attn3.html
CHANGED
|
@@ -3829,28 +3829,28 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
-
Linux x86_64 | Linux-5.
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
| 3836 |
<div class="main-content">
|
| 3837 |
<h1>HF Kernels - Flash Attention 3</h1>
|
| 3838 |
<h2>HuggingFace Kernels Flash Attention 3 Benchmark</h2>
|
| 3839 |
-
<div class="cell" id="cell-benchmark">
|
| 3840 |
<div class="cell-header">
|
| 3841 |
<span class="collapse-indicators">
|
| 3842 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3843 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
-
<span id="uv-indicator-benchmark"
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: benchmark |
|
| 3847 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3849 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3850 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/hf_kernels_flash_attn3.md" target="_blank" class="github-btn">GitHub</a>
|
| 3851 |
<a href="https://huggingface.co/kernels-community/flash-attn3" target="_blank" class="hf-btn">🤗 HF</a>
|
| 3852 |
</div>
|
| 3853 |
-
<div id="code-benchmark" class="cell-code" data-lines="
|
| 3854 |
<div class="code-wrap">
|
| 3855 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3856 |
<span class="c1"># requires-python = ">=3.10"</span>
|
|
@@ -3862,7 +3862,7 @@ Cell: benchmark | 40.68s
|
|
| 3862 |
<span class="c1"># ]</span>
|
| 3863 |
<span class="c1">#</span>
|
| 3864 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3865 |
-
<span class="c1"># kernels-benchmark-tools = {
|
| 3866 |
<span class="c1"># ///</span>
|
| 3867 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3868 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
@@ -3921,6 +3921,7 @@ Cell: benchmark | 40.68s
|
|
| 3921 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3922 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3923 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
|
|
|
| 3924 |
<span class="p">)</span>
|
| 3925 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn.jsonl"</span><span class="p">])</span>
|
| 3926 |
</pre></div>
|
|
@@ -3929,77 +3930,9 @@ Cell: benchmark | 40.68s
|
|
| 3929 |
</div>
|
| 3930 |
</div>
|
| 3931 |
<div id="output-benchmark" class="cell-output">
|
| 3932 |
-
<div class="cell-
|
| 3933 |
-
|
| 3934 |
-
|
| 3935 |
-
hf_kernels_flash_attn3 flux_L320 0.36 True
|
| 3936 |
-
hf_kernels_flash_attn3 flux_L384 0.37 True
|
| 3937 |
-
hf_kernels_flash_attn3 flux_L448 0.40 True
|
| 3938 |
-
hf_kernels_flash_attn3 flux_L512 0.43 True
|
| 3939 |
-
</div>
|
| 3940 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3941 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3942 |
-
<div class="uv-logs-content" style="display: none;">
|
| 3943 |
-
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3944 |
-
Downloading pillow (6.3MiB)
|
| 3945 |
-
Downloading hf-xet (3.0MiB)
|
| 3946 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3947 |
-
Downloading kiwisolver (1.4MiB)
|
| 3948 |
-
Downloading fonttools (4.7MiB)
|
| 3949 |
-
Downloading matplotlib (8.3MiB)
|
| 3950 |
-
Downloading networkx (1.9MiB)
|
| 3951 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3952 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3953 |
-
Downloading numpy (15.9MiB)
|
| 3954 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3955 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3956 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3957 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3958 |
-
Downloading torch (846.8MiB)
|
| 3959 |
-
Downloading triton (148.4MiB)
|
| 3960 |
-
Downloading setuptools (1.1MiB)
|
| 3961 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3962 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3963 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3964 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3965 |
-
Downloading sympy (6.0MiB)
|
| 3966 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3967 |
-
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3968 |
-
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3969 |
-
Downloading nvidia-cufile-cu12
|
| 3970 |
-
Downloading kiwisolver
|
| 3971 |
-
Downloading hf-xet
|
| 3972 |
-
Downloading setuptools
|
| 3973 |
-
Downloading networkx
|
| 3974 |
-
Downloading fonttools
|
| 3975 |
-
Downloading pillow
|
| 3976 |
-
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3977 |
-
Downloading matplotlib
|
| 3978 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 3979 |
-
Downloading numpy
|
| 3980 |
-
Downloading sympy
|
| 3981 |
-
Downloading nvidia-nvjitlink-cu12
|
| 3982 |
-
Downloading nvidia-curand-cu12
|
| 3983 |
-
Downloading nvidia-cuda-nvrtc-cu12
|
| 3984 |
-
Downloading triton
|
| 3985 |
-
Downloading nvidia-cufft-cu12
|
| 3986 |
-
Downloading nvidia-cusolver-cu12
|
| 3987 |
-
Downloading nvidia-cusparse-cu12
|
| 3988 |
-
Downloading nvidia-cusparselt-cu12
|
| 3989 |
-
Downloading nvidia-nccl-cu12
|
| 3990 |
-
Downloading nvidia-cublas-cu12
|
| 3991 |
-
Downloading nvidia-cudnn-cu12
|
| 3992 |
-
Downloading torch
|
| 3993 |
-
Installed 47 packages in 229ms
|
| 3994 |
-
</div>
|
| 3995 |
-
</div>
|
| 3996 |
-
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 3997 |
-
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 3.56it/s]
|
| 3998 |
-
Fetching 4 files: 50%|█████ | 2/4 [00:02<00:02, 1.32s/it]
|
| 3999 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:02<00:00, 1.72it/s]</div>
|
| 4000 |
-
<div class="cell-artifacts">
|
| 4001 |
-
<h4>Artifacts:</h4>
|
| 4002 |
-
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
| 4003 |
</div>
|
| 4004 |
</div>
|
| 4005 |
</div>
|
|
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
+
Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
| 3836 |
<div class="main-content">
|
| 3837 |
<h1>HF Kernels - Flash Attention 3</h1>
|
| 3838 |
<h2>HuggingFace Kernels Flash Attention 3 Benchmark</h2>
|
| 3839 |
+
<div class="cell cell-failed" id="cell-benchmark">
|
| 3840 |
<div class="cell-header">
|
| 3841 |
<span class="collapse-indicators">
|
| 3842 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3843 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
+
Cell: benchmark | 0.05s | FAILED
|
| 3847 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3849 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3850 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/hf_kernels_flash_attn3.md" target="_blank" class="github-btn">GitHub</a>
|
| 3851 |
<a href="https://huggingface.co/kernels-community/flash-attn3" target="_blank" class="hf-btn">🤗 HF</a>
|
| 3852 |
</div>
|
| 3853 |
+
<div id="code-benchmark" class="cell-code" data-lines="72">
|
| 3854 |
<div class="code-wrap">
|
| 3855 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3856 |
<span class="c1"># requires-python = ">=3.10"</span>
|
|
|
|
| 3862 |
<span class="c1"># ]</span>
|
| 3863 |
<span class="c1">#</span>
|
| 3864 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3865 |
+
<span class="c1"># kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }</span>
|
| 3866 |
<span class="c1"># ///</span>
|
| 3867 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3868 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
|
|
| 3921 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3922 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3923 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
| 3924 |
+
<span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
|
| 3925 |
<span class="p">)</span>
|
| 3926 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn.jsonl"</span><span class="p">])</span>
|
| 3927 |
</pre></div>
|
|
|
|
| 3930 |
</div>
|
| 3931 |
</div>
|
| 3932 |
<div id="output-benchmark" class="cell-output">
|
| 3933 |
+
<div class="cell-stderr"> × Failed to resolve script requirement
|
| 3934 |
+
╰─▶ Distribution not found at:
|
| 3935 |
+
file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3936 |
</div>
|
| 3937 |
</div>
|
| 3938 |
</div>
|
flash_attn/impls/mem_efficient_attention.html
CHANGED
|
@@ -3829,27 +3829,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
-
Linux x86_64 | Linux-5.
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
| 3836 |
<div class="main-content">
|
| 3837 |
<h1>Memory Efficient Attention Implementation</h1>
|
| 3838 |
<h2>Memory Efficient SDPA Benchmark</h2>
|
| 3839 |
-
<div class="cell" id="cell-benchmark">
|
| 3840 |
<div class="cell-header">
|
| 3841 |
<span class="collapse-indicators">
|
| 3842 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3843 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
-
<span id="uv-indicator-benchmark"
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: benchmark |
|
| 3847 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3849 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3850 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/mem_efficient_attention.md" target="_blank" class="github-btn">GitHub</a>
|
| 3851 |
</div>
|
| 3852 |
-
<div id="code-benchmark" class="cell-code" data-lines="
|
| 3853 |
<div class="code-wrap">
|
| 3854 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3855 |
<span class="c1"># requires-python = ">=3.10"</span>
|
|
@@ -3860,7 +3860,7 @@ Cell: benchmark | 39.23s
|
|
| 3860 |
<span class="c1"># ]</span>
|
| 3861 |
<span class="c1">#</span>
|
| 3862 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3863 |
-
<span class="c1"># kernels-benchmark-tools = {
|
| 3864 |
<span class="c1"># ///</span>
|
| 3865 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3866 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
@@ -3917,6 +3917,7 @@ Cell: benchmark | 39.23s
|
|
| 3917 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3918 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3919 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
|
|
|
| 3920 |
<span class="p">)</span>
|
| 3921 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn.jsonl"</span><span class="p">])</span>
|
| 3922 |
</pre></div>
|
|
@@ -3925,71 +3926,9 @@ Cell: benchmark | 39.23s
|
|
| 3925 |
</div>
|
| 3926 |
</div>
|
| 3927 |
<div id="output-benchmark" class="cell-output">
|
| 3928 |
-
<div class="cell-
|
| 3929 |
-
|
| 3930 |
-
|
| 3931 |
-
torch_mem_eff flux_L320 0.70 True
|
| 3932 |
-
torch_mem_eff flux_L384 0.83 True
|
| 3933 |
-
torch_mem_eff flux_L448 0.95 True
|
| 3934 |
-
torch_mem_eff flux_L512 1.00 True
|
| 3935 |
-
</div>
|
| 3936 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3937 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3938 |
-
<div class="uv-logs-content" style="display: none;">
|
| 3939 |
-
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3940 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3941 |
-
Downloading networkx (1.9MiB)
|
| 3942 |
-
Downloading sympy (6.0MiB)
|
| 3943 |
-
Downloading fonttools (4.7MiB)
|
| 3944 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3945 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3946 |
-
Downloading triton (148.4MiB)
|
| 3947 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3948 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3949 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3950 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3951 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3952 |
-
Downloading matplotlib (8.3MiB)
|
| 3953 |
-
Downloading pillow (6.3MiB)
|
| 3954 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3955 |
-
Downloading kiwisolver (1.4MiB)
|
| 3956 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3957 |
-
Downloading torch (846.8MiB)
|
| 3958 |
-
Downloading setuptools (1.1MiB)
|
| 3959 |
-
Downloading numpy (15.9MiB)
|
| 3960 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3961 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3962 |
-
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3963 |
-
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3964 |
-
Downloading nvidia-cufile-cu12
|
| 3965 |
-
Downloading kiwisolver
|
| 3966 |
-
Downloading setuptools
|
| 3967 |
-
Downloading fonttools
|
| 3968 |
-
Downloading networkx
|
| 3969 |
-
Downloading pillow
|
| 3970 |
-
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3971 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 3972 |
-
Downloading matplotlib
|
| 3973 |
-
Downloading numpy
|
| 3974 |
-
Downloading sympy
|
| 3975 |
-
Downloading nvidia-nvjitlink-cu12
|
| 3976 |
-
Downloading nvidia-curand-cu12
|
| 3977 |
-
Downloading nvidia-cuda-nvrtc-cu12
|
| 3978 |
-
Downloading triton
|
| 3979 |
-
Downloading nvidia-cufft-cu12
|
| 3980 |
-
Downloading nvidia-cusolver-cu12
|
| 3981 |
-
Downloading nvidia-cusparse-cu12
|
| 3982 |
-
Downloading nvidia-cusparselt-cu12
|
| 3983 |
-
Downloading nvidia-nccl-cu12
|
| 3984 |
-
Downloading nvidia-cublas-cu12
|
| 3985 |
-
Downloading nvidia-cudnn-cu12
|
| 3986 |
-
Downloading torch
|
| 3987 |
-
Installed 37 packages in 248ms
|
| 3988 |
-
</div>
|
| 3989 |
-
</div>
|
| 3990 |
-
<div class="cell-artifacts">
|
| 3991 |
-
<h4>Artifacts:</h4>
|
| 3992 |
-
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
| 3993 |
</div>
|
| 3994 |
</div>
|
| 3995 |
</div>
|
|
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
+
Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
| 3836 |
<div class="main-content">
|
| 3837 |
<h1>Memory Efficient Attention Implementation</h1>
|
| 3838 |
<h2>Memory Efficient SDPA Benchmark</h2>
|
| 3839 |
+
<div class="cell cell-failed" id="cell-benchmark">
|
| 3840 |
<div class="cell-header">
|
| 3841 |
<span class="collapse-indicators">
|
| 3842 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3843 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
+
Cell: benchmark | 0.01s | FAILED
|
| 3847 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3849 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3850 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/mem_efficient_attention.md" target="_blank" class="github-btn">GitHub</a>
|
| 3851 |
</div>
|
| 3852 |
+
<div id="code-benchmark" class="cell-code" data-lines="69">
|
| 3853 |
<div class="code-wrap">
|
| 3854 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3855 |
<span class="c1"># requires-python = ">=3.10"</span>
|
|
|
|
| 3860 |
<span class="c1"># ]</span>
|
| 3861 |
<span class="c1">#</span>
|
| 3862 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3863 |
+
<span class="c1"># kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }</span>
|
| 3864 |
<span class="c1"># ///</span>
|
| 3865 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3866 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
|
|
| 3917 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3918 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3919 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
| 3920 |
+
<span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
|
| 3921 |
<span class="p">)</span>
|
| 3922 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn.jsonl"</span><span class="p">])</span>
|
| 3923 |
</pre></div>
|
|
|
|
| 3926 |
</div>
|
| 3927 |
</div>
|
| 3928 |
<div id="output-benchmark" class="cell-output">
|
| 3929 |
+
<div class="cell-stderr"> × Failed to resolve script requirement
|
| 3930 |
+
╰─▶ Distribution not found at:
|
| 3931 |
+
file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3932 |
</div>
|
| 3933 |
</div>
|
| 3934 |
</div>
|
flash_attn/impls/sage_attention.html
CHANGED
|
@@ -3829,27 +3829,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
-
Linux x86_64 | Linux-5.
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
| 3836 |
<div class="main-content">
|
| 3837 |
<h1>SageAttention Implementation</h1>
|
| 3838 |
<h2>SageAttention Benchmark (INT8 Quantized)</h2>
|
| 3839 |
-
<div class="cell" id="cell-benchmark">
|
| 3840 |
<div class="cell-header">
|
| 3841 |
<span class="collapse-indicators">
|
| 3842 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3843 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
-
<span id="uv-indicator-benchmark"
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: benchmark |
|
| 3847 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3849 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3850 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/sage_attention.md" target="_blank" class="github-btn">GitHub</a>
|
| 3851 |
</div>
|
| 3852 |
-
<div id="code-benchmark" class="cell-code" data-lines="
|
| 3853 |
<div class="code-wrap">
|
| 3854 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3855 |
<span class="c1"># requires-python = ">=3.10"</span>
|
|
@@ -3862,7 +3862,7 @@ Cell: benchmark | 41.27s
|
|
| 3862 |
<span class="c1"># ]</span>
|
| 3863 |
<span class="c1">#</span>
|
| 3864 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3865 |
-
<span class="c1"># kernels-benchmark-tools = {
|
| 3866 |
<span class="c1"># ///</span>
|
| 3867 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3868 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
@@ -3928,6 +3928,7 @@ Cell: benchmark | 41.27s
|
|
| 3928 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3929 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3930 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
|
|
|
| 3931 |
<span class="p">)</span>
|
| 3932 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn.jsonl"</span><span class="p">])</span>
|
| 3933 |
</pre></div>
|
|
@@ -3936,84 +3937,9 @@ Cell: benchmark | 41.27s
|
|
| 3936 |
</div>
|
| 3937 |
</div>
|
| 3938 |
<div id="output-benchmark" class="cell-output">
|
| 3939 |
-
<div class="cell-
|
| 3940 |
-
|
| 3941 |
-
|
| 3942 |
-
sage_int8_fp16 flux_L256 FAIL False
|
| 3943 |
-
Error: module 'sage_attention_a8eb63760f50ebd' has no attribute 'fwd'
|
| 3944 |
-
sage_int8_fp16 flux_L320 FAIL False
|
| 3945 |
-
Error: module 'sage_attention_a8eb63760f50ebd' has no attribute 'fwd'
|
| 3946 |
-
sage_int8_fp16 flux_L384 FAIL False
|
| 3947 |
-
Error: module 'sage_attention_a8eb63760f50ebd' has no attribute 'fwd'
|
| 3948 |
-
sage_int8_fp16 flux_L448 FAIL False
|
| 3949 |
-
Error: module 'sage_attention_a8eb63760f50ebd' has no attribute 'fwd'
|
| 3950 |
-
sage_int8_fp16 flux_L512 FAIL False
|
| 3951 |
-
Error: module 'sage_attention_a8eb63760f50ebd' has no attribute 'fwd'
|
| 3952 |
-
</div>
|
| 3953 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3954 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3955 |
-
<div class="uv-logs-content" style="display: none;">
|
| 3956 |
-
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3957 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3958 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3959 |
-
Downloading hf-xet (3.0MiB)
|
| 3960 |
-
Downloading sympy (6.0MiB)
|
| 3961 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3962 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3963 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3964 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3965 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3966 |
-
Downloading kiwisolver (1.4MiB)
|
| 3967 |
-
Downloading pillow (6.3MiB)
|
| 3968 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3969 |
-
Downloading numpy (15.9MiB)
|
| 3970 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3971 |
-
Downloading matplotlib (8.3MiB)
|
| 3972 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3973 |
-
Downloading fonttools (4.7MiB)
|
| 3974 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3975 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3976 |
-
Downloading setuptools (1.1MiB)
|
| 3977 |
-
Downloading networkx (1.9MiB)
|
| 3978 |
-
Downloading triton (148.4MiB)
|
| 3979 |
-
Downloading torch (846.8MiB)
|
| 3980 |
-
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3981 |
-
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3982 |
-
Downloading nvidia-cufile-cu12
|
| 3983 |
-
Downloading kiwisolver
|
| 3984 |
-
Downloading hf-xet
|
| 3985 |
-
Downloading setuptools
|
| 3986 |
-
Downloading networkx
|
| 3987 |
-
Downloading fonttools
|
| 3988 |
-
Downloading pillow
|
| 3989 |
-
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3990 |
-
Downloading matplotlib
|
| 3991 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 3992 |
-
Downloading numpy
|
| 3993 |
-
Downloading sympy
|
| 3994 |
-
Downloading nvidia-nvjitlink-cu12
|
| 3995 |
-
Downloading nvidia-curand-cu12
|
| 3996 |
-
Downloading nvidia-cuda-nvrtc-cu12
|
| 3997 |
-
Downloading triton
|
| 3998 |
-
Downloading nvidia-cufft-cu12
|
| 3999 |
-
Downloading nvidia-cusolver-cu12
|
| 4000 |
-
Downloading nvidia-cusparselt-cu12
|
| 4001 |
-
Downloading nvidia-cusparse-cu12
|
| 4002 |
-
Downloading nvidia-nccl-cu12
|
| 4003 |
-
Downloading nvidia-cublas-cu12
|
| 4004 |
-
Downloading nvidia-cudnn-cu12
|
| 4005 |
-
Downloading torch
|
| 4006 |
-
Installed 48 packages in 239ms
|
| 4007 |
-
</div>
|
| 4008 |
-
</div>
|
| 4009 |
-
<div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
|
| 4010 |
-
Fetching 11 files: 9%|▉ | 1/11 [00:00<00:05, 1.85it/s]
|
| 4011 |
-
Fetching 11 files: 45%|████▌ | 5/11 [00:00<00:00, 6.46it/s]
|
| 4012 |
-
Fetching 11 files: 73%|███████▎ | 8/11 [00:01<00:00, 10.07it/s]
|
| 4013 |
-
Fetching 11 files: 100%|██████████| 11/11 [00:01<00:00, 10.94it/s]</div>
|
| 4014 |
-
<div class="cell-artifacts">
|
| 4015 |
-
<h4>Artifacts:</h4>
|
| 4016 |
-
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
| 4017 |
</div>
|
| 4018 |
</div>
|
| 4019 |
</div>
|
|
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
+
Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
| 3836 |
<div class="main-content">
|
| 3837 |
<h1>SageAttention Implementation</h1>
|
| 3838 |
<h2>SageAttention Benchmark (INT8 Quantized)</h2>
|
| 3839 |
+
<div class="cell cell-failed" id="cell-benchmark">
|
| 3840 |
<div class="cell-header">
|
| 3841 |
<span class="collapse-indicators">
|
| 3842 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3843 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
+
Cell: benchmark | 0.05s | FAILED
|
| 3847 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3849 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3850 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/sage_attention.md" target="_blank" class="github-btn">GitHub</a>
|
| 3851 |
</div>
|
| 3852 |
+
<div id="code-benchmark" class="cell-code" data-lines="80">
|
| 3853 |
<div class="code-wrap">
|
| 3854 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3855 |
<span class="c1"># requires-python = ">=3.10"</span>
|
|
|
|
| 3862 |
<span class="c1"># ]</span>
|
| 3863 |
<span class="c1">#</span>
|
| 3864 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3865 |
+
<span class="c1"># kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }</span>
|
| 3866 |
<span class="c1"># ///</span>
|
| 3867 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3868 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
|
|
| 3928 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3929 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3930 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
| 3931 |
+
<span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
|
| 3932 |
<span class="p">)</span>
|
| 3933 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn.jsonl"</span><span class="p">])</span>
|
| 3934 |
</pre></div>
|
|
|
|
| 3937 |
</div>
|
| 3938 |
</div>
|
| 3939 |
<div id="output-benchmark" class="cell-output">
|
| 3940 |
+
<div class="cell-stderr"> × Failed to resolve script requirement
|
| 3941 |
+
╰─▶ Distribution not found at:
|
| 3942 |
+
file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3943 |
</div>
|
| 3944 |
</div>
|
| 3945 |
</div>
|
flash_attn/impls/xformers.html
CHANGED
|
@@ -3829,27 +3829,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
-
Linux x86_64 | Linux-5.
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
| 3836 |
<div class="main-content">
|
| 3837 |
<h1>xFormers Memory Efficient Attention</h1>
|
| 3838 |
<h2>xFormers Benchmark</h2>
|
| 3839 |
-
<div class="cell" id="cell-benchmark">
|
| 3840 |
<div class="cell-header">
|
| 3841 |
<span class="collapse-indicators">
|
| 3842 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3843 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
-
<span id="uv-indicator-benchmark"
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: benchmark |
|
| 3847 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3849 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3850 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/xformers.md" target="_blank" class="github-btn">GitHub</a>
|
| 3851 |
</div>
|
| 3852 |
-
<div id="code-benchmark" class="cell-code" data-lines="
|
| 3853 |
<div class="code-wrap">
|
| 3854 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3855 |
<span class="c1"># requires-python = ">=3.10"</span>
|
|
@@ -3861,7 +3861,7 @@ Cell: benchmark | 41.87s
|
|
| 3861 |
<span class="c1"># ]</span>
|
| 3862 |
<span class="c1">#</span>
|
| 3863 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3864 |
-
<span class="c1"># kernels-benchmark-tools = {
|
| 3865 |
<span class="c1"># ///</span>
|
| 3866 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3867 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
@@ -3917,6 +3917,7 @@ Cell: benchmark | 41.87s
|
|
| 3917 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3918 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3919 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
|
|
|
| 3920 |
<span class="p">)</span>
|
| 3921 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn.jsonl"</span><span class="p">])</span>
|
| 3922 |
</pre></div>
|
|
@@ -3925,73 +3926,9 @@ Cell: benchmark | 41.87s
|
|
| 3925 |
</div>
|
| 3926 |
</div>
|
| 3927 |
<div id="output-benchmark" class="cell-output">
|
| 3928 |
-
<div class="cell-
|
| 3929 |
-
|
| 3930 |
-
|
| 3931 |
-
xformers_meff flux_L320 0.43 True
|
| 3932 |
-
xformers_meff flux_L384 0.44 True
|
| 3933 |
-
xformers_meff flux_L448 0.48 True
|
| 3934 |
-
xformers_meff flux_L512 0.50 True
|
| 3935 |
-
</div>
|
| 3936 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3937 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3938 |
-
<div class="uv-logs-content" style="display: none;">
|
| 3939 |
-
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3940 |
-
Downloading kiwisolver (1.4MiB)
|
| 3941 |
-
Downloading setuptools (1.1MiB)
|
| 3942 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3943 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3944 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3945 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3946 |
-
Downloading pillow (6.3MiB)
|
| 3947 |
-
Downloading numpy (15.9MiB)
|
| 3948 |
-
Downloading matplotlib (8.3MiB)
|
| 3949 |
-
Downloading fonttools (4.7MiB)
|
| 3950 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3951 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3952 |
-
Downloading xformers (111.8MiB)
|
| 3953 |
-
Downloading networkx (1.9MiB)
|
| 3954 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3955 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3956 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3957 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3958 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3959 |
-
Downloading sympy (6.0MiB)
|
| 3960 |
-
Downloading triton (148.4MiB)
|
| 3961 |
-
Downloading torch (846.8MiB)
|
| 3962 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3963 |
-
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3964 |
-
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3965 |
-
Downloading nvidia-cufile-cu12
|
| 3966 |
-
Downloading kiwisolver
|
| 3967 |
-
Downloading setuptools
|
| 3968 |
-
Downloading networkx
|
| 3969 |
-
Downloading fonttools
|
| 3970 |
-
Downloading pillow
|
| 3971 |
-
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3972 |
-
Downloading matplotlib
|
| 3973 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 3974 |
-
Downloading numpy
|
| 3975 |
-
Downloading sympy
|
| 3976 |
-
Downloading nvidia-nvjitlink-cu12
|
| 3977 |
-
Downloading nvidia-curand-cu12
|
| 3978 |
-
Downloading nvidia-cuda-nvrtc-cu12
|
| 3979 |
-
Downloading xformers
|
| 3980 |
-
Downloading triton
|
| 3981 |
-
Downloading nvidia-cufft-cu12
|
| 3982 |
-
Downloading nvidia-cusolver-cu12
|
| 3983 |
-
Downloading nvidia-cusparse-cu12
|
| 3984 |
-
Downloading nvidia-cusparselt-cu12
|
| 3985 |
-
Downloading nvidia-nccl-cu12
|
| 3986 |
-
Downloading nvidia-cublas-cu12
|
| 3987 |
-
Downloading nvidia-cudnn-cu12
|
| 3988 |
-
Downloading torch
|
| 3989 |
-
Installed 38 packages in 250ms
|
| 3990 |
-
</div>
|
| 3991 |
-
</div>
|
| 3992 |
-
<div class="cell-artifacts">
|
| 3993 |
-
<h4>Artifacts:</h4>
|
| 3994 |
-
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
| 3995 |
</div>
|
| 3996 |
</div>
|
| 3997 |
</div>
|
|
|
|
| 3829 |
<div class="system-info">
|
| 3830 |
<div class="system-info-header">Generated on:</div>
|
| 3831 |
<div class="system-info-content">
|
| 3832 |
+
Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3833 |
</div>
|
| 3834 |
</div>
|
| 3835 |
|
| 3836 |
<div class="main-content">
|
| 3837 |
<h1>xFormers Memory Efficient Attention</h1>
|
| 3838 |
<h2>xFormers Benchmark</h2>
|
| 3839 |
+
<div class="cell cell-failed" id="cell-benchmark">
|
| 3840 |
<div class="cell-header">
|
| 3841 |
<span class="collapse-indicators">
|
| 3842 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3843 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
+
Cell: benchmark | 0.01s | FAILED
|
| 3847 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3849 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3850 |
<a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/xformers.md" target="_blank" class="github-btn">GitHub</a>
|
| 3851 |
</div>
|
| 3852 |
+
<div id="code-benchmark" class="cell-code" data-lines="69">
|
| 3853 |
<div class="code-wrap">
|
| 3854 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 3855 |
<span class="c1"># requires-python = ">=3.10"</span>
|
|
|
|
| 3861 |
<span class="c1"># ]</span>
|
| 3862 |
<span class="c1">#</span>
|
| 3863 |
<span class="c1"># [tool.uv.sources]</span>
|
| 3864 |
+
<span class="c1"># kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }</span>
|
| 3865 |
<span class="c1"># ///</span>
|
| 3866 |
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 3867 |
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
|
|
|
| 3917 |
<span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
|
| 3918 |
<span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
|
| 3919 |
<span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
|
| 3920 |
+
<span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
|
| 3921 |
<span class="p">)</span>
|
| 3922 |
<span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">"attn.jsonl"</span><span class="p">])</span>
|
| 3923 |
</pre></div>
|
|
|
|
| 3926 |
</div>
|
| 3927 |
</div>
|
| 3928 |
<div id="output-benchmark" class="cell-output">
|
| 3929 |
+
<div class="cell-stderr"> × Failed to resolve script requirement
|
| 3930 |
+
╰─▶ Distribution not found at:
|
| 3931 |
+
file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3932 |
</div>
|
| 3933 |
</div>
|
| 3934 |
</div>
|
index.html
CHANGED
|
@@ -79,7 +79,9 @@
|
|
| 79 |
<body>
|
| 80 |
<h1>Index of /</h1>
|
| 81 |
<ul>
|
|
|
|
| 82 |
<li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
|
|
|
|
| 83 |
</ul>
|
| 84 |
</body>
|
| 85 |
</html>
|
|
|
|
| 79 |
<body>
|
| 80 |
<h1>Index of /</h1>
|
| 81 |
<ul>
|
| 82 |
+
<li><a href='activation/index.html' class='dir'>activation/</a></li>
|
| 83 |
<li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
|
| 84 |
+
<li><a href='layer_norm/index.html' class='dir'>layer_norm/</a></li>
|
| 85 |
</ul>
|
| 86 |
</body>
|
| 87 |
</html>
|
layer_norm/impls/cells/benchmark.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "numpy",
|
| 5 |
+
# "torch",
|
| 6 |
+
# "kernels",
|
| 7 |
+
# "kernels-benchmark-tools",
|
| 8 |
+
# ]
|
| 9 |
+
#
|
| 10 |
+
# [tool.uv.sources]
|
| 11 |
+
# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
|
| 12 |
+
# ///
|
| 13 |
+
import torch
|
| 14 |
+
from kernels import get_kernel
|
| 15 |
+
import kernels_benchmark_tools as kbt
|
| 16 |
+
|
| 17 |
+
layer_norm_kernel = get_kernel("kernels-community/layer-norm")
|
| 18 |
+
|
| 19 |
+
def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
|
| 20 |
+
B, S, D = x.shape
|
| 21 |
+
# The kernel expects [N, D] input; support beta (bias) if provided.
|
| 22 |
+
out = layer_norm_kernel.dropout_add_ln_fwd(
|
| 23 |
+
input=x.view(-1, D),
|
| 24 |
+
gamma=weight,
|
| 25 |
+
beta=bias,
|
| 26 |
+
rowscale=None,
|
| 27 |
+
colscale=None,
|
| 28 |
+
x0_subset=None,
|
| 29 |
+
z_subset=None,
|
| 30 |
+
dropout_p=0.0,
|
| 31 |
+
epsilon=eps,
|
| 32 |
+
rowscale_const=1.0,
|
| 33 |
+
z_numrows=S,
|
| 34 |
+
gen=None,
|
| 35 |
+
residual_in_fp32=False,
|
| 36 |
+
is_rms_norm=False,
|
| 37 |
+
)[0].view(B, S, D)
|
| 38 |
+
return out
|
| 39 |
+
|
| 40 |
+
kbt.add(
|
| 41 |
+
"hf_kernels_layer_norm",
|
| 42 |
+
hf_kernels_layer_norm,
|
| 43 |
+
tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
if __name__ == "__main__":
|
| 47 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 48 |
+
dtype = "float32" if device == "cpu" else "bfloat16"
|
| 49 |
+
|
| 50 |
+
wl = list(kbt.layer_norm.llama_workloads(dtype)) if device == "cuda" else list(kbt.layer_norm.cpu_workloads(dtype))
|
| 51 |
+
|
| 52 |
+
kbt.run(
|
| 53 |
+
wl,
|
| 54 |
+
jsonl="ln.jsonl",
|
| 55 |
+
reps=5,
|
| 56 |
+
warmup=2,
|
| 57 |
+
gen=kbt.layer_norm.gen_inputs,
|
| 58 |
+
ref=kbt.layer_norm.ref_layer_norm,
|
| 59 |
+
cmp=kbt.layer_norm.cmp_allclose,
|
| 60 |
+
profile_trace=False,
|
| 61 |
+
)
|
| 62 |
+
kbt.summarize(["ln.jsonl"])
|
layer_norm/impls/cells/nv.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
|
layer_norm/impls/hf_kernels_layer_norm.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
layer_norm/impls/index.html
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /layer_norm/impls</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /layer_norm/impls</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='hf_kernels_layer_norm.html' class='file'>hf_kernels_layer_norm.html</a></li>
|
| 86 |
+
<li><a href='torch_layer_norm.html' class='file'>torch_layer_norm.html</a></li>
|
| 87 |
+
</ul>
|
| 88 |
+
</body>
|
| 89 |
+
</html>
|
layer_norm/impls/torch_layer_norm.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
layer_norm/index.html
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /layer_norm</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /layer_norm</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='impls/index.html' class='dir'>impls/</a></li>
|
| 86 |
+
<li><a href='results/index.html' class='dir'>results/</a></li>
|
| 87 |
+
</ul>
|
| 88 |
+
</body>
|
| 89 |
+
</html>
|
layer_norm/results/index.html
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset='UTF-8'>
|
| 5 |
+
<meta name='viewport' content='width=device-width, initial-scale=1.0'>
|
| 6 |
+
<title>Index of /layer_norm/results</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root {
|
| 9 |
+
--bg-primary: #0a0a0a;
|
| 10 |
+
--bg-secondary: #121212;
|
| 11 |
+
--bg-tertiary: #181818;
|
| 12 |
+
--text-primary: #e0e0e0;
|
| 13 |
+
--text-secondary: #888888;
|
| 14 |
+
--text-link: #64b5f6;
|
| 15 |
+
--border-primary: #2a2a2a;
|
| 16 |
+
}
|
| 17 |
+
body {
|
| 18 |
+
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
|
| 19 |
+
background: var(--bg-primary);
|
| 20 |
+
color: var(--text-primary);
|
| 21 |
+
margin: 0;
|
| 22 |
+
padding: 16px;
|
| 23 |
+
max-width: 900px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
.controls {
|
| 27 |
+
display: flex;
|
| 28 |
+
justify-content: flex-end;
|
| 29 |
+
margin-bottom: 1rem;
|
| 30 |
+
}
|
| 31 |
+
.back-button {
|
| 32 |
+
background: var(--bg-secondary);
|
| 33 |
+
border: 1px solid var(--border-primary);
|
| 34 |
+
padding: 8px 12px;
|
| 35 |
+
border-radius: 4px;
|
| 36 |
+
color: var(--text-secondary);
|
| 37 |
+
cursor: pointer;
|
| 38 |
+
font-size: 0.9rem;
|
| 39 |
+
text-decoration: none;
|
| 40 |
+
display: inline-block;
|
| 41 |
+
}
|
| 42 |
+
.back-button:hover {
|
| 43 |
+
color: var(--text-primary);
|
| 44 |
+
background: var(--bg-tertiary);
|
| 45 |
+
}
|
| 46 |
+
h1 {
|
| 47 |
+
font-size: 1.5em;
|
| 48 |
+
margin: 1rem 0;
|
| 49 |
+
color: var(--text-primary);
|
| 50 |
+
border-bottom: 1px solid var(--border-primary);
|
| 51 |
+
padding-bottom: 0.5rem;
|
| 52 |
+
}
|
| 53 |
+
ul {
|
| 54 |
+
list-style-type: none;
|
| 55 |
+
padding: 0;
|
| 56 |
+
}
|
| 57 |
+
li {
|
| 58 |
+
margin: 0;
|
| 59 |
+
border-bottom: 1px solid var(--border-primary);
|
| 60 |
+
}
|
| 61 |
+
li:last-child {
|
| 62 |
+
border-bottom: none;
|
| 63 |
+
}
|
| 64 |
+
a {
|
| 65 |
+
display: block;
|
| 66 |
+
padding: 0.75rem 0.5rem;
|
| 67 |
+
text-decoration: none;
|
| 68 |
+
color: var(--text-link);
|
| 69 |
+
transition: background 0.2s ease;
|
| 70 |
+
}
|
| 71 |
+
a:hover {
|
| 72 |
+
background: var(--bg-secondary);
|
| 73 |
+
}
|
| 74 |
+
.dir {
|
| 75 |
+
font-weight: 500;
|
| 76 |
+
}
|
| 77 |
+
</style>
|
| 78 |
+
</head>
|
| 79 |
+
<body>
|
| 80 |
+
<div class='controls'>
|
| 81 |
+
<a href='../index.html' class='back-button'>← back</a>
|
| 82 |
+
</div>
|
| 83 |
+
<h1>Index of /layer_norm/results</h1>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href='combined_results.html' class='file'>combined_results.html</a></li>
|
| 86 |
+
</ul>
|
| 87 |
+
</body>
|
| 88 |
+
</html>
|