drbh HF Staff committed on
Commit aa3ac98 · verified · 1 Parent(s): 17d52c9

Upload folder using huggingface_hub

activation/impls/cells/benchmark.py ADDED
@@ -0,0 +1,64 @@
+ # /// script
+ # requires-python = ">=3.10"
+ # dependencies = [
+ # "numpy",
+ # "torch",
+ # "kernels-benchmark-tools",
+ # "kernels",
+ # ]
+ #
+ # [tool.uv.sources]
+ # kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
+ # ///
+ import torch
+ import sys
+ import kernels_benchmark_tools as kbt
+ from kernels import get_kernel
+
+ # Load the activation kernel
+ activation = get_kernel("kernels-community/activation")
+
+
+ def hf_kernels_swiglu(input_tensor):
+     """HuggingFace Kernels SwiGLU implementation"""
+     hidden_dim = input_tensor.shape[-1] // 2
+     out_shape = input_tensor.shape[:-1] + (hidden_dim,)
+     out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
+     return activation.silu_and_mul(out, input_tensor)
+
+
+ # Register the implementation
+ kbt.add(
+     "hf_kernels_swiglu",
+     hf_kernels_swiglu,
+     tags={"family": "hf-kernels", "backend": "triton", "compile": "none"},
+ )
+
+ if __name__ == "__main__":
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     if device == "cpu":
+         print("HF Kernels SwiGLU requires CUDA - skipping benchmark")
+         sys.exit(0)
+
+     dtype = "bfloat16"
+
+     # Generate workloads - using a subset for faster testing
+     wl = list(kbt.activation.llama_workloads(dtype=dtype))[:3]  # First 3 workloads
+
+     print(f"Running SwiGLU benchmarks on {device} with {dtype}")
+     print(f"Testing {len(wl)} workloads")
+
+     # Run benchmark
+     kbt.run(
+         wl,
+         jsonl="activation.jsonl",
+         reps=5,
+         warmup=2,
+         gen=kbt.activation.gen_inputs,
+         ref=kbt.activation.ref_swiglu,
+         cmp=kbt.activation.cmp_allclose,
+         profile_trace=True
+     )
+
+     kbt.summarize(["activation.jsonl"])
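Aside (not part of the committed script): kbt.activation.ref_swiglu is used above as the numerical reference for silu_and_mul. A minimal pure-PyTorch sketch of such a reference, assuming the usual convention that the first half of the last dimension is the gate and the second half is the value path, might look like this:

import torch
import torch.nn.functional as F

def ref_swiglu(x: torch.Tensor) -> torch.Tensor:
    # Split the packed projection in half along the last dimension
    # (assumed to mirror the silu_and_mul layout used by the kernel above).
    gate, up = x.chunk(2, dim=-1)
    # SwiGLU: SiLU-gated elementwise product
    return F.silu(gate) * up

The actual kernels-benchmark-tools helper may differ in detail; the sketch only illustrates the shape contract (input [..., 2*hidden_dim] -> output [..., hidden_dim]) that hf_kernels_swiglu above also follows.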
activation/impls/cells/nv.py ADDED
@@ -0,0 +1,2 @@
+ import subprocess
+ print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
activation/impls/compiled_swiglu.html ADDED
The diff for this file is too large to render. See raw diff
 
activation/impls/hf_kernels_swiglu.html ADDED
The diff for this file is too large to render. See raw diff
 
activation/impls/index.html ADDED
@@ -0,0 +1,90 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+ <title>Index of /activation/impls</title>
+ <style>
+ :root {
+   --bg-primary: #0a0a0a;
+   --bg-secondary: #121212;
+   --bg-tertiary: #181818;
+   --text-primary: #e0e0e0;
+   --text-secondary: #888888;
+   --text-link: #64b5f6;
+   --border-primary: #2a2a2a;
+ }
+ body {
+   font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+   background: var(--bg-primary);
+   color: var(--text-primary);
+   margin: 0;
+   padding: 16px;
+   max-width: 900px;
+   margin: 0 auto;
+ }
+ .controls {
+   display: flex;
+   justify-content: flex-end;
+   margin-bottom: 1rem;
+ }
+ .back-button {
+   background: var(--bg-secondary);
+   border: 1px solid var(--border-primary);
+   padding: 8px 12px;
+   border-radius: 4px;
+   color: var(--text-secondary);
+   cursor: pointer;
+   font-size: 0.9rem;
+   text-decoration: none;
+   display: inline-block;
+ }
+ .back-button:hover {
+   color: var(--text-primary);
+   background: var(--bg-tertiary);
+ }
+ h1 {
+   font-size: 1.5em;
+   margin: 1rem 0;
+   color: var(--text-primary);
+   border-bottom: 1px solid var(--border-primary);
+   padding-bottom: 0.5rem;
+ }
+ ul {
+   list-style-type: none;
+   padding: 0;
+ }
+ li {
+   margin: 0;
+   border-bottom: 1px solid var(--border-primary);
+ }
+ li:last-child {
+   border-bottom: none;
+ }
+ a {
+   display: block;
+   padding: 0.75rem 0.5rem;
+   text-decoration: none;
+   color: var(--text-link);
+   transition: background 0.2s ease;
+ }
+ a:hover {
+   background: var(--bg-secondary);
+ }
+ .dir {
+   font-weight: 500;
+ }
+ </style>
+ </head>
+ <body>
+ <div class='controls'>
+ <a href='../index.html' class='back-button'>← back</a>
+ </div>
+ <h1>Index of /activation/impls</h1>
+ <ul>
+ <li><a href='compiled_swiglu.html' class='file'>compiled_swiglu.html</a></li>
+ <li><a href='hf_kernels_swiglu.html' class='file'>hf_kernels_swiglu.html</a></li>
+ <li><a href='torch_swiglu.html' class='file'>torch_swiglu.html</a></li>
+ </ul>
+ </body>
+ </html>
activation/impls/torch_swiglu.html ADDED
The diff for this file is too large to render. See raw diff
 
activation/index.html ADDED
@@ -0,0 +1,89 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+ <title>Index of /activation</title>
+ <style>
+ :root {
+   --bg-primary: #0a0a0a;
+   --bg-secondary: #121212;
+   --bg-tertiary: #181818;
+   --text-primary: #e0e0e0;
+   --text-secondary: #888888;
+   --text-link: #64b5f6;
+   --border-primary: #2a2a2a;
+ }
+ body {
+   font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+   background: var(--bg-primary);
+   color: var(--text-primary);
+   margin: 0;
+   padding: 16px;
+   max-width: 900px;
+   margin: 0 auto;
+ }
+ .controls {
+   display: flex;
+   justify-content: flex-end;
+   margin-bottom: 1rem;
+ }
+ .back-button {
+   background: var(--bg-secondary);
+   border: 1px solid var(--border-primary);
+   padding: 8px 12px;
+   border-radius: 4px;
+   color: var(--text-secondary);
+   cursor: pointer;
+   font-size: 0.9rem;
+   text-decoration: none;
+   display: inline-block;
+ }
+ .back-button:hover {
+   color: var(--text-primary);
+   background: var(--bg-tertiary);
+ }
+ h1 {
+   font-size: 1.5em;
+   margin: 1rem 0;
+   color: var(--text-primary);
+   border-bottom: 1px solid var(--border-primary);
+   padding-bottom: 0.5rem;
+ }
+ ul {
+   list-style-type: none;
+   padding: 0;
+ }
+ li {
+   margin: 0;
+   border-bottom: 1px solid var(--border-primary);
+ }
+ li:last-child {
+   border-bottom: none;
+ }
+ a {
+   display: block;
+   padding: 0.75rem 0.5rem;
+   text-decoration: none;
+   color: var(--text-link);
+   transition: background 0.2s ease;
+ }
+ a:hover {
+   background: var(--bg-secondary);
+ }
+ .dir {
+   font-weight: 500;
+ }
+ </style>
+ </head>
+ <body>
+ <div class='controls'>
+ <a href='../index.html' class='back-button'>← back</a>
+ </div>
+ <h1>Index of /activation</h1>
+ <ul>
+ <li><a href='impls/index.html' class='dir'>impls/</a></li>
+ <li><a href='results/index.html' class='dir'>results/</a></li>
+ </ul>
+ </body>
+ </html>
activation/results/index.html ADDED
@@ -0,0 +1,88 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset='UTF-8'>
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+ <title>Index of /activation/results</title>
+ <style>
+ :root {
+   --bg-primary: #0a0a0a;
+   --bg-secondary: #121212;
+   --bg-tertiary: #181818;
+   --text-primary: #e0e0e0;
+   --text-secondary: #888888;
+   --text-link: #64b5f6;
+   --border-primary: #2a2a2a;
+ }
+ body {
+   font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+   background: var(--bg-primary);
+   color: var(--text-primary);
+   margin: 0;
+   padding: 16px;
+   max-width: 900px;
+   margin: 0 auto;
+ }
+ .controls {
+   display: flex;
+   justify-content: flex-end;
+   margin-bottom: 1rem;
+ }
+ .back-button {
+   background: var(--bg-secondary);
+   border: 1px solid var(--border-primary);
+   padding: 8px 12px;
+   border-radius: 4px;
+   color: var(--text-secondary);
+   cursor: pointer;
+   font-size: 0.9rem;
+   text-decoration: none;
+   display: inline-block;
+ }
+ .back-button:hover {
+   color: var(--text-primary);
+   background: var(--bg-tertiary);
+ }
+ h1 {
+   font-size: 1.5em;
+   margin: 1rem 0;
+   color: var(--text-primary);
+   border-bottom: 1px solid var(--border-primary);
+   padding-bottom: 0.5rem;
+ }
+ ul {
+   list-style-type: none;
+   padding: 0;
+ }
+ li {
+   margin: 0;
+   border-bottom: 1px solid var(--border-primary);
+ }
+ li:last-child {
+   border-bottom: none;
+ }
+ a {
+   display: block;
+   padding: 0.75rem 0.5rem;
+   text-decoration: none;
+   color: var(--text-link);
+   transition: background 0.2s ease;
+ }
+ a:hover {
+   background: var(--bg-secondary);
+ }
+ .dir {
+   font-weight: 500;
+ }
+ </style>
+ </head>
+ <body>
+ <div class='controls'>
+ <a href='../index.html' class='back-button'>← back</a>
+ </div>
+ <h1>Index of /activation/results</h1>
+ <ul>
+ <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
+ </ul>
+ </body>
+ </html>
flash_attn/impls/cells/benchmark.py CHANGED
@@ -4,46 +4,42 @@
  # "numpy",
  # "torch",
  # "kernels-benchmark-tools",
- # "kernels",
+ # "xformers",
  # ]
  #
  # [tool.uv.sources]
- # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+ # kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
  # ///
  import torch
  import sys
  import os
  import kernels_benchmark_tools as kbt
- from kernels import get_kernel
+ import xformers.ops as xops

- hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn", revision="v0.0.2")

-
- def hf_flash_attention(query, key, value):
-     """HuggingFace Kernels Flash Attention"""
-     return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0]
+ def xformers_attention(q, k, v):
+     """xFormers memory efficient attention"""
+     # xFormers expects [batch, seq_len, heads, head_dim]
+     return xops.memory_efficient_attention(q, k, v)


  kbt.add(
-     "hf_kernels_flash_attn",
-     hf_flash_attention,
-     tags={"family": "hf-kernels", "backend": "flash-attn", "compile": "none"},
+     "xformers_meff",
+     xformers_attention,
+     tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
  )

  if __name__ == "__main__":
      device = "cuda" if torch.cuda.is_available() else "cpu"
-
-     if device == "cpu":
-         print("HF Kernels Flash Attention requires CUDA - skipping benchmark")
-         sys.exit(0)
-
-     dtype = "bfloat16"
+     dtype = "float32" if device == "cpu" else "bfloat16"

      # Flux-like workloads
-     base = 1024
-     flux_sizes = [128, 256, 320, 384, 448, 512]
-     heads = 24
-     head_dim = 128
+     base = 1024 if device == "cuda" else 512
+     flux_sizes = (
+         [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
+     )
+     heads = 24 if device == "cuda" else 8
+     head_dim = 128 if device == "cuda" else 64

      wl = []
      for L in flux_sizes:
@@ -68,5 +64,6 @@ if __name__ == "__main__":
          gen=kbt.attn.gen_qkv,
          ref=kbt.attn.ref_math,
          cmp=kbt.attn.cmp_allclose,
+         profile_trace=True
      )
      kbt.summarize(["attn.jsonl"])
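Aside (not part of the diff): the replacement implementation calls xops.memory_efficient_attention directly on [batch, seq_len, heads, head_dim] tensors, so unlike the torch SDPA path it needs no transposes. A minimal standalone sketch of that call, assuming a CUDA device with bfloat16 support and the flux_L128 shape (seq_len = 1024 + 128):

import torch
import xformers.ops as xops

# Shapes follow the [batch, seq_len, heads, head_dim] layout that xFormers expects.
q = torch.randn(1, 1152, 24, 128, device="cuda", dtype=torch.bfloat16)
k = torch.randn_like(q)
v = torch.randn_like(q)

out = xops.memory_efficient_attention(q, k, v)  # output has the same shape as q
print(out.shape)  # torch.Size([1, 1152, 24, 128])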
flash_attn/impls/cells/benchmark_default.py CHANGED
@@ -7,7 +7,7 @@
  # ]
  #
  # [tool.uv.sources]
- # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+ # kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
  # ///
  import torch
  import sys
@@ -66,5 +66,6 @@ if __name__ == "__main__":
          gen=kbt.attn.gen_qkv,
          ref=kbt.attn.ref_math,
          cmp=kbt.attn.cmp_allclose,
+         profile_trace=True
      )
      kbt.summarize(["attn_default.jsonl"])
flash_attn/impls/compiled_variants.html CHANGED
@@ -3829,7 +3829,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <div class="system-info">
  <div class="system-info-header">Generated on:</div>
  <div class="system-info-content">
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+ Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
  </div>
  </div>

@@ -3837,20 +3837,20 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <h1>Torch Compile Variants!</h1>
  <p>This file benchmarks Flash Attention with different torch.compile modes.</p>
  <h2>Flash Attention with torch.compile(mode="default")</h2>
- <div class="cell" id="cell-benchmark_default">
+ <div class="cell cell-failed" id="cell-benchmark_default">
  <div class="cell-header">
  <span class="collapse-indicators">
  <span onclick="toggleCode('benchmark_default')" style="cursor: pointer;">▼ code</span>
  <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark_default" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
- Cell: benchmark_default | 45.83s
+ Cell: benchmark_default | 0.02s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
  <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/compiled_variants.md" target="_blank" class="github-btn">GitHub</a>
  </div>
- <div id="code-benchmark_default" class="cell-code" data-lines="70">
+ <div id="code-benchmark_default" class="cell-code" data-lines="71">
  <div class="code-wrap">
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
@@ -3861,7 +3861,7 @@ Cell: benchmark_default | 45.83s
  <span class="c1"># ]</span>
  <span class="c1">#</span>
  <span class="c1"># [tool.uv.sources]</span>
- <span class="c1"># kernels-benchmark-tools = { git = &quot;https://github.com/drbh/kernels-benchmark-tools.git&quot;, branch = &quot;main&quot; }</span>
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
  <span class="c1"># ///</span>
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3920,6 +3920,7 @@ Cell: benchmark_default | 45.83s
  <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
  <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
  <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
+ <span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
  <span class="p">)</span>
  <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn_default.jsonl&quot;</span><span class="p">])</span>
  </pre></div>
@@ -3928,235 +3929,14 @@ Cell: benchmark_default | 45.83s
  </div>
  </div>
  <div id="output-benchmark_default" class="cell-output">
- <div class="cell-stdout">impl wl p50(ms) ok
- torch_flash_compiled_default flux_L128 0.36 True
- torch_flash_compiled_default flux_L256 0.50 True
- torch_flash_compiled_default flux_L320 0.54 True
- torch_flash_compiled_default flux_L384 0.59 True
- torch_flash_compiled_default flux_L448 0.61 True
- torch_flash_compiled_default flux_L512 0.64 True
- </div>
- <div class="uv-install-logs" id="uv-logs-benchmark_default">
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
- <div class="uv-logs-content" style="display: none;">
- Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
- Downloading nvidia-cusolver-cu12 (255.1MiB)
- Downloading matplotlib (8.3MiB)
- Downloading nvidia-cudnn-cu12 (674.0MiB)
- Downloading nvidia-cufile-cu12 (1.1MiB)
- Downloading kiwisolver (1.4MiB)
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
- Downloading fonttools (4.7MiB)
- Downloading triton (148.4MiB)
- Downloading numpy (15.9MiB)
- Downloading nvidia-cusparse-cu12 (274.9MiB)
- Downloading torch (846.8MiB)
- Downloading nvidia-cublas-cu12 (566.8MiB)
- Downloading nvidia-nccl-cu12 (307.4MiB)
- Downloading setuptools (1.1MiB)
- Downloading nvidia-curand-cu12 (60.7MiB)
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
- Downloading networkx (1.9MiB)
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
- Downloading nvidia-cufft-cu12 (184.2MiB)
- Downloading sympy (6.0MiB)
- Downloading pillow (6.3MiB)
- Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
- Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
- Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
- Downloading matplotlib
- Downloading nvidia-cuda-cupti-cu12
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
- Installed 37 packages in 203ms
- </div>
- </div>
- <div class="cell-artifacts">
- <h4>Artifacts:</h4>
- <a href="artifacts/benchmark_default/attn_default.jsonl" class="artifact" target="_blank">attn_default.jsonl</a>
+ <div class="cell-stderr"> × Failed to resolve script requirement
+ ╰─▶ Distribution not found at:
+     file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
  </div>
  </div>
  </div>

  <h2>Flash Attention with torch.compile(mode="max-autotune")</h2>
- <div class="cell" id="cell-benchmark_max_autotune">
- <div class="cell-header">
- <span class="collapse-indicators">
- <span onclick="toggleCode('benchmark_max_autotune')" style="cursor: pointer;">▼ code</span>
- <span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
- </span> |
- Cell: benchmark_max_autotune | 48.72s
- | <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
- <button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
- <a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
- <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/compiled_variants.md" target="_blank" class="github-btn">GitHub</a>
- </div>
- <div id="code-benchmark_max_autotune" class="cell-code" data-lines="70">
- <div class="code-wrap">
- <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
- <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
- <span class="c1"># dependencies = [</span>
- <span class="c1"># &quot;numpy&quot;,</span>
- <span class="c1"># &quot;torch&quot;,</span>
- <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
- <span class="c1"># ]</span>
- <span class="c1">#</span>
- <span class="c1"># [tool.uv.sources]</span>
- <span class="c1"># kernels-benchmark-tools = { git = &quot;https://github.com/drbh/kernels-benchmark-tools.git&quot;, branch = &quot;main&quot; }</span>
- <span class="c1"># ///</span>
- <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
- <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
- <span class="kn">import</span><span class="w"> </span><span class="nn">os</span>
- <span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
-
-
- <span class="k">def</span><span class="w"> </span><span class="nf">torch_flash_base</span><span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">):</span>
- <span class="n">qt</span><span class="p">,</span> <span class="n">kt</span><span class="p">,</span> <span class="n">vt</span> <span class="o">=</span> <span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">contiguous</span><span class="p">()</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">))</span>
- <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">sdpa_kernel</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">SDPBackend</span><span class="o">.</span><span class="n">FLASH_ATTENTION</span><span class="p">):</span>
- <span class="n">o</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">functional</span><span class="o">.</span><span class="n">scaled_dot_product_attention</span><span class="p">(</span><span class="n">qt</span><span class="p">,</span> <span class="n">kt</span><span class="p">,</span> <span class="n">vt</span><span class="p">)</span>
- <span class="k">return</span> <span class="n">o</span><span class="o">.</span><span class="n">transpose</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">contiguous</span><span class="p">()</span>
-
-
- <span class="c1"># Compile with max-autotune mode</span>
- <span class="n">compiled_flash_max_autotune</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">torch_flash_base</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s2">&quot;max-autotune&quot;</span><span class="p">,</span> <span class="n">fullgraph</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">dynamic</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
-
- <span class="n">kbt</span><span class="o">.</span><span class="n">add</span><span class="p">(</span>
- <span class="s2">&quot;torch_flash_compiled_max_autotune&quot;</span><span class="p">,</span>
- <span class="n">compiled_flash_max_autotune</span><span class="p">,</span>
- <span class="n">tags</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;family&quot;</span><span class="p">:</span> <span class="s2">&quot;torch-sdpa&quot;</span><span class="p">,</span> <span class="s2">&quot;backend&quot;</span><span class="p">:</span> <span class="s2">&quot;FLASH&quot;</span><span class="p">,</span> <span class="s2">&quot;compile&quot;</span><span class="p">:</span> <span class="s2">&quot;max-autotune&quot;</span><span class="p">},</span>
- <span class="p">)</span>
-
- <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
- <span class="n">device</span> <span class="o">=</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">if</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">is_available</span><span class="p">()</span> <span class="k">else</span> <span class="s2">&quot;cpu&quot;</span>
- <span class="n">dtype</span> <span class="o">=</span> <span class="s2">&quot;float32&quot;</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cpu&quot;</span> <span class="k">else</span> <span class="s2">&quot;bfloat16&quot;</span>
-
- <span class="c1"># Flux-like workloads</span>
- <span class="n">base</span> <span class="o">=</span> <span class="mi">1024</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">else</span> <span class="mi">512</span>
- <span class="n">flux_sizes</span> <span class="o">=</span> <span class="p">(</span>
- <span class="p">[</span><span class="mi">128</span><span class="p">,</span> <span class="mi">256</span><span class="p">,</span> <span class="mi">320</span><span class="p">,</span> <span class="mi">384</span><span class="p">,</span> <span class="mi">448</span><span class="p">,</span> <span class="mi">512</span><span class="p">]</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">else</span> <span class="p">[</span><span class="mi">64</span><span class="p">,</span> <span class="mi">128</span><span class="p">,</span> <span class="mi">192</span><span class="p">,</span> <span class="mi">256</span><span class="p">]</span>
- <span class="p">)</span>
- <span class="n">heads</span> <span class="o">=</span> <span class="mi">24</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">else</span> <span class="mi">8</span>
- <span class="n">head_dim</span> <span class="o">=</span> <span class="mi">128</span> <span class="k">if</span> <span class="n">device</span> <span class="o">==</span> <span class="s2">&quot;cuda&quot;</span> <span class="k">else</span> <span class="mi">64</span>
-
- <span class="n">wl</span> <span class="o">=</span> <span class="p">[]</span>
- <span class="k">for</span> <span class="n">L</span> <span class="ow">in</span> <span class="n">flux_sizes</span><span class="p">:</span>
- <span class="n">wl</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
- <span class="p">{</span>
- <span class="s2">&quot;name&quot;</span><span class="p">:</span> <span class="sa">f</span><span class="s2">&quot;flux_L</span><span class="si">{</span><span class="n">L</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">,</span>
- <span class="s2">&quot;batch&quot;</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span>
- <span class="s2">&quot;seq_len&quot;</span><span class="p">:</span> <span class="n">base</span> <span class="o">+</span> <span class="n">L</span><span class="p">,</span>
- <span class="s2">&quot;heads&quot;</span><span class="p">:</span> <span class="n">heads</span><span class="p">,</span>
- <span class="s2">&quot;head_dim&quot;</span><span class="p">:</span> <span class="n">head_dim</span><span class="p">,</span>
- <span class="s2">&quot;dtype&quot;</span><span class="p">:</span> <span class="n">dtype</span><span class="p">,</span>
- <span class="s2">&quot;device&quot;</span><span class="p">:</span> <span class="n">device</span><span class="p">,</span>
- <span class="s2">&quot;seed&quot;</span><span class="p">:</span> <span class="mi">0</span><span class="p">,</span>
- <span class="p">}</span>
- <span class="p">)</span>
-
- <span class="n">kbt</span><span class="o">.</span><span class="n">run</span><span class="p">(</span>
- <span class="n">wl</span><span class="p">,</span>
- <span class="n">jsonl</span><span class="o">=</span><span class="s2">&quot;attn_max_autotune.jsonl&quot;</span><span class="p">,</span>
- <span class="n">reps</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
- <span class="n">warmup</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
- <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
- <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
- <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
- <span class="p">)</span>
- <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn_max_autotune.jsonl&quot;</span><span class="p">])</span>
- </pre></div>
-
- <div class="code-line-highlight" id="line-highlight-benchmark_max_autotune"></div>
- </div>
- </div>
- <div id="output-benchmark_max_autotune" class="cell-output">
- <div class="cell-stdout">impl wl p50(ms) ok
- torch_flash_compiled_max_autotune flux_L128 0.38 True
- torch_flash_compiled_max_autotune flux_L256 0.55 True
- torch_flash_compiled_max_autotune flux_L320 0.61 True
- torch_flash_compiled_max_autotune flux_L384 0.66 True
- torch_flash_compiled_max_autotune flux_L448 0.70 True
- torch_flash_compiled_max_autotune flux_L512 0.76 True
- </div>
- <div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
- <div class="uv-logs-content" style="display: none;">
- Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
- Downloading setuptools (1.1MiB)
- Downloading pillow (6.3MiB)
- Downloading nvidia-cusolver-cu12 (255.1MiB)
- Downloading nvidia-nccl-cu12 (307.4MiB)
- Downloading nvidia-curand-cu12 (60.7MiB)
- Downloading networkx (1.9MiB)
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
- Downloading kiwisolver (1.4MiB)
- Downloading fonttools (4.7MiB)
- Downloading numpy (15.9MiB)
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
- Downloading nvidia-cudnn-cu12 (674.0MiB)
- Downloading nvidia-cufft-cu12 (184.2MiB)
- Downloading nvidia-cufile-cu12 (1.1MiB)
- Downloading matplotlib (8.3MiB)
- Downloading torch (846.8MiB)
- Downloading nvidia-cublas-cu12 (566.8MiB)
- Downloading nvidia-cusparse-cu12 (274.9MiB)
- Downloading triton (148.4MiB)
- Downloading sympy (6.0MiB)
- Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
- Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
- Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
- Downloading nvidia-cuda-cupti-cu12
- Downloading matplotlib
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cudnn-cu12
- Downloading nvidia-cublas-cu12
- Downloading torch
- Installed 37 packages in 208ms
- </div>
- </div>
- <div class="cell-artifacts">
- <h4>Artifacts:</h4>
- <a href="artifacts/benchmark_max_autotune/attn_max_autotune.jsonl" class="artifact" target="_blank">attn_max_autotune.jsonl</a>
- </div>
- </div>
- </div>
  </div>

  </body>
flash_attn/impls/flash_attention.html CHANGED
@@ -3829,7 +3829,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <div class="system-info">
  <div class="system-info-header">Generated on:</div>
  <div class="system-info-content">
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
+ Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
  </div>
  </div>

@@ -3843,7 +3843,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
- Cell: nv | 4.06s
+ Cell: nv | 0.23s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3860,34 +3860,22 @@ Cell: nv | 4.06s
  </div>
  </div>
  <div id="output-nv" class="cell-output">
- <div class="cell-stdout">Thu Oct 2 16:12:42 2025
+ <div class="cell-stdout">Wed Oct 22 08:58:24 2025
  +-----------------------------------------------------------------------------------------+
- | NVIDIA-SMI 560.35.05 Driver Version: 560.35.05 CUDA Version: 12.6 |
+ | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
  |-----------------------------------------+------------------------+----------------------+
  | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
  | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
  | | | MIG M. |
  |=========================================+========================+======================|
- | 0 NVIDIA L4 Off | 00000000:38:00.0 Off | 0 |
- | N/A 41C P0 27W / 72W | 1MiB / 23034MiB | 0% Default |
- | | | N/A |
- +-----------------------------------------+------------------------+----------------------+
- | 1 NVIDIA L4 Off | 00000000:3A:00.0 Off | 0 |
- | N/A 41C P0 27W / 72W | 1MiB / 23034MiB | 2% Default |
- | | | N/A |
- +-----------------------------------------+------------------------+----------------------+
- | 2 NVIDIA L4 Off | 00000000:3C:00.0 Off | 0 |
- | N/A 44C P0 29W / 72W | 1MiB / 23034MiB | 2% Default |
- | | | N/A |
- +-----------------------------------------+------------------------+----------------------+
- | 3 NVIDIA L4 Off | 00000000:3E:00.0 Off | 0 |
- | N/A 42C P0 29W / 72W | 1MiB / 23034MiB | 2% Default |
+ | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+ | N/A 26C P8 22W / 350W | 0MiB / 46068MiB | 0% Default |
  | | | N/A |
  +-----------------------------------------+------------------------+----------------------+

  +-----------------------------------------------------------------------------------------+
  | Processes: |
- | GPU GI CI PID Type Process name GPU Memory |
+ | GPU GI CI PID Type Process name GPU Memory |
  | ID ID Usage |
  |=========================================================================================|
  | No running processes found |
@@ -3898,20 +3886,20 @@ Cell: nv | 4.06s
  </div>

  <h2>Flash Attention Benchmark</h2>
- <div class="cell" id="cell-benchmark">
+ <div class="cell cell-failed" id="cell-benchmark">
  <div class="cell-header">
  <span class="collapse-indicators">
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
- Cell: benchmark | 38.14s
+ Cell: benchmark | 0.01s | FAILED
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/flash_attention.md" target="_blank" class="github-btn">GitHub</a>
  </div>
- <div id="code-benchmark" class="cell-code" data-lines="66">
+ <div id="code-benchmark" class="cell-code" data-lines="67">
  <div class="code-wrap">
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
@@ -3922,7 +3910,7 @@ Cell: benchmark | 38.14s
  <span class="c1"># ]</span>
  <span class="c1">#</span>
  <span class="c1"># [tool.uv.sources]</span>
- <span class="c1"># kernels-benchmark-tools = { git = &quot;https://github.com/drbh/kernels-benchmark-tools.git&quot;, branch = &quot;main&quot; }</span>
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
  <span class="c1"># ///</span>
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3977,6 +3965,7 @@ Cell: benchmark | 38.14s
  <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
  <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
  <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
+ <span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
  <span class="p">)</span>
  <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn.jsonl&quot;</span><span class="p">])</span>
  </pre></div>
@@ -3985,71 +3974,9 @@ Cell: benchmark | 38.14s
  </div>
  </div>
  <div id="output-benchmark" class="cell-output">
- <div class="cell-stdout">impl wl p50(ms) ok
- torch_flash_ma flux_L128 0.41 True
- torch_flash_ma flux_L256 0.52 True
- torch_flash_ma flux_L320 0.55 True
- torch_flash_ma flux_L384 0.59 True
- torch_flash_ma flux_L448 0.64 True
- torch_flash_ma flux_L512 0.68 True
- </div>
- <div class="uv-install-logs" id="uv-logs-benchmark">
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
- <div class="uv-logs-content" style="display: none;">
- Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
- Downloading nvidia-cusolver-cu12 (255.1MiB)
- Downloading nvidia-cufft-cu12 (184.2MiB)
- Downloading nvidia-curand-cu12 (60.7MiB)
- Downloading fonttools (4.7MiB)
- Downloading matplotlib (8.3MiB)
- Downloading nvidia-cufile-cu12 (1.1MiB)
- Downloading nvidia-cublas-cu12 (566.8MiB)
- Downloading nvidia-cudnn-cu12 (674.0MiB)
- Downloading nvidia-nccl-cu12 (307.4MiB)
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
- Downloading torch (846.8MiB)
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
- Downloading pillow (6.3MiB)
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
- Downloading sympy (6.0MiB)
- Downloading setuptools (1.1MiB)
- Downloading nvidia-cusparse-cu12 (274.9MiB)
- Downloading networkx (1.9MiB)
- Downloading triton (148.4MiB)
- Downloading kiwisolver (1.4MiB)
- Downloading numpy (15.9MiB)
- Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
- Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
- Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
- Downloading nvidia-cuda-cupti-cu12
- Downloading matplotlib
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
- Installed 37 packages in 224ms
- </div>
- </div>
- <div class="cell-artifacts">
- <h4>Artifacts:</h4>
- <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
+ <div class="cell-stderr"> × Failed to resolve script requirement
+ ╰─▶ Distribution not found at:
+     file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
  </div>
  </div>
  </div>
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -3829,40 +3829,40 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <div class="system-info">
  <div class="system-info-header">Generated on:</div>
  <div class="system-info-content">
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
  </div>
  </div>

  <div class="main-content">
  <h1>HF Kernels - Flash Attention</h1>
  <h2>HuggingFace Kernels Flash Attention Benchmark</h2>
- <div class="cell" id="cell-benchmark">
  <div class="cell-header">
  <span class="collapse-indicators">
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
- Cell: benchmark | 40.14s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/hf_kernels_flash_attn.md" target="_blank" class="github-btn">GitHub</a>
  <a href="https://huggingface.co/kernels-community/flash-attn2" target="_blank" class="hf-btn">🤗 HF</a>
  </div>
- <div id="code-benchmark" class="cell-code" data-lines="72">
  <div class="code-wrap">
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
  <span class="c1"># dependencies = [</span>
  <span class="c1"># &quot;numpy&quot;,</span>
- <span class="c1"># &quot;torch&quot;,</span>
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
  <span class="c1"># &quot;kernels&quot;,</span>
  <span class="c1"># ]</span>
  <span class="c1">#</span>
  <span class="c1"># [tool.uv.sources]</span>
- <span class="c1"># kernels-benchmark-tools = { git = &quot;https://github.com/drbh/kernels-benchmark-tools.git&quot;, branch = &quot;main&quot; }</span>
  <span class="c1"># ///</span>
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3870,7 +3870,7 @@ Cell: benchmark | 40.14s
  <span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
  <span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>

- <span class="n">hf_kernels_flash_attn</span> <span class="o">=</span> <span class="n">get_kernel</span><span class="p">(</span><span class="s2">&quot;kernels-community/flash-attn&quot;</span><span class="p">,</span> <span class="n">revision</span><span class="o">=</span><span class="s2">&quot;v0.0.2&quot;</span><span class="p">)</span>


  <span class="k">def</span><span class="w"> </span><span class="nf">hf_flash_attention</span><span class="p">(</span><span class="n">query</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
@@ -3922,6 +3922,7 @@ Cell: benchmark | 40.14s
  <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
  <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
  <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
  <span class="p">)</span>
  <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn.jsonl&quot;</span><span class="p">])</span>
  </pre></div>
@@ -3930,77 +3931,9 @@ Cell: benchmark | 40.14s
  </div>
  </div>
  <div id="output-benchmark" class="cell-output">
- <div class="cell-stdout">impl wl p50(ms) ok
- hf_kernels_flash_attn flux_L128 0.25 True
- hf_kernels_flash_attn flux_L256 0.32 True
- hf_kernels_flash_attn flux_L320 0.34 True
- hf_kernels_flash_attn flux_L384 0.35 True
- hf_kernels_flash_attn flux_L448 0.38 True
- hf_kernels_flash_attn flux_L512 0.42 True
- </div>
- <div class="uv-install-logs" id="uv-logs-benchmark">
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
- <div class="uv-logs-content" style="display: none;">
- Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
- Downloading kiwisolver (1.4MiB)
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
- Downloading matplotlib (8.3MiB)
- Downloading fonttools (4.7MiB)
- Downloading setuptools (1.1MiB)
- Downloading nvidia-nccl-cu12 (307.4MiB)
- Downloading nvidia-cusolver-cu12 (255.1MiB)
- Downloading sympy (6.0MiB)
- Downloading hf-xet (3.0MiB)
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
- Downloading pillow (6.3MiB)
- Downloading networkx (1.9MiB)
- Downloading numpy (15.9MiB)
- Downloading nvidia-cudnn-cu12 (674.0MiB)
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
- Downloading nvidia-cufile-cu12 (1.1MiB)
- Downloading nvidia-curand-cu12 (60.7MiB)
- Downloading nvidia-cusparse-cu12 (274.9MiB)
- Downloading nvidia-cufft-cu12 (184.2MiB)
- Downloading nvidia-cublas-cu12 (566.8MiB)
- Downloading torch (846.8MiB)
- Downloading triton (148.4MiB)
- Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
- Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading hf-xet
- Downloading setuptools
- Downloading networkx
- Downloading fonttools
- Downloading pillow
- Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
- Downloading matplotlib
- Downloading nvidia-cuda-cupti-cu12
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
3987
- Downloading nvidia-cusolver-cu12
3988
- Downloading nvidia-cusparselt-cu12
3989
- Downloading nvidia-cusparse-cu12
3990
- Downloading nvidia-nccl-cu12
3991
- Downloading nvidia-cublas-cu12
3992
- Downloading nvidia-cudnn-cu12
3993
- Downloading torch
3994
- Installed 47 packages in 255ms
3995
- </div>
3996
- </div>
3997
- <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
3998
- Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:05, 3.64it/s]
3999
- Fetching 20 files: 10%|█ | 2/20 [00:02&lt;00:22, 1.24s/it]
4000
- Fetching 20 files: 100%|██████████| 20/20 [00:02&lt;00:00, 9.14it/s]</div>
4001
- <div class="cell-artifacts">
4002
- <h4>Artifacts:</h4>
4003
- <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
4004
  </div>
4005
  </div>
4006
  </div>
 
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
+ Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>HF Kernels - Flash Attention</h1>
3838
  <h2>HuggingFace Kernels Flash Attention Benchmark</h2>
3839
+ <div class="cell cell-failed" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
+ Cell: benchmark | 0.01s | FAILED
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3850
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/hf_kernels_flash_attn.md" target="_blank" class="github-btn">GitHub</a>
3851
  <a href="https://huggingface.co/kernels-community/flash-attn2" target="_blank" class="hf-btn">🤗 HF</a>
3852
  </div>
3853
+ <div id="code-benchmark" class="cell-code" data-lines="73">
3854
  <div class="code-wrap">
3855
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
3856
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
3857
  <span class="c1"># dependencies = [</span>
3858
  <span class="c1"># &quot;numpy&quot;,</span>
3859
+ <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3860
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3861
  <span class="c1"># &quot;kernels&quot;,</span>
3862
  <span class="c1"># ]</span>
3863
  <span class="c1">#</span>
3864
  <span class="c1"># [tool.uv.sources]</span>
3865
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3866
  <span class="c1"># ///</span>
3867
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3868
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3870
  <span class="kn">import</span><span class="w"> </span><span class="nn">kernels_benchmark_tools</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">kbt</span>
3871
  <span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_kernel</span>
3872
 
3873
+ <span class="n">hf_kernels_flash_attn</span> <span class="o">=</span> <span class="n">get_kernel</span><span class="p">(</span><span class="s2">&quot;kernels-community/flash-attn&quot;</span><span class="p">)</span>
3874
 
3875
 
3876
  <span class="k">def</span><span class="w"> </span><span class="nf">hf_flash_attention</span><span class="p">(</span><span class="n">query</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
 
3922
  <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
3923
  <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
3924
  <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
3925
+ <span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
3926
  <span class="p">)</span>
3927
  <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn.jsonl&quot;</span><span class="p">])</span>
3928
  </pre></div>
 
3931
  </div>
3932
  </div>
3933
  <div id="output-benchmark" class="cell-output">
3934
+ <div class="cell-stderr"> × Failed to resolve script requirement
3935
+ ╰─▶ Distribution not found at:
3936
+ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
3937
  </div>
3938
  </div>
3939
  </div>
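Note: the FAILED benchmark cells in this commit (here and in the other attention pages below) all fail the same way: uv cannot resolve kernels-benchmark-tools because the [tool.uv.sources] entry now points at a local editable checkout (/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools) that does not exist on the machine that ran the cells. A minimal sketch of a script header that resolves on any runner, reverting to the git source used by the previously passing runs; carrying over the torch==2.8.0 pin from the updated cell is an assumption, not a requirement:

# /// script
# requires-python = ">=3.10"
# dependencies = [
#   "numpy",
#   "torch==2.8.0",
#   "kernels-benchmark-tools",
#   "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
# ///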
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -3829,28 +3829,28 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>HF Kernels - Flash Attention 3</h1>
3838
  <h2>HuggingFace Kernels Flash Attention 3 Benchmark</h2>
3839
- <div class="cell" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: benchmark | 40.68s
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3850
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/hf_kernels_flash_attn3.md" target="_blank" class="github-btn">GitHub</a>
3851
  <a href="https://huggingface.co/kernels-community/flash-attn3" target="_blank" class="hf-btn">🤗 HF</a>
3852
  </div>
3853
- <div id="code-benchmark" class="cell-code" data-lines="71">
3854
  <div class="code-wrap">
3855
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
3856
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
@@ -3862,7 +3862,7 @@ Cell: benchmark | 40.68s
3862
  <span class="c1"># ]</span>
3863
  <span class="c1">#</span>
3864
  <span class="c1"># [tool.uv.sources]</span>
3865
- <span class="c1"># kernels-benchmark-tools = { git = &quot;https://github.com/drbh/kernels-benchmark-tools.git&quot;, branch = &quot;main&quot; }</span>
3866
  <span class="c1"># ///</span>
3867
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3868
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3921,6 +3921,7 @@ Cell: benchmark | 40.68s
3921
  <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
3922
  <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
3923
  <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
 
3924
  <span class="p">)</span>
3925
  <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn.jsonl&quot;</span><span class="p">])</span>
3926
  </pre></div>
@@ -3929,77 +3930,9 @@ Cell: benchmark | 40.68s
3929
  </div>
3930
  </div>
3931
  <div id="output-benchmark" class="cell-output">
3932
- <div class="cell-stdout">impl wl p50(ms) ok
3933
- hf_kernels_flash_attn3 flux_L128 0.28 True
3934
- hf_kernels_flash_attn3 flux_L256 0.34 True
3935
- hf_kernels_flash_attn3 flux_L320 0.36 True
3936
- hf_kernels_flash_attn3 flux_L384 0.37 True
3937
- hf_kernels_flash_attn3 flux_L448 0.40 True
3938
- hf_kernels_flash_attn3 flux_L512 0.43 True
3939
- </div>
3940
- <div class="uv-install-logs" id="uv-logs-benchmark">
3941
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3942
- <div class="uv-logs-content" style="display: none;">
3943
- Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3944
- Downloading pillow (6.3MiB)
3945
- Downloading hf-xet (3.0MiB)
3946
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3947
- Downloading kiwisolver (1.4MiB)
3948
- Downloading fonttools (4.7MiB)
3949
- Downloading matplotlib (8.3MiB)
3950
- Downloading networkx (1.9MiB)
3951
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3952
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3953
- Downloading numpy (15.9MiB)
3954
- Downloading nvidia-cufile-cu12 (1.1MiB)
3955
- Downloading nvidia-nccl-cu12 (307.4MiB)
3956
- Downloading nvidia-cublas-cu12 (566.8MiB)
3957
- Downloading nvidia-cufft-cu12 (184.2MiB)
3958
- Downloading torch (846.8MiB)
3959
- Downloading triton (148.4MiB)
3960
- Downloading setuptools (1.1MiB)
3961
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3962
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3963
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3964
- Downloading nvidia-curand-cu12 (60.7MiB)
3965
- Downloading sympy (6.0MiB)
3966
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3967
- Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3968
- Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3969
- Downloading nvidia-cufile-cu12
3970
- Downloading kiwisolver
3971
- Downloading hf-xet
3972
- Downloading setuptools
3973
- Downloading networkx
3974
- Downloading fonttools
3975
- Downloading pillow
3976
- Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3977
- Downloading matplotlib
3978
- Downloading nvidia-cuda-cupti-cu12
3979
- Downloading numpy
3980
- Downloading sympy
3981
- Downloading nvidia-nvjitlink-cu12
3982
- Downloading nvidia-curand-cu12
3983
- Downloading nvidia-cuda-nvrtc-cu12
3984
- Downloading triton
3985
- Downloading nvidia-cufft-cu12
3986
- Downloading nvidia-cusolver-cu12
3987
- Downloading nvidia-cusparse-cu12
3988
- Downloading nvidia-cusparselt-cu12
3989
- Downloading nvidia-nccl-cu12
3990
- Downloading nvidia-cublas-cu12
3991
- Downloading nvidia-cudnn-cu12
3992
- Downloading torch
3993
- Installed 47 packages in 229ms
3994
- </div>
3995
- </div>
3996
- <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
3997
- Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 3.56it/s]
3998
- Fetching 4 files: 50%|█████ | 2/4 [00:02&lt;00:02, 1.32s/it]
3999
- Fetching 4 files: 100%|██████████| 4/4 [00:02&lt;00:00, 1.72it/s]</div>
4000
- <div class="cell-artifacts">
4001
- <h4>Artifacts:</h4>
4002
- <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
4003
  </div>
4004
  </div>
4005
  </div>
 
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
+ Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>HF Kernels - Flash Attention 3</h1>
3838
  <h2>HuggingFace Kernels Flash Attention 3 Benchmark</h2>
3839
+ <div class="cell cell-failed" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
+ Cell: benchmark | 0.05s | FAILED
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3850
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/hf_kernels_flash_attn3.md" target="_blank" class="github-btn">GitHub</a>
3851
  <a href="https://huggingface.co/kernels-community/flash-attn3" target="_blank" class="hf-btn">🤗 HF</a>
3852
  </div>
3853
+ <div id="code-benchmark" class="cell-code" data-lines="72">
3854
  <div class="code-wrap">
3855
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
3856
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 
3862
  <span class="c1"># ]</span>
3863
  <span class="c1">#</span>
3864
  <span class="c1"># [tool.uv.sources]</span>
3865
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3866
  <span class="c1"># ///</span>
3867
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3868
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3921
  <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
3922
  <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
3923
  <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
3924
+ <span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
3925
  <span class="p">)</span>
3926
  <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn.jsonl&quot;</span><span class="p">])</span>
3927
  </pre></div>
 
3930
  </div>
3931
  </div>
3932
  <div id="output-benchmark" class="cell-output">
3933
+ <div class="cell-stderr"> × Failed to resolve script requirement
3934
+ ╰─▶ Distribution not found at:
3935
+ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
3936
  </div>
3937
  </div>
3938
  </div>
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -3829,27 +3829,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>Memory Efficient Attention Implementation</h1>
3838
  <h2>Memory Efficient SDPA Benchmark</h2>
3839
- <div class="cell" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: benchmark | 39.23s
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3850
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/mem_efficient_attention.md" target="_blank" class="github-btn">GitHub</a>
3851
  </div>
3852
- <div id="code-benchmark" class="cell-code" data-lines="68">
3853
  <div class="code-wrap">
3854
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
3855
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
@@ -3860,7 +3860,7 @@ Cell: benchmark | 39.23s
3860
  <span class="c1"># ]</span>
3861
  <span class="c1">#</span>
3862
  <span class="c1"># [tool.uv.sources]</span>
3863
- <span class="c1"># kernels-benchmark-tools = { git = &quot;https://github.com/drbh/kernels-benchmark-tools.git&quot;, branch = &quot;main&quot; }</span>
3864
  <span class="c1"># ///</span>
3865
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3866
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3917,6 +3917,7 @@ Cell: benchmark | 39.23s
3917
  <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
3918
  <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
3919
  <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
 
3920
  <span class="p">)</span>
3921
  <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn.jsonl&quot;</span><span class="p">])</span>
3922
  </pre></div>
@@ -3925,71 +3926,9 @@ Cell: benchmark | 39.23s
3925
  </div>
3926
  </div>
3927
  <div id="output-benchmark" class="cell-output">
3928
- <div class="cell-stdout">impl wl p50(ms) ok
3929
- torch_mem_eff flux_L128 0.48 True
3930
- torch_mem_eff flux_L256 0.63 True
3931
- torch_mem_eff flux_L320 0.70 True
3932
- torch_mem_eff flux_L384 0.83 True
3933
- torch_mem_eff flux_L448 0.95 True
3934
- torch_mem_eff flux_L512 1.00 True
3935
- </div>
3936
- <div class="uv-install-logs" id="uv-logs-benchmark">
3937
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3938
- <div class="uv-logs-content" style="display: none;">
3939
- Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3940
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3941
- Downloading networkx (1.9MiB)
3942
- Downloading sympy (6.0MiB)
3943
- Downloading fonttools (4.7MiB)
3944
- Downloading nvidia-cufft-cu12 (184.2MiB)
3945
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3946
- Downloading triton (148.4MiB)
3947
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3948
- Downloading nvidia-cublas-cu12 (566.8MiB)
3949
- Downloading nvidia-curand-cu12 (60.7MiB)
3950
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3951
- Downloading nvidia-nccl-cu12 (307.4MiB)
3952
- Downloading matplotlib (8.3MiB)
3953
- Downloading pillow (6.3MiB)
3954
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3955
- Downloading kiwisolver (1.4MiB)
3956
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3957
- Downloading torch (846.8MiB)
3958
- Downloading setuptools (1.1MiB)
3959
- Downloading numpy (15.9MiB)
3960
- Downloading nvidia-cufile-cu12 (1.1MiB)
3961
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3962
- Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3963
- Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3964
- Downloading nvidia-cufile-cu12
3965
- Downloading kiwisolver
3966
- Downloading setuptools
3967
- Downloading fonttools
3968
- Downloading networkx
3969
- Downloading pillow
3970
- Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3971
- Downloading nvidia-cuda-cupti-cu12
3972
- Downloading matplotlib
3973
- Downloading numpy
3974
- Downloading sympy
3975
- Downloading nvidia-nvjitlink-cu12
3976
- Downloading nvidia-curand-cu12
3977
- Downloading nvidia-cuda-nvrtc-cu12
3978
- Downloading triton
3979
- Downloading nvidia-cufft-cu12
3980
- Downloading nvidia-cusolver-cu12
3981
- Downloading nvidia-cusparse-cu12
3982
- Downloading nvidia-cusparselt-cu12
3983
- Downloading nvidia-nccl-cu12
3984
- Downloading nvidia-cublas-cu12
3985
- Downloading nvidia-cudnn-cu12
3986
- Downloading torch
3987
- Installed 37 packages in 248ms
3988
- </div>
3989
- </div>
3990
- <div class="cell-artifacts">
3991
- <h4>Artifacts:</h4>
3992
- <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
3993
  </div>
3994
  </div>
3995
  </div>
 
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
+ Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>Memory Efficient Attention Implementation</h1>
3838
  <h2>Memory Efficient SDPA Benchmark</h2>
3839
+ <div class="cell cell-failed" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
+ Cell: benchmark | 0.01s | FAILED
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3850
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/mem_efficient_attention.md" target="_blank" class="github-btn">GitHub</a>
3851
  </div>
3852
+ <div id="code-benchmark" class="cell-code" data-lines="69">
3853
  <div class="code-wrap">
3854
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
3855
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 
3860
  <span class="c1"># ]</span>
3861
  <span class="c1">#</span>
3862
  <span class="c1"># [tool.uv.sources]</span>
3863
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3864
  <span class="c1"># ///</span>
3865
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3866
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3917
  <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
3918
  <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
3919
  <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
3920
+ <span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
3921
  <span class="p">)</span>
3922
  <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn.jsonl&quot;</span><span class="p">])</span>
3923
  </pre></div>
 
3926
  </div>
3927
  </div>
3928
  <div id="output-benchmark" class="cell-output">
3929
+ <div class="cell-stderr"> × Failed to resolve script requirement
3930
+ ╰─▶ Distribution not found at:
3931
+ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
3932
  </div>
3933
  </div>
3934
  </div>
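For context on the torch_mem_eff rows above, a minimal sketch of how a memory-efficient SDPA baseline can be registered with kernels-benchmark-tools. The wrapper body and tags are assumptions (the actual cell lives in cells/benchmark.py), and the tensor layout produced by kbt.attn.gen_qkv is not visible in this diff:

import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
import kernels_benchmark_tools as kbt


def torch_mem_eff(query, key, value):
    # Pin SDPA to the memory-efficient backend so the benchmark is not
    # silently routed to the flash or math implementations.
    with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
        return torch.nn.functional.scaled_dot_product_attention(query, key, value)


# Registration mirrors the kbt.add(...) calls in the added benchmark cells;
# the tag values here are illustrative.
kbt.add(
    "torch_mem_eff",
    torch_mem_eff,
    tags={"family": "torch-sdpa", "backend": "mem_efficient", "compile": "none"},
)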
flash_attn/impls/sage_attention.html CHANGED
@@ -3829,27 +3829,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>SageAttention Implementation</h1>
3838
  <h2>SageAttention Benchmark (INT8 Quantized)</h2>
3839
- <div class="cell" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: benchmark | 41.27s
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3850
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/sage_attention.md" target="_blank" class="github-btn">GitHub</a>
3851
  </div>
3852
- <div id="code-benchmark" class="cell-code" data-lines="79">
3853
  <div class="code-wrap">
3854
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
3855
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
@@ -3862,7 +3862,7 @@ Cell: benchmark | 41.27s
3862
  <span class="c1"># ]</span>
3863
  <span class="c1">#</span>
3864
  <span class="c1"># [tool.uv.sources]</span>
3865
- <span class="c1"># kernels-benchmark-tools = { git = &quot;https://github.com/drbh/kernels-benchmark-tools.git&quot;, branch = &quot;main&quot; }</span>
3866
  <span class="c1"># ///</span>
3867
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3868
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3928,6 +3928,7 @@ Cell: benchmark | 41.27s
3928
  <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
3929
  <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
3930
  <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
 
3931
  <span class="p">)</span>
3932
  <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn.jsonl&quot;</span><span class="p">])</span>
3933
  </pre></div>
@@ -3936,84 +3937,9 @@ Cell: benchmark | 41.27s
3936
  </div>
3937
  </div>
3938
  <div id="output-benchmark" class="cell-output">
3939
- <div class="cell-stdout">impl wl p50(ms) ok
3940
- sage_int8_fp16 flux_L128 FAIL False
3941
- Error: module &#x27;sage_attention_a8eb63760f50ebd&#x27; has no attribute &#x27;fwd&#x27;
3942
- sage_int8_fp16 flux_L256 FAIL False
3943
- Error: module &#x27;sage_attention_a8eb63760f50ebd&#x27; has no attribute &#x27;fwd&#x27;
3944
- sage_int8_fp16 flux_L320 FAIL False
3945
- Error: module &#x27;sage_attention_a8eb63760f50ebd&#x27; has no attribute &#x27;fwd&#x27;
3946
- sage_int8_fp16 flux_L384 FAIL False
3947
- Error: module &#x27;sage_attention_a8eb63760f50ebd&#x27; has no attribute &#x27;fwd&#x27;
3948
- sage_int8_fp16 flux_L448 FAIL False
3949
- Error: module &#x27;sage_attention_a8eb63760f50ebd&#x27; has no attribute &#x27;fwd&#x27;
3950
- sage_int8_fp16 flux_L512 FAIL False
3951
- Error: module &#x27;sage_attention_a8eb63760f50ebd&#x27; has no attribute &#x27;fwd&#x27;
3952
- </div>
3953
- <div class="uv-install-logs" id="uv-logs-benchmark">
3954
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3955
- <div class="uv-logs-content" style="display: none;">
3956
- Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3957
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3958
- Downloading nvidia-cufile-cu12 (1.1MiB)
3959
- Downloading hf-xet (3.0MiB)
3960
- Downloading sympy (6.0MiB)
3961
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3962
- Downloading nvidia-nccl-cu12 (307.4MiB)
3963
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3964
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3965
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3966
- Downloading kiwisolver (1.4MiB)
3967
- Downloading pillow (6.3MiB)
3968
- Downloading nvidia-cublas-cu12 (566.8MiB)
3969
- Downloading numpy (15.9MiB)
3970
- Downloading nvidia-curand-cu12 (60.7MiB)
3971
- Downloading matplotlib (8.3MiB)
3972
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3973
- Downloading fonttools (4.7MiB)
3974
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3975
- Downloading nvidia-cufft-cu12 (184.2MiB)
3976
- Downloading setuptools (1.1MiB)
3977
- Downloading networkx (1.9MiB)
3978
- Downloading triton (148.4MiB)
3979
- Downloading torch (846.8MiB)
3980
- Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3981
- Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3982
- Downloading nvidia-cufile-cu12
3983
- Downloading kiwisolver
3984
- Downloading hf-xet
3985
- Downloading setuptools
3986
- Downloading networkx
3987
- Downloading fonttools
3988
- Downloading pillow
3989
- Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3990
- Downloading matplotlib
3991
- Downloading nvidia-cuda-cupti-cu12
3992
- Downloading numpy
3993
- Downloading sympy
3994
- Downloading nvidia-nvjitlink-cu12
3995
- Downloading nvidia-curand-cu12
3996
- Downloading nvidia-cuda-nvrtc-cu12
3997
- Downloading triton
3998
- Downloading nvidia-cufft-cu12
3999
- Downloading nvidia-cusolver-cu12
4000
- Downloading nvidia-cusparselt-cu12
4001
- Downloading nvidia-cusparse-cu12
4002
- Downloading nvidia-nccl-cu12
4003
- Downloading nvidia-cublas-cu12
4004
- Downloading nvidia-cudnn-cu12
4005
- Downloading torch
4006
- Installed 48 packages in 239ms
4007
- </div>
4008
- </div>
4009
- <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
4010
- Fetching 11 files: 9%|▉ | 1/11 [00:00&lt;00:05, 1.85it/s]
4011
- Fetching 11 files: 45%|████▌ | 5/11 [00:00&lt;00:00, 6.46it/s]
4012
- Fetching 11 files: 73%|███████▎ | 8/11 [00:01&lt;00:00, 10.07it/s]
4013
- Fetching 11 files: 100%|██████████| 11/11 [00:01&lt;00:00, 10.94it/s]</div>
4014
- <div class="cell-artifacts">
4015
- <h4>Artifacts:</h4>
4016
- <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
4017
  </div>
4018
  </div>
4019
  </div>
 
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
+ Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>SageAttention Implementation</h1>
3838
  <h2>SageAttention Benchmark (INT8 Quantized)</h2>
3839
+ <div class="cell cell-failed" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
+ Cell: benchmark | 0.05s | FAILED
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3850
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/sage_attention.md" target="_blank" class="github-btn">GitHub</a>
3851
  </div>
3852
+ <div id="code-benchmark" class="cell-code" data-lines="80">
3853
  <div class="code-wrap">
3854
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
3855
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 
3862
  <span class="c1"># ]</span>
3863
  <span class="c1">#</span>
3864
  <span class="c1"># [tool.uv.sources]</span>
3865
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3866
  <span class="c1"># ///</span>
3867
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3868
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3928
  <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
3929
  <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
3930
  <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
3931
+ <span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
3932
  <span class="p">)</span>
3933
  <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn.jsonl&quot;</span><span class="p">])</span>
3934
  </pre></div>
 
3937
  </div>
3938
  </div>
3939
  <div id="output-benchmark" class="cell-output">
3940
+ <div class="cell-stderr"> × Failed to resolve script requirement
3941
+ ╰─▶ Distribution not found at:
3942
+ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
3943
  </div>
3944
  </div>
3945
  </div>
flash_attn/impls/xformers.html CHANGED
@@ -3829,27 +3829,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
- Linux x86_64 | Linux-5.15.0-1084-aws-x86_64-with-glibc2.31
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>xFormers Memory Efficient Attention</h1>
3838
  <h2>xFormers Benchmark</h2>
3839
- <div class="cell" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: benchmark | 41.87s
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3850
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/xformers.md" target="_blank" class="github-btn">GitHub</a>
3851
  </div>
3852
- <div id="code-benchmark" class="cell-code" data-lines="68">
3853
  <div class="code-wrap">
3854
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
3855
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
@@ -3861,7 +3861,7 @@ Cell: benchmark | 41.87s
3861
  <span class="c1"># ]</span>
3862
  <span class="c1">#</span>
3863
  <span class="c1"># [tool.uv.sources]</span>
3864
- <span class="c1"># kernels-benchmark-tools = { git = &quot;https://github.com/drbh/kernels-benchmark-tools.git&quot;, branch = &quot;main&quot; }</span>
3865
  <span class="c1"># ///</span>
3866
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3867
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
@@ -3917,6 +3917,7 @@ Cell: benchmark | 41.87s
3917
  <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
3918
  <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
3919
  <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
 
3920
  <span class="p">)</span>
3921
  <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn.jsonl&quot;</span><span class="p">])</span>
3922
  </pre></div>
@@ -3925,73 +3926,9 @@ Cell: benchmark | 41.87s
3925
  </div>
3926
  </div>
3927
  <div id="output-benchmark" class="cell-output">
3928
- <div class="cell-stdout">impl wl p50(ms) ok
3929
- xformers_meff flux_L128 0.35 True
3930
- xformers_meff flux_L256 0.41 True
3931
- xformers_meff flux_L320 0.43 True
3932
- xformers_meff flux_L384 0.44 True
3933
- xformers_meff flux_L448 0.48 True
3934
- xformers_meff flux_L512 0.50 True
3935
- </div>
3936
- <div class="uv-install-logs" id="uv-logs-benchmark">
3937
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3938
- <div class="uv-logs-content" style="display: none;">
3939
- Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3940
- Downloading kiwisolver (1.4MiB)
3941
- Downloading setuptools (1.1MiB)
3942
- Downloading nvidia-curand-cu12 (60.7MiB)
3943
- Downloading nvidia-cufft-cu12 (184.2MiB)
3944
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3945
- Downloading nvidia-cufile-cu12 (1.1MiB)
3946
- Downloading pillow (6.3MiB)
3947
- Downloading numpy (15.9MiB)
3948
- Downloading matplotlib (8.3MiB)
3949
- Downloading fonttools (4.7MiB)
3950
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3951
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3952
- Downloading xformers (111.8MiB)
3953
- Downloading networkx (1.9MiB)
3954
- Downloading nvidia-nccl-cu12 (307.4MiB)
3955
- Downloading nvidia-cublas-cu12 (566.8MiB)
3956
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3957
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3958
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3959
- Downloading sympy (6.0MiB)
3960
- Downloading triton (148.4MiB)
3961
- Downloading torch (846.8MiB)
3962
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3963
- Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3964
- Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3965
- Downloading nvidia-cufile-cu12
3966
- Downloading kiwisolver
3967
- Downloading setuptools
3968
- Downloading networkx
3969
- Downloading fonttools
3970
- Downloading pillow
3971
- Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3972
- Downloading matplotlib
3973
- Downloading nvidia-cuda-cupti-cu12
3974
- Downloading numpy
3975
- Downloading sympy
3976
- Downloading nvidia-nvjitlink-cu12
3977
- Downloading nvidia-curand-cu12
3978
- Downloading nvidia-cuda-nvrtc-cu12
3979
- Downloading xformers
3980
- Downloading triton
3981
- Downloading nvidia-cufft-cu12
3982
- Downloading nvidia-cusolver-cu12
3983
- Downloading nvidia-cusparse-cu12
3984
- Downloading nvidia-cusparselt-cu12
3985
- Downloading nvidia-nccl-cu12
3986
- Downloading nvidia-cublas-cu12
3987
- Downloading nvidia-cudnn-cu12
3988
- Downloading torch
3989
- Installed 38 packages in 250ms
3990
- </div>
3991
- </div>
3992
- <div class="cell-artifacts">
3993
- <h4>Artifacts:</h4>
3994
- <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
3995
  </div>
3996
  </div>
3997
  </div>
 
3829
  <div class="system-info">
3830
  <div class="system-info-header">Generated on:</div>
3831
  <div class="system-info-content">
3832
+ Linux x86_64 | Linux-5.10.244-240.965.amzn2.x86_64-x86_64-with-glibc2.35
3833
  </div>
3834
  </div>
3835
 
3836
  <div class="main-content">
3837
  <h1>xFormers Memory Efficient Attention</h1>
3838
  <h2>xFormers Benchmark</h2>
3839
+ <div class="cell cell-failed" id="cell-benchmark">
3840
  <div class="cell-header">
3841
  <span class="collapse-indicators">
3842
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3843
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3844
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3845
  </span> |
3846
+ Cell: benchmark | 0.01s | FAILED
3847
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3849
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3850
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/xformers.md" target="_blank" class="github-btn">GitHub</a>
3851
  </div>
3852
+ <div id="code-benchmark" class="cell-code" data-lines="69">
3853
  <div class="code-wrap">
3854
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
3855
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 
3861
  <span class="c1"># ]</span>
3862
  <span class="c1">#</span>
3863
  <span class="c1"># [tool.uv.sources]</span>
3864
+ <span class="c1"># kernels-benchmark-tools = { path = &quot;/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools&quot;, editable = true }</span>
3865
  <span class="c1"># ///</span>
3866
  <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
3867
  <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
 
3917
  <span class="n">gen</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">gen_qkv</span><span class="p">,</span>
3918
  <span class="n">ref</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">ref_math</span><span class="p">,</span>
3919
  <span class="n">cmp</span><span class="o">=</span><span class="n">kbt</span><span class="o">.</span><span class="n">attn</span><span class="o">.</span><span class="n">cmp_allclose</span><span class="p">,</span>
3920
+ <span class="n">profile_trace</span><span class="o">=</span><span class="kc">True</span>
3921
  <span class="p">)</span>
3922
  <span class="n">kbt</span><span class="o">.</span><span class="n">summarize</span><span class="p">([</span><span class="s2">&quot;attn.jsonl&quot;</span><span class="p">])</span>
3923
  </pre></div>
 
3926
  </div>
3927
  </div>
3928
  <div id="output-benchmark" class="cell-output">
3929
+ <div class="cell-stderr"> × Failed to resolve script requirement
3930
+ ╰─▶ Distribution not found at:
3931
+ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
3932
  </div>
3933
  </div>
3934
  </div>
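Similarly, the xformers_meff implementation benchmarked above most likely wraps xformers.ops.memory_efficient_attention; a hypothetical sketch follows, with the caveat that memory_efficient_attention expects (batch, seq, heads, head_dim) inputs and the layout from kbt.attn.gen_qkv is not shown here:

import xformers.ops as xops
import kernels_benchmark_tools as kbt


def xformers_meff(query, key, value):
    # attn_bias, dropout probability, and scale are left at their defaults.
    return xops.memory_efficient_attention(query, key, value)


# Registration name and tags are assumptions, mirroring the kbt.add(...) style
# used by the added benchmark cells.
kbt.add(
    "xformers_meff",
    xformers_meff,
    tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
)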
index.html CHANGED
@@ -79,7 +79,9 @@
79
  <body>
80
  <h1>Index of /</h1>
81
  <ul>
 
82
  <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
 
83
  </ul>
84
  </body>
85
  </html>
 
79
  <body>
80
  <h1>Index of /</h1>
81
  <ul>
82
+ <li><a href='activation/index.html' class='dir'>activation/</a></li>
83
  <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
84
+ <li><a href='layer_norm/index.html' class='dir'>layer_norm/</a></li>
85
  </ul>
86
  </body>
87
  </html>
layer_norm/impls/cells/benchmark.py ADDED
@@ -0,0 +1,62 @@
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "numpy",
5
+ # "torch",
6
+ # "kernels",
7
+ # "kernels-benchmark-tools",
8
+ # ]
9
+ #
10
+ # [tool.uv.sources]
11
+ # kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
12
+ # ///
13
+ import torch
14
+ from kernels import get_kernel
15
+ import kernels_benchmark_tools as kbt
16
+
17
+ layer_norm_kernel = get_kernel("kernels-community/layer-norm")
18
+
19
+ def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
20
+ B, S, D = x.shape
21
+ # The kernel expects [N, D] input; support beta (bias) if provided.
22
+ out = layer_norm_kernel.dropout_add_ln_fwd(
23
+ input=x.view(-1, D),
24
+ gamma=weight,
25
+ beta=bias,
26
+ rowscale=None,
27
+ colscale=None,
28
+ x0_subset=None,
29
+ z_subset=None,
30
+ dropout_p=0.0,
31
+ epsilon=eps,
32
+ rowscale_const=1.0,
33
+ z_numrows=S,
34
+ gen=None,
35
+ residual_in_fp32=False,
36
+ is_rms_norm=False,
37
+ )[0].view(B, S, D)
38
+ return out
39
+
40
+ kbt.add(
41
+ "hf_kernels_layer_norm",
42
+ hf_kernels_layer_norm,
43
+ tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
44
+ )
45
+
46
+ if __name__ == "__main__":
47
+ device = "cuda" if torch.cuda.is_available() else "cpu"
48
+ dtype = "float32" if device == "cpu" else "bfloat16"
49
+
50
+ wl = list(kbt.layer_norm.llama_workloads(dtype)) if device == "cuda" else list(kbt.layer_norm.cpu_workloads(dtype))
51
+
52
+ kbt.run(
53
+ wl,
54
+ jsonl="ln.jsonl",
55
+ reps=5,
56
+ warmup=2,
57
+ gen=kbt.layer_norm.gen_inputs,
58
+ ref=kbt.layer_norm.ref_layer_norm,
59
+ cmp=kbt.layer_norm.cmp_allclose,
60
+ profile_trace=False,
61
+ )
62
+ kbt.summarize(["ln.jsonl"])
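For comparison, the torch_layer_norm.html page added below presumably benchmarks a torch-native baseline against the same workloads; a hypothetical sketch of such a cell, reusing the kbt helpers from the script above (the registration name and tags are assumptions, not copied from the repo):

import torch
import kernels_benchmark_tools as kbt


def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
    # F.layer_norm normalizes over the last dimension, matching the
    # [B, S, D] inputs handled by the kernel wrapper above.
    return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)


kbt.add(
    "torch_layer_norm",
    torch_layer_norm,
    tags={"family": "torch", "op": "layer_norm"},
)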
layer_norm/impls/cells/nv.py ADDED
@@ -0,0 +1,2 @@
1
+ import subprocess
2
+ print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
layer_norm/impls/hf_kernels_layer_norm.html ADDED
The diff for this file is too large to render. See raw diff
 
layer_norm/impls/index.html ADDED
@@ -0,0 +1,89 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /layer_norm/impls</title>
7
+ <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
+ </style>
78
+ </head>
79
+ <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
+ <h1>Index of /layer_norm/impls</h1>
84
+ <ul>
85
+ <li><a href='hf_kernels_layer_norm.html' class='file'>hf_kernels_layer_norm.html</a></li>
86
+ <li><a href='torch_layer_norm.html' class='file'>torch_layer_norm.html</a></li>
87
+ </ul>
88
+ </body>
89
+ </html>
layer_norm/impls/torch_layer_norm.html ADDED
The diff for this file is too large to render. See raw diff
 
layer_norm/index.html ADDED
@@ -0,0 +1,89 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /layer_norm</title>
7
+ <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
+ </style>
78
+ </head>
79
+ <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
+ <h1>Index of /layer_norm</h1>
84
+ <ul>
85
+ <li><a href='impls/index.html' class='dir'>impls/</a></li>
86
+ <li><a href='results/index.html' class='dir'>results/</a></li>
87
+ </ul>
88
+ </body>
89
+ </html>
layer_norm/results/index.html ADDED
@@ -0,0 +1,88 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /layer_norm/results</title>
7
+ <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
+ </style>
78
+ </head>
79
+ <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
+ <h1>Index of /layer_norm/results</h1>
84
+ <ul>
85
+ <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
86
+ </ul>
87
+ </body>
88
+ </html>