Upload folder using huggingface_hub
- activation/impls/artifacts/benchmark/activation.jsonl +9 -9
- activation/impls/hf_kernels_swiglu.html +101 -99
- activation/impls/torch_swiglu.html +124 -124
- activation/results/artifacts/combine/latency.svg +2 -2
- activation/results/combined_results.html +76 -76
- causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -24
- causal_conv1d/impls/cells/benchmark.py +18 -9
- causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
- causal_conv1d/impls/torch_causal_conv1d.html +0 -0
- causal_conv1d/results/artifacts/combine/latency.svg +1 -1
- causal_conv1d/results/combined_results.html +141 -141
- deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl +4 -0
- deformable_detr/impls/cells/benchmark.py +118 -0
- deformable_detr/impls/cells/nv.py +2 -0
- deformable_detr/impls/hf_kernels_deformable_detr.html +0 -0
- deformable_detr/impls/index.html +89 -0
- deformable_detr/impls/torch_deformable_detr.html +0 -0
- deformable_detr/index.html +89 -0
- deformable_detr/results/artifacts/combine/latency.svg +3 -0
- deformable_detr/results/cells/combine.py +26 -0
- deformable_detr/results/combined_results.html +0 -0
- deformable_detr/results/index.html +88 -0
- flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
- flash_attn/impls/cells/benchmark.py +8 -10
- flash_attn/impls/flash_attention.html +143 -149
- flash_attn/impls/hf_kernels_flash_attn.html +97 -102
- flash_attn/impls/hf_kernels_flash_attn3.html +79 -79
- flash_attn/impls/mem_efficient_attention.html +133 -133
- flash_attn/impls/sage_attention.html +18 -14
- flash_attn/impls/xformers.html +137 -91
- flash_attn/results/artifacts/combine/latency.svg +2 -2
- flash_attn/results/combined_results.html +143 -143
- index.html +205 -51
- layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -4
- layer_norm/impls/cells/benchmark.py +5 -28
- layer_norm/impls/hf_kernels_layer_norm.html +59 -56
- layer_norm/impls/torch_layer_norm.html +56 -62
- layer_norm/results/artifacts/combine/latency.svg +2 -2
- layer_norm/results/combined_results.html +53 -53
- openai_moe/impls/artifacts/benchmark/openai_moe.jsonl +8 -0
- openai_moe/impls/binned_torch.html +0 -0
- openai_moe/impls/cells/benchmark.py +136 -0
- openai_moe/impls/cells/nv.py +2 -0
- openai_moe/impls/gpt_oss_moe.html +0 -0
- openai_moe/impls/index.html +89 -0
- openai_moe/index.html +89 -0
- openai_moe/results/artifacts/combine/latency.svg +3 -0
- openai_moe/results/cells/combine.py +27 -0
- openai_moe/results/combined_results.html +0 -0
- openai_moe/results/index.html +88 -0
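For reference, a commit like this one can be produced with the `upload_folder` API from `huggingface_hub`. A minimal sketch — the folder path and repo id below are illustrative, not taken from this commit:

```python
# Minimal sketch of pushing a results folder to the Hub (illustrative names).
from huggingface_hub import HfApi

api = HfApi()  # picks up HF_TOKEN from the environment or the cached login
api.upload_folder(
    folder_path="kernels-benchmarks-results",     # local folder to upload
    repo_id="user/kernels-benchmarks-results",    # hypothetical target repo
    repo_type="dataset",                          # assumption: results live in a dataset repo
    commit_message="Upload folder using huggingface_hub",
)
```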
activation/impls/artifacts/benchmark/activation.jsonl
CHANGED

@@ -1,9 +1,9 @@
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024230000008174102, "p50": 0.024741000004269154, "p90": 0.025410999967334646, "mean": 0.024872599999525846, "iqr": 0.0011599999538702832, "raw_times": [0.024251000013464363, 0.025730000004386966, 0.024230000008174102, 0.025410999967334646, 0.024741000004269154], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03134100001034312, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026611000009779673, "p50": 0.029731000040555955, "p90": 0.03027100001418148, "mean": 0.029349000021738902, "iqr": 0.0009999999974752427, "raw_times": [0.026611000009779673, 0.029731000040555955, 0.030861000027471164, 0.03027100001418148, 0.02927100001670624], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.034871000025304966, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027259999967554904, "p50": 0.02879100003383428, "p90": 0.030951000042023225, "mean": 0.029224800016436348, "iqr": 0.0029600000175378227, "raw_times": [0.027991000024485402, 0.031131000014283927, 0.02879100003383428, 0.030951000042023225, 0.027259999967554904], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.0323909999906391, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025391000008312403, "p50": 0.02888100004838634, "p90": 0.029160999986288516, "mean": 0.028055000007043418, "iqr": 0.001839999981712026, "raw_times": [0.025391000008312403, 0.02888100004838634, 0.02952099998765334, 0.029160999986288516, 0.02732100000457649], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031509999985246395, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.026630000036220736, "p50": 0.027450000004591857, "p90": 0.027921000025799003, "mean": 0.02735460001304091, "iqr": 0.0010800000040944724, "raw_times": [0.026630000036220736, 0.027450000004591857, 0.02684100002170453, 0.027921000025799003, 0.027930999976888415], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03172099997073019, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025049999976545223, "p50": 0.02733100001250932, "p90": 0.028329999963716546, "mean": 0.02741439998317219, "iqr": 0.0016189999882953998, "raw_times": [0.025049999976545223, 0.029649999987668707, 0.028329999963716546, 0.02733100001250932, 0.026710999975421146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029950999987704563, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028341000017917395, "p50": 0.02927099995986282, "p90": 0.029501000028631097, "mean": 0.02909080000108588, "iqr": 0.0009110000291912002, "raw_times": [0.028341000017917395, 0.02927099995986282, 0.029501000028631097, 0.029750999999578198, 0.028589999999439897], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03009099998507736, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024770999971224228, "p50": 0.02814099997294761, "p90": 0.028720999978304462, "mean": 0.0278467999919485, "iqr": 0.0007409999511764909, "raw_times": [0.024770999971224228, 0.02798000002712797, 0.028720999978304462, 0.02814099997294761, 0.029621000010138232], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031990999957542954, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-31T20:01:08Z", "run": "c3623842075144ab92176d6468514bae", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027751000004627713, "p50": 0.028230999987499672, "p90": 0.029471000004832604, "mean": 0.028608800005258672, "iqr": 0.0016500000015184924, "raw_times": [0.028230999987499672, 0.027751000004627713, 0.02782100000331411, 0.02977000002601926, 0.029471000004832604], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030850999962694914, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
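Each line of this artifact is one self-contained benchmark record: implementation, workload (`wl`), environment (`env`), latency statistics in milliseconds (`lat_ms`, with percentiles over `raw_times`), and a correctness check against a `swiglu_fp32` reference. A minimal sketch of consuming these records, assuming only the schema visible above (in every record shown, `p50` is exactly the median of `raw_times`):

```python
import json
from statistics import median

# Summarize the benchmark records shown in the diff above.
with open("activation/impls/artifacts/benchmark/activation.jsonl") as f:
    records = [json.loads(line) for line in f if line.strip()]

for rec in records:
    lat = rec["lat_ms"]
    # In the records above, p50 equals the median of the raw timing samples.
    assert abs(median(lat["raw_times"]) - lat["p50"]) < 1e-12
    print(f'{rec["impl"]:>20} {rec["wl"]["name"]:>16} '
          f'p50={lat["p50"] * 1e3:6.1f}us ok={rec["ok"]}')
```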
activation/impls/hf_kernels_swiglu.html
CHANGED

@@ -4106,11 +4106,12 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: nv | 0.
+Cell: nv | 0.26s
 | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/activation/impls/hf_kernels_swiglu.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/kernels-community/activation" target="_blank" class="hf-btn">🤗 HF</a>
 </div>
 <div id="code-nv" class="cell-code" data-lines="2">
 <div class="code-wrap">

@@ -4122,7 +4123,7 @@ Cell: nv | 0.23s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">
+<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:17 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
 |-----------------------------------------+------------------------+----------------------+

@@ -4131,7 +4132,7 @@ Cell: nv | 0.23s
 | | | MIG M. |
 |=========================================+========================+======================|
 | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A
+| N/A 33C P0 108W / 350W | 0MiB / 46068MiB | 88% Default |
 | | | N/A |
 +-----------------------------------------+------------------------+----------------------+

@@ -4155,11 +4156,12 @@ Cell: nv | 0.23s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 4.
+Cell: benchmark | 4.19s
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
-<a href="https://github.com/huggingface/kernels-
+<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/activation/impls/hf_kernels_swiglu.md" target="_blank" class="github-btn">GitHub</a>
+<a href="https://huggingface.co/kernels-community/activation" target="_blank" class="hf-btn">🤗 HF</a>
 </div>
 <div id="code-benchmark" class="cell-code" data-lines="34">
 <div class="code-wrap">

@@ -4211,17 +4213,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
-Activity Buffer Request
-aten::empty
-cudaLaunchKernel 2.
-cudaDeviceSynchronize 0.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 105.055us 2585.65% 105.055us 105.055us 1
+hf_kernels_swiglu 11.41% 202.714us 99.64% 1.770ms 1.770ms 0.000us 0.00% 5.471us 5.471us 1
+_activation_beeaae6::silu_and_mul 1.18% 21.050us 84.47% 1.501ms 500.190us 4.063us 100.00% 5.471us 1.824us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3
+Activity Buffer Request 80.70% 1.434ms 80.70% 1.434ms 1.434ms 1.408us 34.65% 1.408us 1.408us 1
+aten::empty 3.76% 66.772us 3.76% 66.772us 22.257us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 2.58% 45.872us 2.58% 45.872us 15.291us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.36% 6.420us 0.36% 6.420us 6.420us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 4.
+Self CPU time total: 1.776ms
+Self CUDA time total: 4.063us

@@ -4231,17 +4233,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu 6.
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
-Activity Buffer Request
-aten::empty 1.
-cudaLaunchKernel 1.
-cudaDeviceSynchronize 0.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 61.119us 1540.69% 61.119us 61.119us 1
+hf_kernels_swiglu 6.50% 104.811us 99.67% 1.607ms 1.607ms 0.000us 0.00% 5.279us 5.279us 1
+_activation_beeaae6::silu_and_mul 1.26% 20.331us 91.95% 1.482ms 494.073us 3.967us 100.00% 5.279us 1.760us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.967us 100.00% 3.967us 1.322us 3
+Activity Buffer Request 89.13% 1.437ms 89.13% 1.437ms 1.437ms 1.312us 33.07% 1.312us 1.312us 1
+aten::empty 1.22% 19.632us 1.22% 19.632us 6.544us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 1.56% 25.120us 1.56% 25.120us 8.373us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.33% 5.360us 0.33% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 3.
+Self CPU time total: 1.612ms
+Self CUDA time total: 3.967us

@@ -4251,17 +4253,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.
-hf_kernels_swiglu 6.
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
-Activity Buffer Request
-aten::empty 1.
-cudaLaunchKernel 1.
-cudaDeviceSynchronize 0.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.488us 1288.31% 63.488us 63.488us 1
+hf_kernels_swiglu 6.89% 111.363us 99.67% 1.611ms 1.611ms 0.000us 0.00% 6.592us 6.592us 1
+_activation_beeaae6::silu_and_mul 1.36% 22.028us 91.47% 1.479ms 492.912us 4.928us 100.00% 6.592us 2.197us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.928us 100.00% 4.928us 1.643us 3
+Activity Buffer Request 88.52% 1.431ms 88.52% 1.431ms 1.431ms 1.664us 33.77% 1.664us 1.664us 1
+aten::empty 1.30% 21.081us 1.30% 21.081us 7.027us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 1.59% 25.652us 1.59% 25.652us 8.551us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.33% 5.390us 0.33% 5.390us 5.390us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 4.
+Self CPU time total: 1.617ms
+Self CUDA time total: 4.928us

@@ -4271,17 +4273,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu 5.
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
-Activity Buffer Request
-aten::empty 1.
-cudaLaunchKernel
-cudaDeviceSynchronize 0.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.000us 1585.82% 68.000us 68.000us 1
+hf_kernels_swiglu 5.97% 106.915us 99.70% 1.784ms 1.784ms 0.000us 0.00% 5.760us 5.760us 1
+_activation_beeaae6::silu_and_mul 1.16% 20.770us 92.62% 1.658ms 552.564us 4.288us 100.00% 5.760us 1.920us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.288us 100.00% 4.288us 1.429us 3
+Activity Buffer Request 80.58% 1.442ms 80.58% 1.442ms 1.442ms 1.472us 34.33% 1.472us 1.472us 1
+aten::empty 1.10% 19.770us 1.10% 19.770us 6.590us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 10.88% 194.785us 10.88% 194.785us 64.928us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.30% 5.350us 0.30% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 4.
+Self CPU time total: 1.790ms
+Self CUDA time total: 4.288us

@@ -4291,17 +4293,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 4.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
-Activity Buffer Request
-aten::empty
-cudaLaunchKernel
-cudaDeviceSynchronize 1.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.599us 1108.28% 65.599us 65.599us 1
+hf_kernels_swiglu 18.75% 89.073us 98.88% 469.813us 469.813us 0.000us 0.00% 7.903us 7.903us 1
+_activation_beeaae6::silu_and_mul 4.69% 22.280us 76.20% 362.069us 120.690us 5.919us 100.00% 7.903us 2.634us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 100.00% 5.919us 1.973us 3
+Activity Buffer Request 38.23% 181.645us 38.23% 181.645us 181.645us 1.984us 33.52% 1.984us 1.984us 1
+aten::empty 3.93% 18.671us 3.93% 18.671us 6.224us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 33.28% 158.144us 33.28% 158.144us 52.715us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 1.12% 5.330us 1.12% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total: 5.
+Self CPU time total: 475.143us
+Self CUDA time total: 5.919us

@@ -4311,17 +4313,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
-Activity Buffer Request 82.
-aten::empty 1.
-cudaLaunchKernel
-cudaDeviceSynchronize 0.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.207us 906.60% 70.207us 70.207us 1
+hf_kernels_swiglu 6.12% 106.261us 99.74% 1.733ms 1.733ms 0.000us 0.00% 10.336us 10.336us 1
+_activation_beeaae6::silu_and_mul 1.25% 21.782us 92.41% 1.606ms 535.254us 7.744us 100.00% 10.336us 3.445us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 100.00% 7.744us 2.581us 3
+Activity Buffer Request 82.36% 1.431ms 82.36% 1.431ms 1.431ms 2.592us 33.47% 2.592us 2.592us 1
+aten::empty 1.21% 21.081us 1.21% 21.081us 7.027us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 8.80% 152.893us 8.80% 152.893us 50.964us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.26% 4.511us 0.26% 4.511us 4.511us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 7.
+Self CPU time total: 1.738ms
+Self CUDA time total: 7.744us

@@ -4331,17 +4333,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul 1.
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
-Activity Buffer Request
-aten::empty 1.
-cudaLaunchKernel 8.
-cudaDeviceSynchronize 0.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.214us 1045.06% 69.214us 69.214us 1
+hf_kernels_swiglu 7.00% 122.783us 99.73% 1.750ms 1.750ms 0.000us 0.00% 8.830us 8.830us 1
+_activation_beeaae6::silu_and_mul 1.22% 21.430us 91.58% 1.607ms 535.694us 6.623us 100.00% 8.830us 2.943us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.623us 100.00% 6.623us 2.208us 3
+Activity Buffer Request 81.74% 1.434ms 81.74% 1.434ms 1.434ms 2.207us 33.32% 2.207us 2.207us 1
+aten::empty 1.15% 20.211us 1.15% 20.211us 6.737us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 8.62% 151.304us 8.62% 151.304us 50.435us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.27% 4.780us 0.27% 4.780us 4.780us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.
-Self CUDA time total: 6.
+Self CPU time total: 1.755ms
+Self CUDA time total: 6.623us

@@ -4351,17 +4353,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu
-_activation_beeaae6::silu_and_mul
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
-Activity Buffer Request
-aten::empty
-cudaLaunchKernel
-cudaDeviceSynchronize 1.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.152us 692.52% 65.152us 65.152us 1
+hf_kernels_swiglu 21.62% 91.474us 98.93% 418.571us 418.571us 0.000us 0.00% 12.576us 12.576us 1
+_activation_beeaae6::silu_and_mul 4.88% 20.631us 69.03% 292.067us 97.356us 9.408us 100.00% 12.576us 4.192us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.408us 100.00% 9.408us 3.136us 3
+Activity Buffer Request 28.63% 121.143us 28.63% 121.143us 121.143us 3.168us 33.67% 3.168us 3.168us 1
+aten::empty 8.28% 35.030us 8.28% 35.030us 11.677us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 35.52% 150.293us 35.52% 150.293us 50.098us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 1.07% 4.530us 1.07% 4.530us 4.530us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total: 9.
+Self CPU time total: 423.101us
+Self CUDA time total: 9.408us

@@ -4371,17 +4373,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us
-hf_kernels_swiglu 22.
-_activation_beeaae6::silu_and_mul
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.
-Activity Buffer Request
-aten::empty 4.
-cudaLaunchKernel
-cudaDeviceSynchronize 1.
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.197us 514.72% 67.197us 67.197us 1
+hf_kernels_swiglu 22.39% 97.642us 98.93% 431.481us 431.481us 0.000us 0.00% 17.439us 17.439us 1
+_activation_beeaae6::silu_and_mul 4.99% 21.781us 71.94% 313.789us 104.596us 13.055us 100.00% 17.439us 5.813us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.055us 100.00% 13.055us 4.352us 3
+Activity Buffer Request 32.48% 141.684us 32.48% 141.684us 141.684us 4.384us 33.58% 4.384us 4.384us 1
+aten::empty 4.60% 20.050us 4.60% 20.050us 6.683us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 34.47% 150.324us 34.47% 150.324us 50.108us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 1.07% 4.681us 1.07% 4.681us 4.681us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total: 13.
+Self CPU time total: 436.162us
+Self CUDA time total: 13.055us
 
 impl wl p50(ms) ok

@@ -4398,12 +4400,12 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 15 packages in
+Installed 15 packages in 15ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
-Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00,
-Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00,
+Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 15.31it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 21.41it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
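The PROFILE TRACE tables above (and in the torch_eager diff below) are standard `torch.profiler` key-average tables. A minimal sketch of how such a table can be produced — the eager SwiGLU here is an illustrative stand-in, not the actual harness in `cells/benchmark.py`:

```python
# Sketch: profiling an eager SwiGLU, which yields the torch_eager-style rows
# (aten::silu, aten::mul, elementwise_kernel, cudaLaunchKernel, ...) seen below.
import torch
import torch.nn.functional as F
from torch.profiler import ProfilerActivity, profile, record_function

x = torch.randn(128, 2 * 768, device="cuda", dtype=torch.bfloat16)

def swiglu_eager(t):
    # Reference SwiGLU: silu(gate) * up; chunk() returns views (aten::slice rows).
    gate, up = t.chunk(2, dim=-1)
    return F.silu(gate) * up

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("torch_eager"):  # named row, like the ones in the tables
        for _ in range(3):                # "# of Calls" = 3 for the kernel rows above
            swiglu_eager(x)
    torch.cuda.synchronize()              # accounts for the cudaDeviceSynchronize row

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))
```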
activation/impls/torch_swiglu.html
CHANGED
|
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
|
|
| 4106 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 4107 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 4108 |
</span> |
|
| 4109 |
-
Cell: nv | 0.
|
| 4110 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 4111 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 4112 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
| 4113 |
-
<a href="https://github.com/huggingface/kernels-
|
| 4114 |
</div>
|
| 4115 |
<div id="code-nv" class="cell-code" data-lines="2">
|
| 4116 |
<div class="code-wrap">
|
|
@@ -4122,7 +4122,7 @@ Cell: nv | 0.23s
|
|
| 4122 |
</div>
|
| 4123 |
</div>
|
| 4124 |
<div id="output-nv" class="cell-output">
|
| 4125 |
-
<div class="cell-stdout"><pre class="stdout-text">
|
| 4126 |
+-----------------------------------------------------------------------------------------+
|
| 4127 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 4128 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -4131,7 +4131,7 @@ Cell: nv | 0.23s
|
|
| 4131 |
| | | MIG M. |
|
| 4132 |
|=========================================+========================+======================|
|
| 4133 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 4134 |
-
| N/A
|
| 4135 |
| | | N/A |
|
| 4136 |
+-----------------------------------------+------------------------+----------------------+
|
| 4137 |
|
|
@@ -4155,11 +4155,11 @@ Cell: nv | 0.23s
|
|
| 4155 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 4156 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4157 |
</span> |
|
| 4158 |
-
Cell: benchmark |
|
| 4159 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 4160 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 4161 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 4162 |
-
<a href="https://github.com/huggingface/kernels-
|
| 4163 |
</div>
|
| 4164 |
<div id="code-benchmark" class="cell-code" data-lines="28">
|
| 4165 |
<div class="code-wrap">
|
|
@@ -4205,20 +4205,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
|
|
| 4205 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4206 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4207 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4208 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4209 |
-
torch_eager 11.
|
| 4210 |
-
aten::silu 3.37% 63.
|
| 4211 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.497us
|
| 4212 |
-
aten::mul 1.
|
| 4213 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4214 |
-
Activity Buffer Request 76.
|
| 4215 |
-
aten::slice 2.
|
| 4216 |
-
aten::as_strided 0.
|
| 4217 |
-
cudaLaunchKernel 3.
|
| 4218 |
-
cudaDeviceSynchronize 0.
|
| 4219 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4220 |
-
Self CPU time total: 1.
|
| 4221 |
-
Self CUDA time total: 12.
|
| 4222 |
|
| 4223 |
|
| 4224 |
|
|
@@ -4228,20 +4228,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
|
|
| 4228 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4229 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4230 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4231 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4232 |
-
torch_eager 6.
|
| 4233 |
-
aten::silu 2.
|
| 4234 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4235 |
-
aten::mul 1.
|
| 4236 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.
|
| 4237 |
-
Activity Buffer Request 84.
|
| 4238 |
-
aten::slice 1.
|
| 4239 |
-
aten::as_strided 0.
|
| 4240 |
-
cudaLaunchKernel 2.
|
| 4241 |
-
cudaDeviceSynchronize 0.
|
| 4242 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4243 |
-
Self CPU time total: 1.
|
| 4244 |
-
Self CUDA time total: 12.
|
| 4245 |
|
| 4246 |
|
| 4247 |
|
|
@@ -4251,20 +4251,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
|
|
| 4251 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4252 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4253 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4254 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4255 |
-
torch_eager 6.
|
| 4256 |
-
aten::silu 2.
|
| 4257 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4258 |
-
aten::mul 1.
|
| 4259 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4260 |
-
Activity Buffer Request 84.
|
| 4261 |
-
aten::slice 1.
|
| 4262 |
-
aten::as_strided 0.
|
| 4263 |
-
cudaLaunchKernel 2.
|
| 4264 |
-
cudaDeviceSynchronize 0.
|
| 4265 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4266 |
-
Self CPU time total: 1.
|
| 4267 |
-
Self CUDA time total: 13.
|
| 4268 |
|
| 4269 |
|
| 4270 |
|
|
@@ -4274,20 +4274,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
|
|
| 4274 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4275 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4276 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4277 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4278 |
-
torch_eager
|
| 4279 |
-
aten::silu 2.
|
| 4280 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.
|
| 4281 |
-
aten::mul 1.
|
| 4282 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4283 |
-
Activity Buffer Request
|
| 4284 |
-
aten::slice 1.
|
| 4285 |
-
aten::as_strided 0.
|
| 4286 |
-
cudaLaunchKernel
|
| 4287 |
-
cudaDeviceSynchronize 0.
|
| 4288 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4289 |
-
Self CPU time total: 1.
|
| 4290 |
-
Self CUDA time total: 12.
|
| 4291 |
|
| 4292 |
|
| 4293 |
|
|
@@ -4297,20 +4297,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
|
|
| 4297 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4298 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4299 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4300 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4301 |
-
torch_eager 5.
|
| 4302 |
-
aten::silu 2.
|
| 4303 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4304 |
-
aten::mul 1.
|
| 4305 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4306 |
-
Activity Buffer Request 78.
|
| 4307 |
-
aten::slice 1.
|
| 4308 |
-
aten::as_strided 0.
|
| 4309 |
-
cudaLaunchKernel 9.
|
| 4310 |
-
cudaDeviceSynchronize 0.28%
|
| 4311 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4312 |
-
Self CPU time total: 1.
|
| 4313 |
-
Self CUDA time total: 13.
|
| 4314 |
|
| 4315 |
|
| 4316 |
|
|
@@ -4320,20 +4320,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
|
|
| 4320 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4321 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4322 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4323 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4324 |
-
torch_eager
|
| 4325 |
-
aten::silu
|
| 4326 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4327 |
-
aten::mul 5.
|
| 4328 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
|
| 4329 |
-
Activity Buffer Request
|
| 4330 |
-
aten::slice
|
| 4331 |
-
aten::as_strided 1.
|
| 4332 |
-
cudaLaunchKernel
|
| 4333 |
-
cudaDeviceSynchronize
|
| 4334 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4335 |
-
Self CPU time total:
|
| 4336 |
-
Self CUDA time total: 15.
|
| 4337 |
|
| 4338 |
|
| 4339 |
|
|
@@ -4343,20 +4343,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768

| 4343 | ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4344 | Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
| 4345 | ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4346 | - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
| 4347 | - torch_eager
| 4348 | - aten::silu 2.
| 4349 | - void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
| 4350 | - aten::mul 1.
| 4351 | - void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
| 4352 | - Activity Buffer Request
| 4353 | - aten::slice 1.
| 4354 | - aten::as_strided 0.
| 4355 | - cudaLaunchKernel 9.
| 4356 | - cudaDeviceSynchronize 0.
| 4357 | ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4358 | - Self CPU time total: 1.
| 4359 | - Self CUDA time total: 14.
| 4360 |
| 4361 |
| 4362 |
@@ -4366,20 +4366,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024

| 4366 | ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4367 | Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
| 4368 | ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4369 | - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
| 4370 | - torch_eager
| 4371 | - aten::silu
| 4372 | - void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
| 4373 | - aten::mul
| 4374 | - void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.
| 4375 | - Activity Buffer Request
| 4376 | - aten::slice 5.
| 4377 | - aten::as_strided 1.
| 4378 | - cudaLaunchKernel 35.
| 4379 | - cudaDeviceSynchronize
| 4380 | ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4381 | - Self CPU time total:
| 4382 | - Self CUDA time total: 15.
| 4383 |
| 4384 |
| 4385 |
@@ -4389,20 +4389,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048

| 4389 | ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4390 | Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
| 4391 | ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4392 | - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
| 4393 | - torch_eager 6.
| 4394 | - aten::silu 2.
| 4395 | - void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.
| 4396 | - aten::mul 1.
| 4397 | - void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.
| 4398 | - Activity Buffer Request
| 4399 | - aten::slice 1.
| 4400 | - aten::as_strided 0.
| 4401 | - cudaLaunchKernel 9.
| 4402 | - cudaDeviceSynchronize 0.
| 4403 | ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4404 | - Self CPU time total: 1.
| 4405 | - Self CUDA time total: 22.
| 4406 |
| 4407 |
| 4408 | impl wl p50(ms) ok
|
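The aten::silu, aten::mul, and aten::slice rows in the torch_eager traces above are what an eager-mode SwiGLU decomposes into. For orientation, a minimal sketch of such an implementation; the halved-projection gating below is an assumption, not the repo's actual torch_swiglu code:

    # Hedged sketch of an eager SwiGLU matching the trace rows above:
    # two aten::slice views, one aten::silu, one aten::mul.
    import torch
    import torch.nn.functional as F

    def torch_eager_swiglu(x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2                     # assumed fused gate/up layout
        return F.silu(x[..., :d]) * x[..., d:]   # silu(gate) * up

    x = torch.randn(512, 2 * 2048, device="cuda", dtype=torch.bfloat16)
    y = torch_eager_swiglu(x)                    # shaped like the cuda_T512_D2048 workload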
@@ -4419,7 +4419,7 @@ torch_eager cuda_T512_D768 0.05 True

| 4419 | <div class="uv-install-logs" id="uv-logs-benchmark">
| 4420 | <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
| 4421 | <div class="uv-logs-content" style="display: none;">
| 4422 | - Installed 37 packages in
| 4423 | </div>
| 4424 | </div>
| 4425 | <div class="cell-artifacts">
|
|
|
| 4106 | <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
| 4107 | <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
| 4108 | </span> |
| 4109 | + Cell: nv | 0.26s
| 4110 | | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
| 4111 | <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
| 4112 | <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
| 4113 | + <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/activation/impls/torch_swiglu.md" target="_blank" class="github-btn">GitHub</a>
| 4114 | </div>
| 4115 | <div id="code-nv" class="cell-code" data-lines="2">
| 4116 | <div class="code-wrap">
|
|
|
| 4122 | </div>
| 4123 | </div>
| 4124 | <div id="output-nv" class="cell-output">
| 4125 | + <div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:17 2025
| 4126 | +-----------------------------------------------------------------------------------------+
| 4127 | | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
| 4128 | |-----------------------------------------+------------------------+----------------------+

| 4131 | | | | MIG M. |
| 4132 | |=========================================+========================+======================|
| 4133 | | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
| 4134 | + | N/A 33C P0 108W / 350W | 0MiB / 46068MiB | 88% Default |
| 4135 | | | | N/A |
| 4136 | +-----------------------------------------+------------------------+----------------------+
| 4137 |
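The nv cell's output above is a plain nvidia-smi report, and the cell body is only two lines (data-lines="2"). A plausible reconstruction of cells/nv.py, offered as an assumption since the cell source is collapsed on this page:

    # Assumed cells/nv.py: print the GPU report shown above.
    import subprocess
    print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)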
|
|
|
| 4155 | <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
| 4156 | <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
| 4157 | </span> |
| 4158 | + Cell: benchmark | 7.02s
| 4159 | | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
| 4160 | <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
| 4161 | <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
| 4162 | + <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/activation/impls/torch_swiglu.md" target="_blank" class="github-btn">GitHub</a>
| 4163 | </div>
| 4164 | <div id="code-benchmark" class="cell-code" data-lines="28">
| 4165 | <div class="code-wrap">
|
|
|
| 4205 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4206 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4207 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4208 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 188.575us 1476.70% 188.575us 188.575us 1
|
| 4209 |
+
torch_eager 11.13% 210.826us 99.56% 1.887ms 1.887ms 0.000us 0.00% 15.106us 15.106us 1
|
| 4210 |
+
aten::silu 3.37% 63.781us 82.44% 1.562ms 520.736us 6.497us 50.88% 8.833us 2.944us 3
|
| 4211 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.497us 50.88% 6.497us 2.166us 3
|
| 4212 |
+
aten::mul 1.86% 35.170us 2.95% 55.841us 18.614us 6.273us 49.12% 6.273us 2.091us 3
|
| 4213 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.273us 49.12% 6.273us 2.091us 3
|
| 4214 |
+
Activity Buffer Request 76.78% 1.455ms 76.78% 1.455ms 1.455ms 2.336us 18.29% 2.336us 2.336us 1
|
| 4215 |
+
aten::slice 2.45% 46.380us 3.05% 57.842us 9.640us 0.000us 0.00% 0.000us 0.000us 6
|
| 4216 |
+
aten::as_strided 0.60% 11.462us 0.60% 11.462us 1.910us 0.000us 0.00% 0.000us 0.000us 6
|
| 4217 |
+
cudaLaunchKernel 3.38% 64.112us 3.38% 64.112us 10.685us 0.000us 0.00% 0.000us 0.000us 6
|
| 4218 |
+
cudaDeviceSynchronize 0.44% 8.280us 0.44% 8.280us 8.280us 0.000us 0.00% 0.000us 0.000us 1
|
| 4219 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4220 |
+
Self CPU time total: 1.895ms
|
| 4221 |
+
Self CUDA time total: 12.770us
|
| 4222 |
|
| 4223 |
|
| 4224 |
|
|
|
|
| 4228 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4229 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4230 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4231 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.926us 1234.87% 152.926us 152.926us 1
|
| 4232 |
+
torch_eager 6.55% 113.093us 99.67% 1.721ms 1.721ms 0.000us 0.00% 14.560us 14.560us 1
|
| 4233 |
+
aten::silu 2.40% 41.391us 88.69% 1.532ms 510.609us 6.400us 51.68% 8.576us 2.859us 3
|
| 4234 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.400us 51.68% 6.400us 2.133us 3
|
| 4235 |
+
aten::mul 1.50% 25.830us 2.63% 45.361us 15.120us 5.984us 48.32% 5.984us 1.995us 3
|
| 4236 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.32% 5.984us 1.995us 3
|
| 4237 |
+
Activity Buffer Request 84.72% 1.463ms 84.72% 1.463ms 1.463ms 2.176us 17.57% 2.176us 2.176us 1
|
| 4238 |
+
aten::slice 1.43% 24.741us 1.80% 31.062us 5.177us 0.000us 0.00% 0.000us 0.000us 6
|
| 4239 |
+
aten::as_strided 0.37% 6.321us 0.37% 6.321us 1.054us 0.000us 0.00% 0.000us 0.000us 6
|
| 4240 |
+
cudaLaunchKernel 2.71% 46.721us 2.71% 46.721us 7.787us 0.000us 0.00% 0.000us 0.000us 6
|
| 4241 |
+
cudaDeviceSynchronize 0.33% 5.741us 0.33% 5.741us 5.741us 0.000us 0.00% 0.000us 0.000us 1
|
| 4242 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4243 |
+
Self CPU time total: 1.727ms
|
| 4244 |
+
Self CUDA time total: 12.384us
|
| 4245 |
|
| 4246 |
|
| 4247 |
|
|
|
|
| 4251 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4252 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4253 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4254 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 152.413us 1147.86% 152.413us 152.413us 1
|
| 4255 |
+
torch_eager 6.17% 105.134us 99.68% 1.699ms 1.699ms 0.000us 0.00% 15.581us 15.581us 1
|
| 4256 |
+
aten::silu 2.58% 43.990us 88.96% 1.517ms 505.533us 6.814us 51.32% 9.117us 3.039us 3
|
| 4257 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.814us 51.32% 6.814us 2.271us 3
|
| 4258 |
+
aten::mul 1.63% 27.711us 2.72% 46.371us 15.457us 6.464us 48.68% 6.464us 2.155us 3
|
| 4259 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.68% 6.464us 2.155us 3
|
| 4260 |
+
Activity Buffer Request 84.84% 1.446ms 84.84% 1.446ms 1.446ms 2.303us 17.34% 2.303us 2.303us 1
|
| 4261 |
+
aten::slice 1.47% 24.990us 1.83% 31.250us 5.208us 0.000us 0.00% 0.000us 0.000us 6
|
| 4262 |
+
aten::as_strided 0.37% 6.260us 0.37% 6.260us 1.043us 0.000us 0.00% 0.000us 0.000us 6
|
| 4263 |
+
cudaLaunchKernel 2.63% 44.871us 2.63% 44.871us 7.478us 0.000us 0.00% 0.000us 0.000us 6
|
| 4264 |
+
cudaDeviceSynchronize 0.32% 5.431us 0.32% 5.431us 5.431us 0.000us 0.00% 0.000us 0.000us 1
|
| 4265 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4266 |
+
Self CPU time total: 1.705ms
|
| 4267 |
+
Self CUDA time total: 13.278us
|
| 4268 |
|
| 4269 |
|
| 4270 |
|
|
|
|
| 4274 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4275 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4276 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4277 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.359us 1219.84% 155.359us 155.359us 1
|
| 4278 |
+
torch_eager 6.31% 109.593us 99.71% 1.733ms 1.733ms 0.000us 0.00% 14.944us 14.944us 1
|
| 4279 |
+
aten::silu 2.48% 43.021us 88.93% 1.545ms 515.160us 6.560us 51.51% 8.768us 2.923us 3
|
| 4280 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.51% 6.560us 2.187us 3
|
| 4281 |
+
aten::mul 1.62% 28.091us 2.66% 46.261us 15.420us 6.176us 48.49% 6.176us 2.059us 3
|
| 4282 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
|
| 4283 |
+
Activity Buffer Request 74.70% 1.298ms 74.70% 1.298ms 1.298ms 2.208us 17.34% 2.208us 2.208us 1
|
| 4284 |
+
aten::slice 1.46% 25.370us 1.82% 31.631us 5.272us 0.000us 0.00% 0.000us 0.000us 6
|
| 4285 |
+
aten::as_strided 0.36% 6.261us 0.36% 6.261us 1.043us 0.000us 0.00% 0.000us 0.000us 6
|
| 4286 |
+
cudaLaunchKernel 12.80% 222.405us 12.80% 222.405us 37.068us 0.000us 0.00% 0.000us 0.000us 6
|
| 4287 |
+
cudaDeviceSynchronize 0.29% 4.960us 0.29% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
|
| 4288 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4289 |
+
Self CPU time total: 1.738ms
|
| 4290 |
+
Self CUDA time total: 12.736us
|
| 4291 |
|
| 4292 |
|
| 4293 |
|
|
|
|
| 4297 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4298 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4299 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4300 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.122us 1152.94% 153.122us 153.122us 1
|
| 4301 |
+
torch_eager 5.95% 108.905us 99.72% 1.827ms 1.827ms 0.000us 0.00% 15.585us 15.585us 1
|
| 4302 |
+
aten::silu 2.26% 41.441us 89.57% 1.641ms 546.874us 6.816us 51.32% 9.120us 3.040us 3
|
| 4303 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 51.32% 6.816us 2.272us 3
|
| 4304 |
+
aten::mul 1.45% 26.581us 2.47% 45.261us 15.087us 6.465us 48.68% 6.465us 2.155us 3
|
| 4305 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.465us 48.68% 6.465us 2.155us 3
|
| 4306 |
+
Activity Buffer Request 78.54% 1.439ms 78.54% 1.439ms 1.439ms 2.304us 17.35% 2.304us 2.304us 1
|
| 4307 |
+
aten::slice 1.41% 25.869us 1.74% 31.870us 5.312us 0.000us 0.00% 0.000us 0.000us 6
|
| 4308 |
+
aten::as_strided 0.33% 6.001us 0.33% 6.001us 1.000us 0.000us 0.00% 0.000us 0.000us 6
|
| 4309 |
+
cudaLaunchKernel 9.78% 179.164us 9.78% 179.164us 29.861us 0.000us 0.00% 0.000us 0.000us 6
|
| 4310 |
+
cudaDeviceSynchronize 0.28% 5.090us 0.28% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1
|
| 4311 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4312 |
+
Self CPU time total: 1.832ms
|
| 4313 |
+
Self CUDA time total: 13.281us
|
| 4314 |
|
| 4315 |
|
| 4316 |
|
|
|
|
| 4320 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4321 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4322 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4323 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.877us 970.08% 150.877us 150.877us 1
|
| 4324 |
+
torch_eager 20.61% 104.763us 99.03% 503.283us 503.283us 0.000us 0.00% 18.241us 18.241us 1
|
| 4325 |
+
aten::silu 8.60% 43.701us 63.19% 321.148us 107.049us 7.969us 51.24% 10.657us 3.552us 3
|
| 4326 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.969us 51.24% 7.969us 2.656us 3
|
| 4327 |
+
aten::mul 5.45% 27.720us 8.99% 45.690us 15.230us 7.584us 48.76% 7.584us 2.528us 3
|
| 4328 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.76% 7.584us 2.528us 3
|
| 4329 |
+
Activity Buffer Request 24.24% 123.213us 24.24% 123.213us 123.213us 2.688us 17.28% 2.688us 2.688us 1
|
| 4330 |
+
aten::slice 5.04% 25.603us 6.23% 31.682us 5.280us 0.000us 0.00% 0.000us 0.000us 6
|
| 4331 |
+
aten::as_strided 1.20% 6.079us 1.20% 6.079us 1.013us 0.000us 0.00% 0.000us 0.000us 6
|
| 4332 |
+
cudaLaunchKernel 33.88% 172.204us 33.88% 172.204us 28.701us 0.000us 0.00% 0.000us 0.000us 6
|
| 4333 |
+
cudaDeviceSynchronize 0.97% 4.940us 0.97% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1
|
| 4334 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4335 |
+
Self CPU time total: 508.223us
|
| 4336 |
+
Self CUDA time total: 15.553us
|
| 4337 |
|
| 4338 |
|
| 4339 |
|
|
|
|
| 4343 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4344 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4345 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4346 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.541us 1089.44% 156.541us 156.541us 1
|
| 4347 |
+
torch_eager 6.81% 125.673us 99.72% 1.840ms 1.840ms 0.000us 0.00% 16.866us 16.866us 1
|
| 4348 |
+
aten::silu 2.28% 42.101us 88.57% 1.634ms 544.654us 7.361us 51.23% 9.858us 3.286us 3
|
| 4349 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 51.23% 7.361us 2.454us 3
|
| 4350 |
+
aten::mul 1.53% 28.200us 2.53% 46.622us 15.541us 7.008us 48.77% 7.008us 2.336us 3
|
| 4351 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.77% 7.008us 2.336us 3
|
| 4352 |
+
Activity Buffer Request 77.96% 1.438ms 77.96% 1.438ms 1.438ms 2.497us 17.38% 2.497us 2.497us 1
|
| 4353 |
+
aten::slice 1.46% 26.979us 1.81% 33.310us 5.552us 0.000us 0.00% 0.000us 0.000us 6
|
| 4354 |
+
aten::as_strided 0.34% 6.331us 0.34% 6.331us 1.055us 0.000us 0.00% 0.000us 0.000us 6
|
| 4355 |
+
cudaLaunchKernel 9.33% 172.076us 9.33% 172.076us 28.679us 0.000us 0.00% 0.000us 0.000us 6
|
| 4356 |
+
cudaDeviceSynchronize 0.28% 5.210us 0.28% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1
|
| 4357 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4358 |
+
Self CPU time total: 1.845ms
|
| 4359 |
+
Self CUDA time total: 14.369us
|
| 4360 |
|
| 4361 |
|
| 4362 |
|
|
|
|
| 4366 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4367 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4368 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4369 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.754us 962.92% 149.754us 149.754us 1
|
| 4370 |
+
torch_eager 21.77% 106.163us 98.85% 481.952us 481.952us 0.000us 0.00% 18.240us 18.240us 1
|
| 4371 |
+
aten::silu 8.65% 42.151us 61.90% 301.788us 100.596us 7.968us 51.23% 10.656us 3.552us 3
|
| 4372 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.23% 7.968us 2.656us 3
|
| 4373 |
+
aten::mul 5.09% 24.801us 8.77% 42.752us 14.251us 7.584us 48.77% 7.584us 2.528us 3
|
| 4374 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.77% 7.584us 2.528us 3
|
| 4375 |
+
Activity Buffer Request 21.73% 105.953us 21.73% 105.953us 105.953us 2.688us 17.28% 2.688us 2.688us 1
|
| 4376 |
+
aten::slice 5.14% 25.050us 6.41% 31.249us 5.208us 0.000us 0.00% 0.000us 0.000us 6
|
| 4377 |
+
aten::as_strided 1.27% 6.199us 1.27% 6.199us 1.033us 0.000us 0.00% 0.000us 0.000us 6
|
| 4378 |
+
cudaLaunchKernel 35.20% 171.635us 35.20% 171.635us 28.606us 0.000us 0.00% 0.000us 0.000us 6
|
| 4379 |
+
cudaDeviceSynchronize 1.15% 5.600us 1.15% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1
|
| 4380 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4381 |
+
Self CPU time total: 487.552us
|
| 4382 |
+
Self CUDA time total: 15.552us
|
| 4383 |
|
| 4384 |
|
| 4385 |
|
|
|
|
| 4389 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4390 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4391 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4392 |
+
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 187.357us 834.00% 187.357us 187.357us 1
|
| 4393 |
+
torch_eager 6.93% 128.860us 99.74% 1.856ms 1.856ms 0.000us 0.00% 26.369us 26.369us 1
|
| 4394 |
+
aten::silu 2.32% 43.123us 88.23% 1.642ms 547.175us 11.616us 51.71% 15.520us 5.173us 3
|
| 4395 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.616us 51.71% 11.616us 3.872us 3
|
| 4396 |
+
aten::mul 1.63% 30.312us 2.74% 50.922us 16.974us 10.849us 48.29% 10.849us 3.616us 3
|
| 4397 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.849us 48.29% 10.849us 3.616us 3
|
| 4398 |
+
Activity Buffer Request 77.79% 1.447ms 77.79% 1.447ms 1.447ms 3.904us 17.38% 3.904us 3.904us 1
|
| 4399 |
+
aten::slice 1.49% 27.691us 1.84% 34.251us 5.708us 0.000us 0.00% 0.000us 0.000us 6
|
| 4400 |
+
aten::as_strided 0.35% 6.560us 0.35% 6.560us 1.093us 0.000us 0.00% 0.000us 0.000us 6
|
| 4401 |
+
cudaLaunchKernel 9.23% 171.734us 9.23% 171.734us 28.622us 0.000us 0.00% 0.000us 0.000us 6
|
| 4402 |
+
cudaDeviceSynchronize 0.26% 4.930us 0.26% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
|
| 4403 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4404 |
+
Self CPU time total: 1.860ms
|
| 4405 |
+
Self CUDA time total: 22.465us
|
| 4406 |
|
| 4407 |
|
| 4408 |
impl wl p50(ms) ok
|
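Each PROFILE TRACE block above is a standard torch.profiler key-averages table. A minimal sketch of how to reproduce one; the record_function label and the tensor shape are illustrative, not taken from cells/benchmark.py:

    # Sketch: profile an eager SwiGLU and print a table like the ones above.
    import torch
    import torch.nn.functional as F
    from torch.profiler import profile, record_function, ProfilerActivity

    x = torch.randn(512, 2 * 2048, device="cuda", dtype=torch.bfloat16)

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        with record_function("torch_eager"):      # the named rows in the tables
            d = x.shape[-1] // 2
            y = F.silu(x[..., :d]) * x[..., d:]
        torch.cuda.synchronize()                  # the cudaDeviceSynchronize row

    print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=12))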
|
|
|
| 4419 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4420 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4421 |
<div class="uv-logs-content" style="display: none;">
|
| 4422 |
+
Installed 37 packages in 251ms
|
| 4423 |
</div>
|
| 4424 |
</div>
|
| 4425 |
<div class="cell-artifacts">
|
activation/results/artifacts/combine/latency.svg
CHANGED
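latency.svg is a matplotlib line chart of p50 latency per workload, one series per implementation (the series--hf-kernels-swiglu and series--torch-eager groups visible in the SVG diff below). A rough sketch of how a combine step like cells/combine.py could build it from the per-impl benchmark JSONL; the file path and record field names here are assumptions:

    # Hypothetical combine step: aggregate benchmark JSONL into latency.svg.
    import json
    from collections import defaultdict
    import matplotlib.pyplot as plt

    series = defaultdict(list)                    # impl -> [(workload, p50 ms)]
    with open("activation/impls/artifacts/benchmark/activation.jsonl") as f:
        for line in f:
            rec = json.loads(line)                # assumed record schema
            series[rec["impl"]].append((rec["wl"]["name"], rec["lat_ms"]["p50"]))

    fig, ax = plt.subplots(figsize=(10, 6))
    for impl, pts in sorted(series.items()):
        names, p50s = zip(*pts)
        ax.plot(names, p50s, marker="o", label=impl)
    ax.set_ylabel("p50 latency (ms)")
    ax.legend()
    fig.savefig("latency.svg")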
activation/results/combined_results.html
CHANGED
|
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {

| 4107 | <rdf:RDF>
| 4108 | <ns2:Work>
| 4109 | <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
| 4110 | - <dc:date>2025-10-
| 4111 | <dc:format>image/svg+xml</dc:format>
| 4112 | <dc:creator>
| 4113 | <ns2:Agent>
|
@@ -4256,83 +4256,83 @@ body[data-tool="eraser"] .main-content {

| 4256 | <g id="matplotlib.axis_2">
| 4257 | <g id="ytick_1">
| 4258 | <g id="grid-y--2" class="grid grid-y">
| 4259 | - <path d="M 60.23
| 4260 | </g>
| 4261 | <g id="line2d_10">
| 4262 | <defs>
| 4263 | <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
| 4264 | </defs>
| 4265 | <g>
| 4266 | - <use ns4:href="#m0fca2865ba" x="60.23" y="
| 4267 | </g>
| 4268 | </g>
| 4269 | <g id="text_10">
| 4270 | - <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
| 4271 | </g>
| 4272 | </g>
| 4273 | <g id="ytick_2">
| 4274 | <g id="grid-y--3" class="grid grid-y">
| 4275 | - <path d="M 60.23
| 4276 | </g>
| 4277 | <g id="line2d_11">
| 4278 | <g>
| 4279 | - <use ns4:href="#m0fca2865ba" x="60.23" y="
| 4280 | </g>
| 4281 | </g>
| 4282 | <g id="text_11">
| 4283 | - <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
| 4284 | </g>
| 4285 | </g>
| 4286 | <g id="ytick_3">
| 4287 | <g id="grid-y--4" class="grid grid-y">
| 4288 | - <path d="M 60.23
| 4289 | </g>
| 4290 | <g id="line2d_12">
| 4291 | <g>
| 4292 | - <use ns4:href="#m0fca2865ba" x="60.23" y="
| 4293 | </g>
| 4294 | </g>
| 4295 | <g id="text_12">
| 4296 | - <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
| 4297 | </g>
| 4298 | </g>
| 4299 | <g id="ytick_4">
| 4300 | <g id="grid-y--5" class="grid grid-y">
| 4301 | - <path d="M 60.23
| 4302 | </g>
| 4303 | <g id="line2d_13">
| 4304 | <g>
| 4305 | - <use ns4:href="#m0fca2865ba" x="60.23" y="
| 4306 | </g>
| 4307 | </g>
| 4308 | <g id="text_13">
| 4309 | - <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
| 4310 | </g>
| 4311 | </g>
| 4312 | <g id="ytick_5">
| 4313 | <g id="grid-y--6" class="grid grid-y">
| 4314 | - <path d="M 60.23
| 4315 | </g>
| 4316 | <g id="line2d_14">
| 4317 | <g>
| 4318 | - <use ns4:href="#m0fca2865ba" x="60.23" y="
| 4319 | </g>
| 4320 | </g>
| 4321 | <g id="text_14">
| 4322 | - <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
| 4323 | </g>
| 4324 | </g>
| 4325 | <g id="ytick_6">
| 4326 | <g id="grid-y--7" class="grid grid-y">
| 4327 | - <path d="M 60.23
| 4328 | </g>
| 4329 | <g id="line2d_15">
| 4330 | <g>
| 4331 | - <use ns4:href="#m0fca2865ba" x="60.23" y="
| 4332 | </g>
| 4333 | </g>
| 4334 | <g id="text_15">
| 4335 | - <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
| 4336 | </g>
| 4337 | </g>
| 4338 | <g id="label--y" class="ylabel">
|
@@ -4340,37 +4340,37 @@ body[data-tool="eraser"] .main-content {

| 4340 | </g>
| 4341 | </g>
| 4342 | <g id="series--hf-kernels-swiglu" class="series">
| 4343 | - <path d="M 96.005644 451.16779 L 185.444754
| 4344 | <defs>
| 4345 | <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
| 4346 | </defs>
| 4347 | <g clip-path="url(#p620c7d392f)">
| 4348 | <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
| 4349 | - <use ns4:href="#md7efaf3aec" x="185.444754" y="
| 4350 | - <use ns4:href="#md7efaf3aec" x="274.883864" y="
| 4351 | - <use ns4:href="#md7efaf3aec" x="364.322974" y="
| 4352 | - <use ns4:href="#md7efaf3aec" x="453.762084" y="
| 4353 | - <use ns4:href="#md7efaf3aec" x="543.201194" y="
| 4354 | - <use ns4:href="#md7efaf3aec" x="632.640304" y="
| 4355 | - <use ns4:href="#md7efaf3aec" x="722.079415" y="
| 4356 | - <use ns4:href="#md7efaf3aec" x="811.518525" y="
| 4357 | </g>
| 4358 | </g>
| 4359 | <g id="series--torch-eager" class="series">
| 4360 | - <path d="M 96.005644
| 4361 | <defs>
| 4362 | <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
| 4363 | </defs>
| 4364 | <g clip-path="url(#p620c7d392f)">
| 4365 | - <use ns4:href="#m9b8c54d372" x="96.005644" y="
| 4366 | <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
| 4367 | - <use ns4:href="#m9b8c54d372" x="274.883864" y="
| 4368 | - <use ns4:href="#m9b8c54d372" x="364.322974" y="
| 4369 | - <use ns4:href="#m9b8c54d372" x="453.762084" y="
| 4370 | - <use ns4:href="#m9b8c54d372" x="543.201194" y="
| 4371 | - <use ns4:href="#m9b8c54d372" x="632.640304" y="73.
| 4372 | - <use ns4:href="#m9b8c54d372" x="722.079415" y="89.
| 4373 | - <use ns4:href="#m9b8c54d372" x="811.518525" y="
| 4374 | </g>
| 4375 | </g>
| 4376 | <g id="patch_3">
|
@@ -4428,7 +4428,7 @@ body[data-tool="eraser"] .main-content {

| 4428 | <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
| 4429 | <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
| 4430 | </span> |
| 4431 | - Cell: combine | 4.
| 4432 | | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
| 4433 | <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
| 4434 | <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
@@ -4554,7 +4554,7 @@ Implementations included:

| 4554 | <div class="uv-install-logs" id="uv-logs-combine">
| 4555 | <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
| 4556 | <div class="uv-logs-content" style="display: none;">
| 4557 | - Installed 37 packages in
| 4558 | </div>
| 4559 | </div>
| 4560 | <div class="cell-artifacts">
|
@@ -4567,7 +4567,7 @@ Installed 37 packages in 222ms

| 4567 | <rdf:RDF>
| 4568 | <ns2:Work>
| 4569 | <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
| 4570 | - <dc:date>2025-10-
| 4571 | <dc:format>image/svg+xml</dc:format>
| 4572 | <dc:creator>
| 4573 | <ns2:Agent>
|
@@ -4716,83 +4716,83 @@ Installed 37 packages in 222ms

| 4716 | <g id="matplotlib.axis_2">
| 4717 | <g id="ytick_1">
| 4718 | <g id="grid-y--2" class="grid grid-y">
| 4719 | - <path d="M 60.23
| 4720 | </g>
| 4721 | <g id="line2d_10">
| 4722 | <defs>
| 4723 | <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
| 4724 | </defs>
| 4725 | <g>
| 4726 | - <use ns4:href="#m0fca2865ba" x="60.23" y="
| 4727 | </g>
| 4728 | </g>
| 4729 | <g id="text_10">
| 4730 | - <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
| 4731 | </g>
| 4732 | </g>
| 4733 | <g id="ytick_2">
| 4734 | <g id="grid-y--3" class="grid grid-y">
| 4735 | - <path d="M 60.23
| 4736 | </g>
| 4737 | <g id="line2d_11">
| 4738 | <g>
| 4739 | - <use ns4:href="#m0fca2865ba" x="60.23" y="
| 4740 | </g>
| 4741 | </g>
| 4742 | <g id="text_11">
| 4743 | - <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
| 4744 | </g>
| 4745 | </g>
| 4746 | <g id="ytick_3">
| 4747 | <g id="grid-y--4" class="grid grid-y">
| 4748 | - <path d="M 60.23
| 4749 | </g>
| 4750 | <g id="line2d_12">
| 4751 | <g>
| 4752 | - <use ns4:href="#m0fca2865ba" x="60.23" y="
| 4753 | </g>
| 4754 | </g>
| 4755 | <g id="text_12">
| 4756 | - <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
| 4757 | </g>
| 4758 | </g>
| 4759 | <g id="ytick_4">
| 4760 | <g id="grid-y--5" class="grid grid-y">
| 4761 | - <path d="M 60.23
| 4762 | </g>
| 4763 | <g id="line2d_13">
| 4764 | <g>
| 4765 | - <use ns4:href="#m0fca2865ba" x="60.23" y="
| 4766 | </g>
| 4767 | </g>
| 4768 | <g id="text_13">
| 4769 | - <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
| 4770 | </g>
| 4771 | </g>
| 4772 | <g id="ytick_5">
| 4773 | <g id="grid-y--6" class="grid grid-y">
| 4774 | - <path d="M 60.23
| 4775 | </g>
| 4776 | <g id="line2d_14">
| 4777 | <g>
| 4778 | - <use ns4:href="#m0fca2865ba" x="60.23" y="
| 4779 | </g>
| 4780 | </g>
| 4781 | <g id="text_14">
| 4782 | - <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
| 4783 | </g>
| 4784 | </g>
| 4785 | <g id="ytick_6">
| 4786 | <g id="grid-y--7" class="grid grid-y">
| 4787 | - <path d="M 60.23
| 4788 | </g>
| 4789 | <g id="line2d_15">
| 4790 | <g>
| 4791 | - <use ns4:href="#m0fca2865ba" x="60.23" y="
| 4792 | </g>
| 4793 | </g>
| 4794 | <g id="text_15">
| 4795 | - <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="
| 4796 | </g>
| 4797 | </g>
| 4798 | <g id="label--y" class="ylabel">
|
@@ -4800,37 +4800,37 @@ Installed 37 packages in 222ms

| 4800 | </g>
| 4801 | </g>
| 4802 | <g id="series--hf-kernels-swiglu" class="series">
| 4803 | - <path d="M 96.005644 451.16779 L 185.444754
| 4804 | <defs>
| 4805 | <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
| 4806 | </defs>
| 4807 | <g clip-path="url(#p620c7d392f)">
| 4808 | <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
| 4809 | - <use ns4:href="#md7efaf3aec" x="185.444754" y="
| 4810 | - <use ns4:href="#md7efaf3aec" x="274.883864" y="
| 4811 | - <use ns4:href="#md7efaf3aec" x="364.322974" y="
| 4812 | - <use ns4:href="#md7efaf3aec" x="453.762084" y="
| 4813 | - <use ns4:href="#md7efaf3aec" x="543.201194" y="
| 4814 | - <use ns4:href="#md7efaf3aec" x="632.640304" y="
| 4815 | - <use ns4:href="#md7efaf3aec" x="722.079415" y="
| 4816 | - <use ns4:href="#md7efaf3aec" x="811.518525" y="
| 4817 | </g>
| 4818 | </g>
| 4819 | <g id="series--torch-eager" class="series">
| 4820 | - <path d="M 96.005644
| 4821 | <defs>
| 4822 | <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
| 4823 | </defs>
| 4824 | <g clip-path="url(#p620c7d392f)">
| 4825 | - <use ns4:href="#m9b8c54d372" x="96.005644" y="
| 4826 | <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
| 4827 | - <use ns4:href="#m9b8c54d372" x="274.883864" y="
| 4828 | - <use ns4:href="#m9b8c54d372" x="364.322974" y="
| 4829 | - <use ns4:href="#m9b8c54d372" x="453.762084" y="
| 4830 | - <use ns4:href="#m9b8c54d372" x="543.201194" y="
| 4831 | - <use ns4:href="#m9b8c54d372" x="632.640304" y="73.
| 4832 | - <use ns4:href="#m9b8c54d372" x="722.079415" y="89.
| 4833 | - <use ns4:href="#m9b8c54d372" x="811.518525" y="
| 4834 | </g>
| 4835 | </g>
| 4836 | <g id="patch_3">
|
|
|
| 4107 |
<rdf:RDF>
|
| 4108 |
<ns2:Work>
|
| 4109 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4110 |
+
<dc:date>2025-10-31T20:14:01.265668</dc:date>
|
| 4111 |
<dc:format>image/svg+xml</dc:format>
|
| 4112 |
<dc:creator>
|
| 4113 |
<ns2:Agent>
|
|
|
|
| 4256 |
<g id="matplotlib.axis_2">
|
| 4257 |
<g id="ytick_1">
|
| 4258 |
<g id="grid-y--2" class="grid grid-y">
|
| 4259 |
+
<path d="M 60.23 447.291581 L 847.294169 447.291581 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4260 |
</g>
|
| 4261 |
<g id="line2d_10">
|
| 4262 |
<defs>
|
| 4263 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4264 |
</defs>
|
| 4265 |
<g>
|
| 4266 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="447.291581" style="stroke: #000000; stroke-width: 0.8" />
|
| 4267 |
</g>
|
| 4268 |
</g>
|
| 4269 |
<g id="text_10">
|
| 4270 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="451.090799" transform="rotate(-0 53.23 451.090799)">0.025</text>
|
| 4271 |
</g>
|
| 4272 |
</g>
|
| 4273 |
<g id="ytick_2">
|
| 4274 |
<g id="grid-y--3" class="grid grid-y">
|
| 4275 |
+
<path d="M 60.23 372.461283 L 847.294169 372.461283 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4276 |
</g>
|
| 4277 |
<g id="line2d_11">
|
| 4278 |
<g>
|
| 4279 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="372.461283" style="stroke: #000000; stroke-width: 0.8" />
|
| 4280 |
</g>
|
| 4281 |
</g>
|
| 4282 |
<g id="text_11">
|
| 4283 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="376.260501" transform="rotate(-0 53.23 376.260501)">0.030</text>
|
| 4284 |
</g>
|
| 4285 |
</g>
|
| 4286 |
<g id="ytick_3">
|
| 4287 |
<g id="grid-y--4" class="grid grid-y">
|
| 4288 |
+
<path d="M 60.23 297.630984 L 847.294169 297.630984 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4289 |
</g>
|
| 4290 |
<g id="line2d_12">
|
| 4291 |
<g>
|
| 4292 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="297.630984" style="stroke: #000000; stroke-width: 0.8" />
|
| 4293 |
</g>
|
| 4294 |
</g>
|
| 4295 |
<g id="text_12">
|
| 4296 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="301.430203" transform="rotate(-0 53.23 301.430203)">0.035</text>
|
| 4297 |
</g>
|
| 4298 |
</g>
|
| 4299 |
<g id="ytick_4">
|
| 4300 |
<g id="grid-y--5" class="grid grid-y">
|
| 4301 |
+
<path d="M 60.23 222.800686 L 847.294169 222.800686 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4302 |
</g>
|
| 4303 |
<g id="line2d_13">
|
| 4304 |
<g>
|
| 4305 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="222.800686" style="stroke: #000000; stroke-width: 0.8" />
|
| 4306 |
</g>
|
| 4307 |
</g>
|
| 4308 |
<g id="text_13">
|
| 4309 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="226.599905" transform="rotate(-0 53.23 226.599905)">0.040</text>
|
| 4310 |
</g>
|
| 4311 |
</g>
|
| 4312 |
<g id="ytick_5">
|
| 4313 |
<g id="grid-y--6" class="grid grid-y">
|
| 4314 |
+
<path d="M 60.23 147.970388 L 847.294169 147.970388 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4315 |
</g>
|
| 4316 |
<g id="line2d_14">
|
| 4317 |
<g>
|
| 4318 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="147.970388" style="stroke: #000000; stroke-width: 0.8" />
|
| 4319 |
</g>
|
| 4320 |
</g>
|
| 4321 |
<g id="text_14">
|
| 4322 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="151.769607" transform="rotate(-0 53.23 151.769607)">0.045</text>
|
| 4323 |
</g>
|
| 4324 |
</g>
|
| 4325 |
<g id="ytick_6">
|
| 4326 |
<g id="grid-y--7" class="grid grid-y">
|
| 4327 |
+
<path d="M 60.23 73.14009 L 847.294169 73.14009 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4328 |
</g>
|
| 4329 |
<g id="line2d_15">
|
| 4330 |
<g>
|
| 4331 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="73.14009" style="stroke: #000000; stroke-width: 0.8" />
|
| 4332 |
</g>
|
| 4333 |
</g>
|
| 4334 |
<g id="text_15">
|
| 4335 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="76.939309" transform="rotate(-0 53.23 76.939309)">0.050</text>
|
| 4336 |
</g>
|
| 4337 |
</g>
|
| 4338 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4340 |
</g>
|
| 4341 |
</g>
|
| 4342 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4343 |
+
<path d="M 96.005644 451.16779 L 185.444754 376.487152 L 274.883864 390.555248 L 364.322974 389.208303 L 453.762084 410.624734 L 543.201194 412.405695 L 632.640304 383.371541 L 722.079415 400.283188 L 811.518525 398.936242 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4344 |
<defs>
|
| 4345 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4346 |
</defs>
|
| 4347 |
<g clip-path="url(#p620c7d392f)">
|
| 4348 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4349 |
+
<use ns4:href="#md7efaf3aec" x="185.444754" y="376.487152" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4350 |
+
<use ns4:href="#md7efaf3aec" x="274.883864" y="390.555248" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4351 |
+
<use ns4:href="#md7efaf3aec" x="364.322974" y="389.208303" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4352 |
+
<use ns4:href="#md7efaf3aec" x="453.762084" y="410.624734" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4353 |
+
<use ns4:href="#md7efaf3aec" x="543.201194" y="412.405695" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4354 |
+
<use ns4:href="#md7efaf3aec" x="632.640304" y="383.371541" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4355 |
+
<use ns4:href="#md7efaf3aec" x="722.079415" y="400.283188" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4356 |
+
<use ns4:href="#md7efaf3aec" x="811.518525" y="398.936242" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4357 |
</g>
|
| 4358 |
</g>
|
| 4359 |
<g id="series--torch-eager" class="series">
|
| 4360 |
+
<path d="M 96.005644 155.288791 L 185.444754 47.08418 L 274.883864 47.967177 L 364.322974 65.193113 L 453.762084 62.798543 L 543.201194 92.28168 L 632.640304 73.424445 L 722.079415 89.138808 L 811.518525 87.342881 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4361 |
<defs>
|
| 4362 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4363 |
</defs>
|
| 4364 |
<g clip-path="url(#p620c7d392f)">
|
| 4365 |
+
<use ns4:href="#m9b8c54d372" x="96.005644" y="155.288791" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4366 |
<use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4367 |
+
<use ns4:href="#m9b8c54d372" x="274.883864" y="47.967177" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4368 |
+
<use ns4:href="#m9b8c54d372" x="364.322974" y="65.193113" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4369 |
+
<use ns4:href="#m9b8c54d372" x="453.762084" y="62.798543" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4370 |
+
<use ns4:href="#m9b8c54d372" x="543.201194" y="92.28168" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4371 |
+
<use ns4:href="#m9b8c54d372" x="632.640304" y="73.424445" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4372 |
+
<use ns4:href="#m9b8c54d372" x="722.079415" y="89.138808" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4373 |
+
<use ns4:href="#m9b8c54d372" x="811.518525" y="87.342881" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4374 |
</g>
|
| 4375 |
</g>
|
| 4376 |
<g id="patch_3">
|
|
|
|
| 4428 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4429 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4430 |
</span> |
|
| 4431 |
+
Cell: combine | 4.32s
|
| 4432 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4433 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4434 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4554 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4555 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4556 |
<div class="uv-logs-content" style="display: none;">
|
| 4557 |
+
Installed 37 packages in 213ms
|
| 4558 |
</div>
|
| 4559 |
</div>
|
| 4560 |
<div class="cell-artifacts">
|
|
|
|
| 4567 |
<rdf:RDF>
|
| 4568 |
<ns2:Work>
|
| 4569 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4570 |
+
<dc:date>2025-10-31T20:14:01.265668</dc:date>
|
| 4571 |
<dc:format>image/svg+xml</dc:format>
|
| 4572 |
<dc:creator>
|
| 4573 |
<ns2:Agent>
|
|
|
|
| 4716 |
<g id="matplotlib.axis_2">
|
| 4717 |
<g id="ytick_1">
|
| 4718 |
<g id="grid-y--2" class="grid grid-y">
|
| 4719 |
+
<path d="M 60.23 447.291581 L 847.294169 447.291581 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4720 |
</g>
|
| 4721 |
<g id="line2d_10">
|
| 4722 |
<defs>
|
| 4723 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4724 |
</defs>
|
| 4725 |
<g>
|
| 4726 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="447.291581" style="stroke: #000000; stroke-width: 0.8" />
|
| 4727 |
</g>
|
| 4728 |
</g>
|
| 4729 |
<g id="text_10">
|
| 4730 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="451.090799" transform="rotate(-0 53.23 451.090799)">0.025</text>
|
| 4731 |
</g>
|
| 4732 |
</g>
|
| 4733 |
<g id="ytick_2">
|
| 4734 |
<g id="grid-y--3" class="grid grid-y">
|
| 4735 |
+
<path d="M 60.23 372.461283 L 847.294169 372.461283 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4736 |
</g>
|
| 4737 |
<g id="line2d_11">
|
| 4738 |
<g>
|
| 4739 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="372.461283" style="stroke: #000000; stroke-width: 0.8" />
|
| 4740 |
</g>
|
| 4741 |
</g>
|
| 4742 |
<g id="text_11">
|
| 4743 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="376.260501" transform="rotate(-0 53.23 376.260501)">0.030</text>
|
| 4744 |
</g>
|
| 4745 |
</g>
|
| 4746 |
<g id="ytick_3">
|
| 4747 |
<g id="grid-y--4" class="grid grid-y">
|
| 4748 |
+
<path d="M 60.23 297.630984 L 847.294169 297.630984 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4749 |
</g>
|
| 4750 |
<g id="line2d_12">
|
| 4751 |
<g>
|
| 4752 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="297.630984" style="stroke: #000000; stroke-width: 0.8" />
|
| 4753 |
</g>
|
| 4754 |
</g>
|
| 4755 |
<g id="text_12">
|
| 4756 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="301.430203" transform="rotate(-0 53.23 301.430203)">0.035</text>
|
| 4757 |
</g>
|
| 4758 |
</g>
|
| 4759 |
<g id="ytick_4">
|
| 4760 |
<g id="grid-y--5" class="grid grid-y">
|
| 4761 |
+
<path d="M 60.23 222.800686 L 847.294169 222.800686 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4762 |
</g>
|
| 4763 |
<g id="line2d_13">
|
| 4764 |
<g>
|
| 4765 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="222.800686" style="stroke: #000000; stroke-width: 0.8" />
|
| 4766 |
</g>
|
| 4767 |
</g>
|
| 4768 |
<g id="text_13">
|
| 4769 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="226.599905" transform="rotate(-0 53.23 226.599905)">0.040</text>
|
| 4770 |
</g>
|
| 4771 |
</g>
|
| 4772 |
<g id="ytick_5">
|
| 4773 |
<g id="grid-y--6" class="grid grid-y">
|
| 4774 |
+
<path d="M 60.23 147.970388 L 847.294169 147.970388 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4775 |
</g>
|
| 4776 |
<g id="line2d_14">
|
| 4777 |
<g>
|
| 4778 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="147.970388" style="stroke: #000000; stroke-width: 0.8" />
|
| 4779 |
</g>
|
| 4780 |
</g>
|
| 4781 |
<g id="text_14">
|
| 4782 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="151.769607" transform="rotate(-0 53.23 151.769607)">0.045</text>
|
| 4783 |
</g>
|
| 4784 |
</g>
|
| 4785 |
<g id="ytick_6">
|
| 4786 |
<g id="grid-y--7" class="grid grid-y">
|
| 4787 |
+
<path d="M 60.23 73.14009 L 847.294169 73.14009 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4788 |
</g>
|
| 4789 |
<g id="line2d_15">
|
| 4790 |
<g>
|
| 4791 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="73.14009" style="stroke: #000000; stroke-width: 0.8" />
|
| 4792 |
</g>
|
| 4793 |
</g>
|
| 4794 |
<g id="text_15">
|
| 4795 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="76.939309" transform="rotate(-0 53.23 76.939309)">0.050</text>
|
| 4796 |
</g>
|
| 4797 |
</g>
|
| 4798 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4800 |
</g>
|
| 4801 |
</g>
|
| 4802 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4803 |
+
<path d="M 96.005644 451.16779 L 185.444754 376.487152 L 274.883864 390.555248 L 364.322974 389.208303 L 453.762084 410.624734 L 543.201194 412.405695 L 632.640304 383.371541 L 722.079415 400.283188 L 811.518525 398.936242 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4804 |
<defs>
|
| 4805 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4806 |
</defs>
|
| 4807 |
<g clip-path="url(#p620c7d392f)">
|
| 4808 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4809 |
+
<use ns4:href="#md7efaf3aec" x="185.444754" y="376.487152" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4810 |
+
<use ns4:href="#md7efaf3aec" x="274.883864" y="390.555248" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4811 |
+
<use ns4:href="#md7efaf3aec" x="364.322974" y="389.208303" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4812 |
+
<use ns4:href="#md7efaf3aec" x="453.762084" y="410.624734" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4813 |
+
<use ns4:href="#md7efaf3aec" x="543.201194" y="412.405695" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4814 |
+
<use ns4:href="#md7efaf3aec" x="632.640304" y="383.371541" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4815 |
+
<use ns4:href="#md7efaf3aec" x="722.079415" y="400.283188" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4816 |
+
<use ns4:href="#md7efaf3aec" x="811.518525" y="398.936242" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4817 |
</g>
|
| 4818 |
</g>
|
| 4819 |
<g id="series--torch-eager" class="series">
|
| 4820 |
+
<path d="M 96.005644 155.288791 L 185.444754 47.08418 L 274.883864 47.967177 L 364.322974 65.193113 L 453.762084 62.798543 L 543.201194 92.28168 L 632.640304 73.424445 L 722.079415 89.138808 L 811.518525 87.342881 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4821 |
<defs>
|
| 4822 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4823 |
</defs>
|
| 4824 |
<g clip-path="url(#p620c7d392f)">
|
| 4825 |
+
<use ns4:href="#m9b8c54d372" x="96.005644" y="155.288791" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4826 |
<use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4827 |
+
<use ns4:href="#m9b8c54d372" x="274.883864" y="47.967177" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4828 |
+
<use ns4:href="#m9b8c54d372" x="364.322974" y="65.193113" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4829 |
+
<use ns4:href="#m9b8c54d372" x="453.762084" y="62.798543" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4830 |
+
<use ns4:href="#m9b8c54d372" x="543.201194" y="92.28168" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4831 |
+
<use ns4:href="#m9b8c54d372" x="632.640304" y="73.424445" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4832 |
+
<use ns4:href="#m9b8c54d372" x="722.079415" y="89.138808" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4833 |
+
<use ns4:href="#m9b8c54d372" x="811.518525" y="87.342881" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4834 |
</g>
|
| 4835 |
</g>
|
| 4836 |
<g id="patch_3">
|
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl
CHANGED
@@ -1,24 +1,24 @@
- {"ts": "2025-10- [all 24 removed records; the diff viewer truncates each old line to this prefix]
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.06906199996592477, "p50": 0.07093199997143529, "p90": 0.07169200000589626, "mean": 0.07107379998387842, "iqr": 0.0011000000199601345, "raw_times": [0.07093199997143529, 0.07309099999019963, 0.07059199998593613, 0.07169200000589626, 0.06906199996592477], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07642200000645971, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08730199999718025, "p50": 0.08879199998546028, "p90": 0.08886199998414668, "mean": 0.0890762000040013, "iqr": 0.00037899997096246807, "raw_times": [0.08730199999718025, 0.08879199998546028, 0.08848300001318421, 0.08886199998414668, 0.09194200004003505], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.091862999965997, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08465199999818651, "p50": 0.08821300002637145, "p90": 0.08871199997884105, "mean": 0.08770840000806857, "iqr": 0.0007599999776175537, "raw_times": [0.08465199999818651, 0.0879520000012235, 0.08821300002637145, 0.08901300003572032, 0.08871199997884105], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09156300001222917, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08501199999955134, "p50": 0.08710200000905388, "p90": 0.08719199996676252, "mean": 0.08665020000080403, "iqr": 0.001349999934063817, "raw_times": [0.08501199999955134, 0.08710200000905388, 0.08719199996676252, 0.0858420000326987, 0.08810299999595372], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09103200000026845, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08575200001814665, "p50": 0.08690200002092752, "p90": 0.08706200003416598, "mean": 0.08684220001669019, "iqr": 0.00029900002118665725, "raw_times": [0.08773199999723147, 0.08676300001297932, 0.08690200002092752, 0.08706200003416598, 0.08575200001814665], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09036199998035954, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08490200002597703, "p50": 0.08731200000511308, "p90": 0.0877829999694768, "mean": 0.08806820000017979, "iqr": 0.001451000002816727, "raw_times": [0.09401200003367194, 0.08731200000511308, 0.08633199996666008, 0.08490200002597703, 0.0877829999694768], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0907329999790818, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0847820000444699, "p50": 0.08513199998105847, "p90": 0.08660200001031626, "mean": 0.08566600000676772, "iqr": 0.0016600000094513234, "raw_times": [0.08494200000086494, 0.0847820000444699, 0.08687199999712902, 0.08660200001031626, 0.08513199998105847], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0911219999579771, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08356199998615921, "p50": 0.0846430000365217, "p90": 0.08576199996923606, "mean": 0.08508039999242101, "iqr": 0.0011189999895577785, "raw_times": [0.08356199998615921, 0.0867919999905098, 0.08464299997967828, 0.08576199996923606, 0.0846430000365217], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08955300000934585, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08469199997307442, "p50": 0.08614199998646654, "p90": 0.08723299998791845, "mean": 0.08654439999418173, "iqr": 0.0011309999763398082, "raw_times": [0.08469199997307442, 0.08610200001157864, 0.08614199998646654, 0.08855300001187061, 0.08723299998791845], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09115300002804361, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08576300001550408, "p50": 0.08703200001036748, "p90": 0.08823299998539369, "mean": 0.09075460000076419, "iqr": 0.0015310000094359566, "raw_times": [0.10604300001659794, 0.08823299998539369, 0.08703200001036748, 0.08670199997595773, 0.08576300001550408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08985199997368909, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.14525299997103502, "p50": 0.1457439999512644, "p90": 0.1459139999724357, "mean": 0.1457395999750588, "iqr": 0.00044099999740865314, "raw_times": [0.14525299997103502, 0.14547299997502705, 0.1457439999512644, 0.14631400000553185, 0.1459139999724357], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1472430000148961, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16037399996093882, "p50": 0.16231400002197915, "p90": 0.16309400001546237, "mean": 0.1622881999992387, "iqr": 0.0012190000120426703, "raw_times": [0.16309400001546237, 0.16231400002197915, 0.16378399999439353, 0.1618750000034197, 0.16037399996093882], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.16341399998509587, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08445299999948475, "p50": 0.08518200002072263, "p90": 0.08666200000106983, "mean": 0.08572240001285536, "iqr": 0.0017899999988912896, "raw_times": [0.08445299999948475, 0.08744300004082106, 0.08518200002072263, 0.08666200000106983, 0.08487200000217854], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0890119999894523, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08437200000344092, "p50": 0.08463200003916427, "p90": 0.08609200000364581, "mean": 0.08522400000856578, "iqr": 0.0015900000107649248, "raw_times": [0.08463200003916427, 0.08609200000364581, 0.08652200000369703, 0.08437200000344092, 0.08450199999288088], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08977199996706986, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08375199996635274, "p50": 0.08519199997181204, "p90": 0.08627200003274993, "mean": 0.08607399998936671, "iqr": 0.0020100000597267353, "raw_times": [0.08375199996635274, 0.0842619999730232, 0.08627200003274993, 0.08519199997181204, 0.09089200000289566], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08821199998010343, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08470200003785067, "p50": 0.08566200000359458, "p90": 0.08573299999170558, "mean": 0.08566220001284819, "iqr": 0.0006109999617365247, "raw_times": [0.08470200003785067, 0.08709200000112105, 0.08512200002996906, 0.08566200000359458, 0.08573299999170558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08864200003699807, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08451200000081371, "p50": 0.08525300000883362, "p90": 0.08580199994412396, "mean": 0.08525219999455658, "iqr": 0.0009299999419454252, "raw_times": [0.08580199994412396, 0.08525300000883362, 0.08451200000081371, 0.08487200000217854, 0.08582200001683304], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08942300001990588, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08533199996918484, "p50": 0.08693199998788259, "p90": 0.09015199998430035, "mean": 0.08883799998784525, "iqr": 0.0043200000163778896, "raw_times": [0.08533199996918484, 0.09015199998430035, 0.08583199996792246, 0.08693199998788259, 0.09594200002993603], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09176200001093093, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08384200003774822, "p50": 0.08611200001951147, "p90": 0.08663199997727133, "mean": 0.08570400000280642, "iqr": 0.001730000008137722, "raw_times": [0.08384200003774822, 0.08611200001951147, 0.08703200001036748, 0.08663199997727133, 0.08490199996913361], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08941200002254845, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08507300003657292, "p50": 0.0865819999944506, "p90": 0.08741199997075455, "mean": 0.09195439998848087, "iqr": 0.0020300000187489786, "raw_times": [0.11532299998862072, 0.0865819999944506, 0.08741199997075455, 0.08538199995200557, 0.08507300003657292], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08733200002097874, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09419299999535724, "p50": 0.09539199999153425, "p90": 0.09730299996135727, "mean": 0.09678459998667677, "iqr": 0.002380999944762152, "raw_times": [0.10211299996853995, 0.09730299996135727, 0.09492200001659512, 0.09539199999153425, 0.09419299999535724], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09651299995994123, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.10080199996309602, "p50": 0.10192199999892182, "p90": 0.1026219999857858, "mean": 0.10294419998899684, "iqr": 0.0008999999749903509, "raw_times": [0.10765299998638511, 0.10172200001079545, 0.1026219999857858, 0.10192199999892182, 0.10080199996309602], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10299199999508346, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4861929999719905, "p50": 0.4890019999947981, "p90": 0.48961200002395344, "mean": 0.48862639999924795, "iqr": 0.001079000014669873, "raw_times": [0.48979199999621414, 0.4861929999719905, 0.48961200002395344, 0.4890019999947981, 0.48853300000928357], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.48705300002893637, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+
{"ts": "2025-10-31T20:01:11Z", "run": "a7ca8117e1294b1ba730e0240038ddbc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.49736299996538946, "p50": 0.49848299994437184, "p90": 0.49918199999865465, "mean": 0.4987367999774506, "iqr": 0.0007590000450363732, "raw_times": [0.4984229999536183, 0.49848299994437184, 0.49918199999865465, 0.5002330000252186, 0.49736299996538946], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4985730000157673, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
causal_conv1d/impls/cells/benchmark.py
CHANGED
@@ -4,28 +4,37 @@
 # "numpy",
 # "torch==2.8.0",
 # "kernels-benchmark-tools",
-# "kernels",
 # ]
 #
 # [tool.uv.sources]
 # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
 # ///
 import torch
+import torch.nn.functional as F
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-from kernels import get_kernel

-# Load the causal conv1d kernel
-causal_conv1d = get_kernel("kernels-community/causal-conv1d")
-
-
+
+def torch_causal_conv1d(input_tensor, weight, bias):
+    # Convert to weight dtype for computation
+    x = input_tensor.to(weight.dtype)
+    dim = weight.shape[0]
+    width = weight.shape[1]
+    seqlen = input_tensor.shape[-1]
+
+    # Depthwise causal conv1d using PyTorch
+    out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
+
+    # Truncate to original sequence length
+    out = out[..., :seqlen]
+
+    # Convert back to original dtype
+    return out.to(input_tensor.dtype)


 run_benchmark(
     kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
-    impl_name="
-    impl_tags={"family": "
-    impl_func=
+    impl_name="torch_eager",
+    impl_tags={"family": "pytorch", "backend": "eager"},
+    impl_func=torch_causal_conv1d,
 )
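To sanity-check the reference implementation outside the harness, here is a small hypothetical snippet shaped like the cuda_B2_D64_S128_W4 workload (batch 2, dim 64, seqlen 128, width 4); the function is repeated from the cell above so the snippet stands alone:

import torch
import torch.nn.functional as F

def torch_causal_conv1d(input_tensor, weight, bias):
    # Repeated from the benchmark cell above.
    x = input_tensor.to(weight.dtype)
    dim, width = weight.shape
    out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
    return out[..., : input_tensor.shape[-1]].to(input_tensor.dtype)

batch, dim, seqlen, width = 2, 64, 128, 4
x = torch.randn(batch, dim, seqlen, dtype=torch.bfloat16)
weight = torch.randn(dim, width, dtype=torch.float32)
bias = torch.randn(dim, dtype=torch.float32)

out = torch_causal_conv1d(x, weight, bias)
assert out.shape == (batch, dim, seqlen) and out.dtype == x.dtype

# Causality: perturbing the second half of the input must leave the
# first half of the output untouched.
x2 = x.clone()
x2[..., seqlen // 2 :] += 1.0
out2 = torch_causal_conv1d(x2, weight, bias)
assert torch.allclose(out[..., : seqlen // 2], out2[..., : seqlen // 2])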
causal_conv1d/impls/hf_kernels_causal_conv1d.html
CHANGED
The diff for this file is too large to render. See raw diff.
causal_conv1d/impls/torch_causal_conv1d.html
CHANGED
The diff for this file is too large to render. See raw diff.
causal_conv1d/results/artifacts/combine/latency.svg
CHANGED
Git LFS Details (binary pointer updated; preview not rendered inline).
causal_conv1d/results/combined_results.html
CHANGED
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-
+<dc:date>2025-10-31T20:14:05.716143</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
[@@ -4451,70 +4451,70 @@ and @@ -4522,66 +4522,66 @@: embedded latency plot regenerated; the y-axis grid and tick labels are replaced (new labels 0.1 through 0.5) and the hf-kernels-causal-conv1d and torch-eager series are re-plotted at new data points.]
@@ -4640,7 +4640,7 @@ body[data-tool="eraser"] .main-content {
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: combine | 4.
 | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4753,28 +4753,28 @@ hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True
 hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
 torch_eager cuda_B2_D2048_S128_W2 0.09 True
 torch_eager cuda_B2_D2048_S128_W4 0.08 True
-torch_eager cuda_B2_D2048_S2048_W2 0.
 torch_eager cuda_B2_D2048_S2048_W4 0.16 True
 torch_eager cuda_B2_D2048_S512_W2 0.09 True
-torch_eager cuda_B2_D2048_S512_W4 0.
 torch_eager cuda_B2_D64_S128_W2 0.07 True
 torch_eager cuda_B2_D64_S128_W4 0.09 True
 torch_eager cuda_B2_D64_S2048_W2 0.09 True
-torch_eager cuda_B2_D64_S2048_W4 0.
 torch_eager cuda_B2_D64_S512_W2 0.09 True
 torch_eager cuda_B2_D64_S512_W4 0.09 True
-torch_eager cuda_B4_D2048_S128_W2 0.
-torch_eager cuda_B4_D2048_S128_W4 0.
 torch_eager cuda_B4_D2048_S2048_W2 0.49 True
 torch_eager cuda_B4_D2048_S2048_W4 0.50 True
-torch_eager cuda_B4_D2048_S512_W2 0.
 torch_eager cuda_B4_D2048_S512_W4 0.10 True
-torch_eager cuda_B4_D64_S128_W2 0.
 torch_eager cuda_B4_D64_S128_W4 0.08 True
-torch_eager cuda_B4_D64_S2048_W2 0.
-torch_eager cuda_B4_D64_S2048_W4 0.
-torch_eager cuda_B4_D64_S512_W2 0.
-torch_eager cuda_B4_D64_S512_W4 0.

 GENERATING COMBINED VISUALIZATION

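The "impl workload p50 ok" rows above can be reproduced from the JSONL artifacts. A hedged sketch of the kind of aggregation the combine cell performs (cells/combine.py itself is not shown in this diff):

import glob
import json

rows = []
for path in glob.glob("*/impls/artifacts/benchmark/*.jsonl"):
    with open(path) as f:
        for line in f:
            r = json.loads(line)
            # Each record carries the implementation name, workload name,
            # latency summary, and a correctness flag.
            rows.append((r["impl"], r["wl"]["name"], r["lat_ms"]["p50"], r["ok"]))

for impl, wl, p50, ok in sorted(rows):
    print(f"{impl:<26} {wl:<24} {p50:.2f} {ok}")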
@@ -4794,7 +4794,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4807,7 +4807,7 @@ Installed 37 packages in 211ms
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
[@@ -5151,70 +5151,70 @@ and @@ -5222,66 +5222,66 @@ (context: Installed 37 packages in 211ms): second embedded copy of the same latency plot, carrying the identical tick and series changes summarized above.]
[The remainder of the rendered page repeats the new-file side of the earlier hunks, including <dc:date>2025-10-31T20:14:05.716143</dc:date>, the regenerated y-axis (tick labels 0.1 through 0.5), and the re-plotted hf-kernels-causal-conv1d and torch-eager series markup.]
|
| 4576 |
+
<use ns4:href="#m9b8c54d372" x="516.779718" y="387.569581" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4577 |
+
<use ns4:href="#m9b8c54d372" x="547.740755" y="387.180699" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4578 |
+
<use ns4:href="#m9b8c54d372" x="578.701793" y="387.519109" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4579 |
+
<use ns4:href="#m9b8c54d372" x="609.66283" y="386.12989" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4580 |
+
<use ns4:href="#m9b8c54d372" x="640.623868" y="386.808365" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4581 |
+
<use ns4:href="#m9b8c54d372" x="671.584905" y="386.419483" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4582 |
+
<use ns4:href="#m9b8c54d372" x="702.545943" y="379.13001" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4583 |
+
<use ns4:href="#m9b8c54d372" x="733.50698" y="373.727029" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4584 |
+
<use ns4:href="#m9b8c54d372" x="764.468018" y="53.453563" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4585 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4586 |
</g>
|
| 4587 |
</g>
|
|
|
|
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: combine | 4.43s
 | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 hf_kernels_causal_conv1d     cuda_B4_D64_S512_W4       0.05   True
 torch_eager                  cuda_B2_D2048_S128_W2     0.09   True
 torch_eager                  cuda_B2_D2048_S128_W4     0.08   True
+torch_eager                  cuda_B2_D2048_S2048_W2    0.15   True
 torch_eager                  cuda_B2_D2048_S2048_W4    0.16   True
 torch_eager                  cuda_B2_D2048_S512_W2     0.09   True
+torch_eager                  cuda_B2_D2048_S512_W4     0.09   True
 torch_eager                  cuda_B2_D64_S128_W2       0.07   True
 torch_eager                  cuda_B2_D64_S128_W4       0.09   True
 torch_eager                  cuda_B2_D64_S2048_W2      0.09   True
+torch_eager                  cuda_B2_D64_S2048_W4      0.09   True
 torch_eager                  cuda_B2_D64_S512_W2       0.09   True
 torch_eager                  cuda_B2_D64_S512_W4       0.09   True
+torch_eager                  cuda_B4_D2048_S128_W2     0.09   True
+torch_eager                  cuda_B4_D2048_S128_W4     0.09   True
 torch_eager                  cuda_B4_D2048_S2048_W2    0.49   True
 torch_eager                  cuda_B4_D2048_S2048_W4    0.50   True
+torch_eager                  cuda_B4_D2048_S512_W2     0.10   True
 torch_eager                  cuda_B4_D2048_S512_W4     0.10   True
+torch_eager                  cuda_B4_D64_S128_W2       0.09   True
 torch_eager                  cuda_B4_D64_S128_W4       0.08   True
+torch_eager                  cuda_B4_D64_S2048_W2      0.09   True
+torch_eager                  cuda_B4_D64_S2048_W4      0.09   True
+torch_eager                  cuda_B4_D64_S512_W2       0.09   True
+torch_eager                  cuda_B4_D64_S512_W4       0.09   True

 GENERATING COMBINED VISUALIZATION
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
+Installed 37 packages in 238ms
 </div>
 </div>
 <div class="cell-artifacts">
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+<dc:date>2025-10-31T20:14:05.716143</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
[duplicate SVG latency-chart markup: y-axis grid lines and tick labels 0.1–0.5, followed by the same "hf-kernels-causal-conv1d" and "torch-eager" series paths and markers as above]
deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl
ADDED
@@ -0,0 +1,4 @@
+{"ts": "2025-10-31T20:13:50Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.3733269999629556, "p50": 3.3932979999917734, "p90": 3.4002180000243243, "mean": 3.393551400040451, "iqr": 0.010580999969533877, "raw_times": [3.3896370000547904, 3.4002180000243243, 3.3932979999917734, 3.3733269999629556, 3.411277000168411], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.4049870000671945, "peak_bytes": 5929472, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
+{"ts": "2025-10-31T20:13:51Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.99112300010529, "p50": 4.007804000139004, "p90": 4.020502999992459, "mean": 4.014501400024528, "iqr": 0.017490000118414173, "raw_times": [4.050064000011844, 4.020502999992459, 4.007804000139004, 4.003012999874045, 3.99112300010529], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.017783999870517, "peak_bytes": 15161856, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
+{"ts": "2025-10-31T20:13:51Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.004662999932407, "p50": 4.020202999981848, "p90": 4.030714000009539, "mean": 4.022331200030749, "iqr": 0.011850999953821884, "raw_times": [4.018863000055717, 4.004662999932407, 4.0372130001742335, 4.020202999981848, 4.030714000009539], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.032904000041526, "peak_bytes": 11958784, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
+{"ts": "2025-10-31T20:13:52Z", "run": "c1c013d99d9f4c1199d0a550b8476fb2", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.005022999990615, "p50": 4.020072999992408, "p90": 4.0240040000298904, "mean": 4.01746140000796, "iqr": 0.009850999958871398, "raw_times": [4.014153000071019, 4.005022999990615, 4.024053999955868, 4.0240040000298904, 4.020072999992408], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.024974000003567, "peak_bytes": 30977024, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
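Each record above follows one schema per line (impl, wl workload, env, lat_ms percentiles, compile_ms, peak_bytes, correctness under "corr"). A minimal sketch for skimming these artifacts, assuming the file path as uploaded:

    import json

    # Summarize p50 latency per workload from the JSONL above
    # (field names taken directly from the records).
    with open("deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl") as f:
        for rec in map(json.loads, f):
            print(f'{rec["impl"]:12s} {rec["wl"]["name"]:26s} '
                  f'p50={rec["lat_ms"]["p50"]:.3f} ms ok={rec["ok"]}')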
deformable_detr/impls/cells/benchmark.py
ADDED
@@ -0,0 +1,118 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
+def torch_deformable_detr(
+    value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
+):
+    """
+    PyTorch native reference implementation of multi-scale deformable attention.
+    Uses vectorized bilinear interpolation for reasonable performance.
+    """
+    bs, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+    _, _, _, channels = value.shape
+
+    output = torch.zeros(bs, num_queries, num_heads, channels, device=value.device, dtype=value.dtype)
+
+    # Split value tensor by levels
+    value_list = value.split([int(h * w) for h, w in spatial_shapes.tolist()], dim=1)
+
+    # Iterate through each level (can't avoid this loop easily)
+    for level_idx in range(num_levels):
+        h, w = spatial_shapes[level_idx].tolist()
+        value_level = value_list[level_idx]  # (bs, h*w, num_heads, channels)
+
+        # Reshape to spatial grid: (bs, num_heads, channels, h, w)
+        value_spatial = value_level.reshape(bs, h, w, num_heads, channels).permute(0, 3, 4, 1, 2)
+
+        # Get sampling locations and weights for this level
+        # loc: (bs, num_queries, num_heads, num_points, 2)
+        loc = sampling_locations[:, :, :, level_idx, :, :]
+        # weight: (bs, num_queries, num_heads, num_points)
+        weight = attention_weights[:, :, :, level_idx, :]
+
+        # Convert normalized coordinates to pixel coordinates
+        # loc[..., 0] is x (width), loc[..., 1] is y (height)
+        x = loc[..., 0] * w - 0.5  # (bs, num_queries, num_heads, num_points)
+        y = loc[..., 1] * h - 0.5
+
+        # Get integer coordinates for bilinear interpolation
+        x0 = torch.floor(x).long()
+        y0 = torch.floor(y).long()
+        x1 = x0 + 1
+        y1 = y0 + 1
+
+        # Compute interpolation weights BEFORE clamping (important!)
+        lw = x - x0.float()  # weight for x direction
+        lh = y - y0.float()  # weight for y direction
+        hw = 1 - lw
+        hh = 1 - lh
+
+        # Create mask for valid sample locations
+        valid = (y > -1) & (x > -1) & (y < h) & (x < w)
+
+        # Create masks for each corner being in bounds
+        mask_tl = ((y0 >= 0) & (x0 >= 0)).unsqueeze(-1).float()
+        mask_tr = ((y0 >= 0) & (x1 <= w - 1)).unsqueeze(-1).float()
+        mask_bl = ((y1 <= h - 1) & (x0 >= 0)).unsqueeze(-1).float()
+        mask_br = ((y1 <= h - 1) & (x1 <= w - 1)).unsqueeze(-1).float()
+
+        # Clamp coordinates for safe indexing
+        x0_clamped = torch.clamp(x0, 0, w - 1)
+        x1_clamped = torch.clamp(x1, 0, w - 1)
+        y0_clamped = torch.clamp(y0, 0, h - 1)
+        y1_clamped = torch.clamp(y1, 0, h - 1)
+
+        # Bilinear interpolation weights for all 4 corners
+        w_tl = (hh * hw).unsqueeze(-1)  # top-left: (bs, num_queries, num_heads, num_points, 1)
+        w_tr = (hh * lw).unsqueeze(-1)  # top-right
+        w_bl = (lh * hw).unsqueeze(-1)  # bottom-left
+        w_br = (lh * lw).unsqueeze(-1)  # bottom-right
+
+        # Gather values from the 4 corners using advanced indexing
+        batch_idx = torch.arange(bs, device=value.device).view(bs, 1, 1, 1).expand(bs, num_queries, num_heads, num_points)
+        head_idx = torch.arange(num_heads, device=value.device).view(1, 1, num_heads, 1).expand(bs, num_queries, num_heads, num_points)
+
+        # Gather corner values with clamped indices, then apply corner masks
+        v_tl = value_spatial[batch_idx, head_idx, :, y0_clamped, x0_clamped] * mask_tl
+        v_tr = value_spatial[batch_idx, head_idx, :, y0_clamped, x1_clamped] * mask_tr
+        v_bl = value_spatial[batch_idx, head_idx, :, y1_clamped, x0_clamped] * mask_bl
+        v_br = value_spatial[batch_idx, head_idx, :, y1_clamped, x1_clamped] * mask_br
+
+        # Bilinear interpolation
+        sampled = w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br
+
+        # Apply valid mask (only accumulate if entire sample location is valid)
+        sampled = sampled * valid.unsqueeze(-1).float()
+
+        # Apply attention weights and sum over points
+        # weight: (bs, num_queries, num_heads, num_points)
+        # Expand weight: (bs, num_queries, num_heads, num_points, 1)
+        weighted_sampled = sampled * weight.unsqueeze(-1)
+
+        # Sum over points: (bs, num_queries, num_heads, channels)
+        output += weighted_sampled.sum(dim=3)
+
+    # Flatten last two dimensions to match kernel output
+    return output.reshape(bs, num_queries, num_heads * channels)
+
+
+run_benchmark(
+    kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
+    impl_name="torch_eager",
+    impl_tags={"family": "pytorch", "backend": "eager"},
+    impl_func=torch_deformable_detr,
+    dtype="float32",
+)
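For a quick smoke test of the reference above, here is a minimal sketch using the smallest workload from the JSONL (cuda_B1_Q100_H8_E256_L4_P4); the random inputs and the softmax over points are illustrative, not part of the benchmark harness, and the function is assumed to be in scope:

    import torch

    # Shapes mirror the cuda_B1_Q100_H8_E256_L4_P4 workload recorded above.
    bs, num_queries, num_heads, embed_dim = 1, 100, 8, 256
    num_levels, num_points = 4, 4
    channels = embed_dim // num_heads  # 32
    spatial_shapes = torch.tensor([[32, 32], [16, 16], [8, 8], [4, 4]])
    areas = spatial_shapes[:, 0] * spatial_shapes[:, 1]
    level_start_index = torch.cat((areas.new_zeros(1), areas.cumsum(0)[:-1]))

    value = torch.randn(bs, int(areas.sum()), num_heads, channels)
    sampling_locations = torch.rand(bs, num_queries, num_heads, num_levels, num_points, 2)
    attention_weights = torch.softmax(
        torch.randn(bs, num_queries, num_heads, num_levels * num_points), dim=-1
    ).view(bs, num_queries, num_heads, num_levels, num_points)

    out = torch_deformable_detr(value, spatial_shapes, level_start_index,
                                sampling_locations, attention_weights)
    print(out.shape)  # torch.Size([1, 100, 256]) -- (bs, num_queries, num_heads * channels)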
deformable_detr/impls/cells/nv.py
ADDED
@@ -0,0 +1,2 @@
+import subprocess
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
deformable_detr/impls/hf_kernels_deformable_detr.html
ADDED
(diff too large to render; see raw diff)
deformable_detr/impls/index.html
ADDED
@@ -0,0 +1,89 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset='UTF-8'>
+    <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+    <title>Index of /deformable_detr/impls</title>
+    <style>
+        :root {
+            --bg-primary: #0a0a0a;
+            --bg-secondary: #121212;
+            --bg-tertiary: #181818;
+            --text-primary: #e0e0e0;
+            --text-secondary: #888888;
+            --text-link: #64b5f6;
+            --border-primary: #2a2a2a;
+        }
+        body {
+            font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
+            background: var(--bg-primary);
+            color: var(--text-primary);
+            margin: 0;
+            padding: 16px;
+            max-width: 900px;
+            margin: 0 auto;
+        }
+        .controls {
+            display: flex;
+            justify-content: flex-end;
+            margin-bottom: 1rem;
+        }
+        .back-button {
+            background: var(--bg-secondary);
+            border: 1px solid var(--border-primary);
+            padding: 8px 12px;
+            border-radius: 4px;
+            color: var(--text-secondary);
+            cursor: pointer;
+            font-size: 0.9rem;
+            text-decoration: none;
+            display: inline-block;
+        }
+        .back-button:hover {
+            color: var(--text-primary);
+            background: var(--bg-tertiary);
+        }
+        h1 {
+            font-size: 1.5em;
+            margin: 1rem 0;
+            color: var(--text-primary);
+            border-bottom: 1px solid var(--border-primary);
+            padding-bottom: 0.5rem;
+        }
+        ul {
+            list-style-type: none;
+            padding: 0;
+        }
+        li {
+            margin: 0;
+            border-bottom: 1px solid var(--border-primary);
+        }
+        li:last-child {
+            border-bottom: none;
+        }
+        a {
+            display: block;
+            padding: 0.75rem 0.5rem;
+            text-decoration: none;
+            color: var(--text-link);
+            transition: background 0.2s ease;
+        }
+        a:hover {
+            background: var(--bg-secondary);
+        }
+        .dir {
+            font-weight: 500;
+        }
+    </style>
+</head>
+<body>
+    <div class='controls'>
+        <a href='../index.html' class='back-button'>← back</a>
+    </div>
+    <h1>Index of /deformable_detr/impls</h1>
+    <ul>
+        <li><a href='hf_kernels_deformable_detr.html' class='file'>hf_kernels_deformable_detr.html</a></li>
+        <li><a href='torch_deformable_detr.html' class='file'>torch_deformable_detr.html</a></li>
+    </ul>
+</body>
+</html>
deformable_detr/impls/torch_deformable_detr.html
ADDED
(diff too large to render; see raw diff)
deformable_detr/index.html
ADDED
@@ -0,0 +1,89 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset='UTF-8'>
+    <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+    <title>Index of /deformable_detr</title>
+    <style>[same stylesheet as deformable_detr/impls/index.html above]</style>
+</head>
+<body>
+    <div class='controls'>
+        <a href='../index.html' class='back-button'>← back</a>
+    </div>
+    <h1>Index of /deformable_detr</h1>
+    <ul>
+        <li><a href='impls/index.html' class='dir'>impls/</a></li>
+        <li><a href='results/index.html' class='dir'>results/</a></li>
+    </ul>
+</body>
+</html>
deformable_detr/results/artifacts/combine/latency.svg
ADDED
(stored via Git LFS)
deformable_detr/results/cells/combine.py
ADDED
@@ -0,0 +1,26 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+from kernels_benchmark_tools.core.visuals import generate_combined_results
+
+# Map display names to uvnote environment variables
+cache_env_map = {
+    "HF Kernels Deformable DETR": "UVNOTE_FILE_HF_KERNELS_DEFORMABLE_DETR_BENCHMARK",
+    "PyTorch Deformable DETR": "UVNOTE_FILE_TORCH_DEFORMABLE_DETR_BENCHMARK",
+}
+
+# Generate combined results with visualization
+generate_combined_results(
+    cache_env_map=cache_env_map,
+    output_filename="deformable_detr.jsonl",
+    svg_filename="latency.svg"
+)
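generate_combined_results resolves each display name through the uvnote environment variable on the right; when the notebook runs, uvnote exports those variables itself. A hypothetical standalone invocation might set them by hand (the paths below are illustrative assumptions, not taken from this upload):

    import os

    # Hypothetical: point each UVNOTE_FILE_* variable at the corresponding
    # benchmark cell's artifact location before running combine.py standalone.
    os.environ["UVNOTE_FILE_HF_KERNELS_DEFORMABLE_DETR_BENCHMARK"] = "../impls/artifacts/benchmark"  # illustrative
    os.environ["UVNOTE_FILE_TORCH_DEFORMABLE_DETR_BENCHMARK"] = "../impls/artifacts/benchmark"  # illustrative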
deformable_detr/results/combined_results.html
ADDED
(diff too large to render; see raw diff)
deformable_detr/results/index.html
ADDED
@@ -0,0 +1,88 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset='UTF-8'>
+    <meta name='viewport' content='width=device-width, initial-scale=1.0'>
+    <title>Index of /deformable_detr/results</title>
+    <style>[same stylesheet as deformable_detr/impls/index.html above]</style>
+</head>
+<body>
+    <div class='controls'>
+        <a href='../index.html' class='back-button'>← back</a>
+    </div>
+    <h1>Index of /deformable_detr/results</h1>
+    <ul>
+        <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
+    </ul>
+</body>
+</html>
flash_attn/impls/artifacts/benchmark/attention.jsonl
CHANGED
@@ -1,6 +1,6 @@
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.208432000112225, "p50": 1.215130999980829, "p90": 1.2198710001030122, "mean": 1.215487200033749, "iqr": 0.006680000069536618, "raw_times": [1.2208109999392036, 1.208432000112225, 1.2198710001030122, 1.2131910000334756, 1.215130999980829], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2240119999660237, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.26713200006634, "p50": 1.2766830000146001, "p90": 1.277253000125711, "mean": 1.2749268000789016, "iqr": 0.004750000016429112, "raw_times": [1.277253000125711, 1.26713200006634, 1.2766830000146001, 1.281063000078575, 1.2725030001092819], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2717629999769997, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2928539999847999, "p50": 1.3003640001443273, "p90": 1.3163240000721999, "mean": 1.3067478000721167, "iqr": 0.01689100008661626, "raw_times": [1.3003640001443273, 1.2928539999847999, 1.2994329999855836, 1.3163240000721999, 1.3247640001736727], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.3026630001604644, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.3232850001259067, "p50": 1.3295650001055037, "p90": 1.3361950000216893, "mean": 1.332684600038192, "iqr": 0.007890999995652237, "raw_times": [1.328304000026037, 1.3361950000216893, 1.3295650001055037, 1.3232850001259067, 1.3460739999118232], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.3245140000890387, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.4790479999646777, "p50": 1.4950690001569455, "p90": 1.4989779999723396, "mean": 1.4914904000306706, "iqr": 0.017840000055002747, "raw_times": [1.5032190001420531, 1.4950690001569455, 1.4790479999646777, 1.4811379999173369, 1.4989779999723396], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.5107090000583412, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-31T20:13:46Z", "run": "0cb1af490a594cbca21d4dd4012a3c10", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.511368999899787, "p50": 1.5117090001695033, "p90": 1.512698999931672, "mean": 1.516499199988175, "iqr": 0.00113999999484804, "raw_times": [1.511368999899787, 1.512698999931672, 1.5117090001695033, 1.511558999936824, 1.5351600000030885], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.5183190000698232, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.0003566741943359375, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py
CHANGED
@@ -4,7 +4,6 @@
 # "numpy",
 # "torch==2.8.0",
 # "kernels-benchmark-tools",
-# "kernels",
 # ]
 #
 # [tool.uv.sources]
@@ -13,19 +12,18 @@
 import torch
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-from kernels import get_kernel
 
-# Load the flash attention 3 kernel
-hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
-
-
-
+def torch_flash(q, k, v):
+    qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
+    with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
+        o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
+    return o.transpose(1, 2).contiguous()
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.ATTENTION,
-    impl_name="
-    impl_tags={"family": "
-    impl_func=
+    impl_name="torch_flash_ma",
+    impl_tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
+    impl_func=torch_flash,
 )
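The new torch_flash wrapper pins SDPA to the FLASH_ATTENTION backend around tensors in (batch, seq, heads, head_dim) layout. A minimal sanity check in the spirit of the harness's correctness gate (the JSONL rows compare against an fp32 math reference at rtol/atol 0.02), assuming a CUDA device and torch_flash in scope:

    import torch
    import torch.nn.functional as F

    q = torch.randn(1, 4224, 24, 128, dtype=torch.bfloat16, device="cuda")  # cuda_attn_L128 shape
    k, v = torch.randn_like(q), torch.randn_like(q)

    out = torch_flash(q, k, v)

    # fp32 reference, transposed to/from SDPA's (B, H, S, D) layout
    ref = F.scaled_dot_product_attention(
        q.transpose(1, 2).float(), k.transpose(1, 2).float(), v.transpose(1, 2).float()
    ).transpose(1, 2).to(q.dtype)

    print((out - ref).abs().max())  # the runs above report absmax around 0.0625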
flash_attn/impls/flash_attention.html
CHANGED
|
@@ -4110,7 +4110,7 @@ Cell: nv | 0.21s
|
|
| 4110 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 4111 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 4112 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
| 4113 |
-
<a href="https://github.com/huggingface/kernels-
|
| 4114 |
</div>
|
| 4115 |
<div id="code-nv" class="cell-code" data-lines="3">
|
| 4116 |
<div class="code-wrap">
|
|
@@ -4123,7 +4123,7 @@ Cell: nv | 0.21s
|
|
| 4123 |
</div>
|
| 4124 |
</div>
|
| 4125 |
<div id="output-nv" class="cell-output">
|
| 4126 |
-
<div class="cell-stdout"><pre class="stdout-text">
|
| 4127 |
+-----------------------------------------------------------------------------------------+
|
| 4128 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 4129 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -4132,7 +4132,7 @@ Cell: nv | 0.21s
|
|
| 4132 |
| | | MIG M. |
|
| 4133 |
|=========================================+========================+======================|
|
| 4134 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 4135 |
-
| N/A
|
| 4136 |
| | | N/A |
|
| 4137 |
+-----------------------------------------+------------------------+----------------------+
|
| 4138 |
|
|
@@ -4154,13 +4154,13 @@ Cell: nv | 0.21s
|
|
| 4154 |
<span class="collapse-indicators">
|
| 4155 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 4156 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 4157 |
-
<span id="uv-indicator-benchmark"
|
| 4158 |
</span> |
|
| 4159 |
-
Cell: benchmark |
|
| 4160 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 4161 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 4162 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 4163 |
-
<a href="https://github.com/huggingface/kernels-
|
| 4164 |
</div>
|
| 4165 |
<div id="code-benchmark" class="cell-code" data-lines="29">
|
| 4166 |
<div class="code-wrap">
|
|
@@ -4207,29 +4207,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
|
|
| 4207 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4208 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4209 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4210 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4211 |
-
torch_flash_ma
|
| 4212 |
-
aten::scaled_dot_product_attention 0.
|
| 4213 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4214 |
-
aten::_flash_attention_forward 0.
|
| 4215 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4216 |
-
aten::contiguous 0.
|
| 4217 |
-
aten::clone 0.
|
| 4218 |
-
aten::copy_ 1.
|
| 4219 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4220 |
-
Activity Buffer Request
|
| 4221 |
-
aten::transpose 1.
|
| 4222 |
-
aten::as_strided 0.
|
| 4223 |
-
aten::empty_like 0.
|
| 4224 |
-
aten::empty 1.93
|
| 4225 |
-
cudaLaunchKernel 2.
|
| 4226 |
-
aten::empty_strided 0.32% 16.
|
| 4227 |
-
cudaDeviceGetAttribute 0.05% 2.
|
| 4228 |
-
cudaFuncSetAttribute 0.
|
| 4229 |
-
cudaDeviceSynchronize
|
| 4230 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4231 |
-
Self CPU time total: 5.
|
| 4232 |
-
Self CUDA time total: 3.
|
| 4233 |
|
| 4234 |
|
| 4235 |
|
|
@@ -4239,29 +4239,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
|
|
| 4239 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4240 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4241 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4242-4264 | 
- 
[old torch_flash_ma profiler table for cuda_attn_L256_bfloat16 (per-op timings and Self CPU/CUDA totals); the removed values are truncated in this diff view, and the full updated table appears in the added lines below] 
| 
| 4265 |
|
| 4266 |
|
| 4267 |
|
|
@@ -4271,29 +4271,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
|
|
| 4271 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4272 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4273 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4274-4296 | 
- 
[old torch_flash_ma profiler table for cuda_attn_L320_bfloat16 (per-op timings and Self CPU/CUDA totals); the removed values are truncated in this diff view, and the full updated table appears in the added lines below] 
| 
| 4297 |
|
| 4298 |
|
| 4299 |
|
|
@@ -4303,29 +4303,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
|
|
| 4303 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4304 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4305 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4306-4328 | 
- 
[old torch_flash_ma profiler table for cuda_attn_L384_bfloat16 (per-op timings and Self CPU/CUDA totals); the removed values are truncated in this diff view, and the full updated table appears in the added lines below] 
| 
| 4329 |
|
| 4330 |
|
| 4331 |
|
|
@@ -4335,29 +4335,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
|
|
| 4335 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4336 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4337 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4338-4360 | 
- 
[old torch_flash_ma profiler table for cuda_attn_L448_bfloat16 (per-op timings and Self CPU/CUDA totals); the removed values are truncated in this diff view, and the full updated table appears in the added lines below] 
| 
| 4361 |
|
| 4362 |
|
| 4363 |
|
|
@@ -4367,45 +4367,39 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
|
|
| 4367 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4368 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4369 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4370-4392 | 
- 
[old torch_flash_ma profiler table for cuda_attn_L512_bfloat16 (per-op timings and Self CPU/CUDA totals); the removed values are truncated in this diff view, and the full updated table appears in the added lines below] 
| 
| 4393 |
|
| 4394 |
|
| 4395 |
impl wl p50(ms) ok
|
| 4396 |
torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
|
| 4397-4401 | 
- 
[old p50(ms) summary rows for cuda_attn_L256..L512; the removed values are truncated in this diff view, and the updated rows appear in the added lines below] 
| 
| 4402 |
</pre></div>
|
| 4403 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4404 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4405 |
-
<div class="uv-logs-content" style="display: none;">
|
| 4406 |
-
Installed 37 packages in 225ms
|
| 4407 |
-
</div>
|
| 4408 |
-
</div>
|
| 4409 |
<div class="cell-artifacts">
|
| 4410 |
<h4>Artifacts:</h4>
|
| 4411 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 4110 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 4111 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 4112 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
| 4113 |
+
<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/flash_attention.md" target="_blank" class="github-btn">GitHub</a>
|
| 4114 |
</div>
|
| 4115 |
<div id="code-nv" class="cell-code" data-lines="3">
|
| 4116 |
<div class="code-wrap">
|
|
|
|
| 4123 |
</div>
|
| 4124 |
</div>
|
| 4125 |
<div id="output-nv" class="cell-output">
|
| 4126 |
+
<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:13:43 2025
|
| 4127 |
+-----------------------------------------------------------------------------------------+
|
| 4128 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 4129 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 4132 |
| | | MIG M. |
|
| 4133 |
|=========================================+========================+======================|
|
| 4134 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 4135 |
+
| N/A 43C P0 83W / 350W | 0MiB / 46068MiB | 11% Default |
|
| 4136 |
| | | N/A |
|
| 4137 |
+-----------------------------------------+------------------------+----------------------+
|
| 4138 |
|
|
|
|
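Note on the nv cell: its stdout above is a plain nvidia-smi snapshot taken before the benchmark runs. A minimal sketch of what the three-line cells/nv.py (data-lines="3") presumably does; the script itself is not part of this diff, so this is a hypothetical reconstruction:

    # Hypothetical reconstruction of cells/nv.py:
    # capture the GPU state shown in the cell output above.
    import subprocess

    print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
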
| 4154 |
<span class="collapse-indicators">
|
| 4155 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 4156 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 4157 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 4158 |
</span> |
|
| 4159 |
+
Cell: benchmark | 3.87s
|
| 4160 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 4161 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 4162 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 4163 |
+
<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/flash_attention.md" target="_blank" class="github-btn">GitHub</a>
|
| 4164 |
</div>
|
| 4165 |
<div id="code-benchmark" class="cell-code" data-lines="29">
|
| 4166 |
<div class="code-wrap">
|
|
|
|
| 4207 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4208 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4209 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4210 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.600ms 101.99% 3.600ms 3.600ms 1
|
| 4211 |
+
torch_flash_ma 6.70% 350.157us 46.68% 2.439ms 2.439ms 0.000us 0.00% 3.570ms 3.570ms 1
|
| 4212 |
+
aten::scaled_dot_product_attention 0.81% 42.281us 4.26% 222.626us 74.209us 0.000us 0.00% 2.816ms 938.781us 3
|
| 4213 |
+
aten::_scaled_dot_product_flash_attention 0.52% 27.002us 3.45% 180.345us 60.115us 0.000us 0.00% 2.816ms 938.781us 3
|
| 4214 |
+
aten::_flash_attention_forward 0.79% 41.210us 2.54% 132.453us 44.151us 2.816ms 79.78% 2.816ms 938.781us 3
|
| 4215 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.816ms 79.78% 2.816ms 938.781us 3
|
| 4216 |
+
aten::contiguous 0.29% 15.041us 34.44% 1.800ms 149.962us 0.000us 0.00% 753.884us 62.824us 12
|
| 4217 |
+
aten::clone 0.75% 38.969us 34.15% 1.785ms 148.709us 0.000us 0.00% 753.884us 62.824us 12
|
| 4218 |
+
aten::copy_ 1.73% 90.324us 31.78% 1.661ms 138.388us 713.788us 20.22% 753.884us 62.824us 12
|
| 4219 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 713.788us 20.22% 713.788us 59.482us 12
|
| 4220 |
+
Activity Buffer Request 28.08% 1.467ms 28.08% 1.467ms 1.467ms 40.096us 1.14% 40.096us 40.096us 1
|
| 4221 |
+
aten::transpose 1.25% 65.371us 1.68% 87.543us 3.648us 0.000us 0.00% 0.000us 0.000us 24
|
| 4222 |
+
aten::as_strided 0.42% 22.172us 0.42% 22.172us 0.924us 0.000us 0.00% 0.000us 0.000us 24
|
| 4223 |
+
aten::empty_like 0.53% 27.463us 2.06% 107.524us 7.168us 0.000us 0.00% 0.000us 0.000us 15
|
| 4224 |
+
aten::empty 1.78% 93.220us 1.78% 93.220us 3.884us 0.000us 0.00% 0.000us 0.000us 24
|
| 4225 |
+
cudaLaunchKernel 2.49% 130.035us 2.49% 130.035us 8.669us 0.000us 0.00% 0.000us 0.000us 15
|
| 4226 |
+
aten::empty_strided 0.32% 16.730us 0.32% 16.730us 5.577us 0.000us 0.00% 0.000us 0.000us 3
|
| 4227 |
+
cudaDeviceGetAttribute 0.05% 2.690us 0.05% 2.690us 0.448us 0.000us 0.00% 0.000us 0.000us 6
|
| 4228 |
+
cudaFuncSetAttribute 0.17% 9.000us 0.17% 9.000us 3.000us 0.000us 0.00% 0.000us 0.000us 3
|
| 4229 |
+
cudaDeviceSynchronize 53.32% 2.786ms 53.32% 2.786ms 2.786ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4230 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4231 |
+
Self CPU time total: 5.225ms
|
| 4232 |
+
Self CUDA time total: 3.530ms
|
| 4233 |
|
| 4234 |
|
| 4235 |
|
|
|
|
| 4239 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4240 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4241 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4242 |
+
torch_flash_ma 4.88% 260.255us 42.26% 2.252ms 2.252ms 0.000us 0.00% 3.798ms 3.798ms 1
|
| 4243 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.753ms 100.28% 3.753ms 3.753ms 1
|
| 4244 |
+
aten::scaled_dot_product_attention 0.49% 25.890us 3.50% 186.735us 62.245us 0.000us 0.00% 2.976ms 991.858us 3
|
| 4245 |
+
aten::_scaled_dot_product_flash_attention 0.33% 17.842us 3.02% 160.845us 53.615us 0.000us 0.00% 2.976ms 991.858us 3
|
| 4246 |
+
aten::_flash_attention_forward 0.74% 39.289us 2.26% 120.363us 40.121us 2.976ms 79.51% 2.976ms 991.858us 3
|
| 4247 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.976ms 79.51% 2.976ms 991.858us 3
|
| 4248 |
+
aten::contiguous 0.20% 10.403us 33.03% 1.760ms 146.680us 0.000us 0.00% 822.042us 68.504us 12
|
| 4249 |
+
aten::clone 0.53% 28.238us 32.84% 1.750ms 145.813us 0.000us 0.00% 822.042us 68.504us 12
|
| 4250 |
+
aten::copy_ 1.51% 80.312us 31.12% 1.659ms 138.210us 766.874us 20.49% 822.042us 68.504us 12
|
| 4251 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 766.874us 20.49% 766.874us 63.906us 12
|
| 4252 |
+
Activity Buffer Request 28.02% 1.493ms 28.02% 1.493ms 1.493ms 55.168us 1.47% 55.168us 55.168us 1
|
| 4253 |
+
aten::transpose 0.94% 50.313us 1.27% 67.673us 2.820us 0.000us 0.00% 0.000us 0.000us 24
|
| 4254 |
+
aten::as_strided 0.33% 17.360us 0.33% 17.360us 0.723us 0.000us 0.00% 0.000us 0.000us 24
|
| 4255 |
+
aten::empty_like 0.40% 21.528us 1.56% 83.370us 5.558us 0.000us 0.00% 0.000us 0.000us 15
|
| 4256 |
+
aten::empty 1.43% 76.263us 1.43% 76.263us 3.178us 0.000us 0.00% 0.000us 0.000us 24
|
| 4257 |
+
cudaLaunchKernel 2.08% 110.943us 2.08% 110.943us 7.396us 0.000us 0.00% 0.000us 0.000us 15
|
| 4258 |
+
aten::empty_strided 0.27% 14.621us 0.27% 14.621us 4.874us 0.000us 0.00% 0.000us 0.000us 3
|
| 4259 |
+
cudaDeviceGetAttribute 0.03% 1.781us 0.03% 1.781us 0.297us 0.000us 0.00% 0.000us 0.000us 6
|
| 4260 |
+
cudaFuncSetAttribute 0.08% 4.011us 0.08% 4.011us 1.337us 0.000us 0.00% 0.000us 0.000us 3
|
| 4261 |
+
cudaDeviceSynchronize 57.74% 3.077ms 57.74% 3.077ms 3.077ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4262 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4263 |
+
Self CPU time total: 5.329ms
|
| 4264 |
+
Self CUDA time total: 3.742ms
|
| 4265 |
|
| 4266 |
|
| 4267 |
|
|
|
|
| 4271 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4272 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4273 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4274 |
+
torch_flash_ma 4.87% 262.676us 41.62% 2.245ms 2.245ms 0.000us 0.00% 3.882ms 3.882ms 1
|
| 4275 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.834ms 100.29% 3.834ms 3.834ms 1
|
| 4276 |
+
aten::scaled_dot_product_attention 0.50% 26.770us 3.49% 188.015us 62.672us 0.000us 0.00% 3.044ms 1.015ms 3
|
| 4277 |
+
aten::_scaled_dot_product_flash_attention 0.35% 18.803us 2.99% 161.245us 53.748us 0.000us 0.00% 3.044ms 1.015ms 3
|
| 4278 |
+
aten::_flash_attention_forward 0.74% 39.829us 2.21% 119.102us 39.701us 3.044ms 79.61% 3.044ms 1.015ms 3
|
| 4279 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.044ms 79.61% 3.044ms 1.015ms 3
|
| 4280 |
+
aten::contiguous 0.18% 9.451us 32.36% 1.746ms 145.465us 0.000us 0.00% 838.367us 69.864us 12
|
| 4281 |
+
aten::clone 0.54% 28.881us 32.18% 1.736ms 144.678us 0.000us 0.00% 838.367us 69.864us 12
|
| 4282 |
+
aten::copy_ 1.51% 81.201us 30.48% 1.644ms 137.016us 779.615us 20.39% 838.367us 69.864us 12
|
| 4283 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 779.615us 20.39% 779.615us 64.968us 12
|
| 4284 |
+
Activity Buffer Request 27.31% 1.473ms 27.31% 1.473ms 1.473ms 58.752us 1.54% 58.752us 58.752us 1
|
| 4285 |
+
aten::transpose 1.01% 54.592us 1.34% 72.471us 3.020us 0.000us 0.00% 0.000us 0.000us 24
|
| 4286 |
+
aten::as_strided 0.33% 17.879us 0.33% 17.879us 0.745us 0.000us 0.00% 0.000us 0.000us 24
|
| 4287 |
+
aten::empty_like 0.37% 20.117us 1.53% 82.751us 5.517us 0.000us 0.00% 0.000us 0.000us 15
|
| 4288 |
+
aten::empty 1.41% 76.295us 1.41% 76.295us 3.179us 0.000us 0.00% 0.000us 0.000us 24
|
| 4289 |
+
cudaLaunchKernel 2.13% 114.795us 2.13% 114.795us 7.653us 0.000us 0.00% 0.000us 0.000us 15
|
| 4290 |
+
aten::empty_strided 0.27% 14.801us 0.27% 14.801us 4.934us 0.000us 0.00% 0.000us 0.000us 3
|
| 4291 |
+
cudaDeviceGetAttribute 0.04% 2.110us 0.04% 2.110us 0.352us 0.000us 0.00% 0.000us 0.000us 6
|
| 4292 |
+
cudaFuncSetAttribute 0.07% 3.990us 0.07% 3.990us 1.330us 0.000us 0.00% 0.000us 0.000us 3
|
| 4293 |
+
cudaDeviceSynchronize 58.38% 3.149ms 58.38% 3.149ms 3.149ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4294 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4295 |
+
Self CPU time total: 5.395ms
|
| 4296 |
+
Self CUDA time total: 3.823ms
|
| 4297 |
|
| 4298 |
|
| 4299 |
|
|
|
|
| 4303 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4304 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4305 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4306 |
+
torch_flash_ma 4.61% 261.106us 43.54% 2.469ms 2.469ms 0.000us 0.00% 3.945ms 3.945ms 1
|
| 4307 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.898ms 100.28% 3.898ms 3.898ms 1
|
| 4308 |
+
aten::scaled_dot_product_attention 0.46% 26.241us 3.40% 192.654us 64.218us 0.000us 0.00% 3.100ms 1.033ms 3
|
| 4309 |
+
aten::_scaled_dot_product_flash_attention 0.34% 19.509us 2.94% 166.413us 55.471us 0.000us 0.00% 3.100ms 1.033ms 3
|
| 4310 |
+
aten::_flash_attention_forward 0.74% 42.081us 2.16% 122.633us 40.878us 3.100ms 79.76% 3.100ms 1.033ms 3
|
| 4311 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.100ms 79.76% 3.100ms 1.033ms 3
|
| 4312 |
+
aten::contiguous 0.20% 11.161us 34.71% 1.968ms 163.994us 0.000us 0.00% 844.704us 70.392us 12
|
| 4313 |
+
aten::clone 0.52% 29.682us 34.51% 1.957ms 163.064us 0.000us 0.00% 844.704us 70.392us 12
|
| 4314 |
+
aten::copy_ 1.45% 82.261us 32.81% 1.860ms 155.026us 786.784us 20.24% 844.704us 70.392us 12
|
| 4315 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 786.784us 20.24% 786.784us 65.565us 12
|
| 4316 |
+
Activity Buffer Request 26.26% 1.489ms 26.26% 1.489ms 1.489ms 57.920us 1.49% 57.920us 57.920us 1
|
| 4317 |
+
aten::transpose 0.95% 53.820us 1.26% 71.322us 2.972us 0.000us 0.00% 0.000us 0.000us 24
|
| 4318 |
+
aten::as_strided 0.31% 17.502us 0.31% 17.502us 0.729us 0.000us 0.00% 0.000us 0.000us 24
|
| 4319 |
+
aten::empty_like 0.39% 21.943us 1.53% 86.983us 5.799us 0.000us 0.00% 0.000us 0.000us 15
|
| 4320 |
+
aten::empty 1.40% 79.202us 1.40% 79.202us 3.300us 0.000us 0.00% 0.000us 0.000us 24
|
| 4321 |
+
cudaLaunchKernel 5.55% 314.487us 5.55% 314.487us 20.966us 0.000us 0.00% 0.000us 0.000us 15
|
| 4322 |
+
aten::empty_strided 0.26% 14.830us 0.26% 14.830us 4.943us 0.000us 0.00% 0.000us 0.000us 3
|
| 4323 |
+
cudaDeviceGetAttribute 0.04% 2.010us 0.04% 2.010us 0.335us 0.000us 0.00% 0.000us 0.000us 6
|
| 4324 |
+
cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3
|
| 4325 |
+
cudaDeviceSynchronize 56.46% 3.201ms 56.46% 3.201ms 3.201ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4326 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4327 |
+
Self CPU time total: 5.670ms
|
| 4328 |
+
Self CUDA time total: 3.887ms
|
| 4329 |
|
| 4330 |
|
| 4331 |
|
|
|
|
| 4335 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4336 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4337 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4338 |
+
torch_flash_ma 5.12% 312.519us 40.82% 2.493ms 2.493ms 0.000us 0.00% 4.416ms 4.416ms 1
|
| 4339 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.365ms 100.24% 4.365ms 4.365ms 1
|
| 4340 |
+
aten::scaled_dot_product_attention 0.42% 25.922us 3.20% 195.246us 65.082us 0.000us 0.00% 3.547ms 1.182ms 3
|
| 4341 |
+
aten::_scaled_dot_product_flash_attention 0.34% 20.847us 2.77% 169.324us 56.441us 0.000us 0.00% 3.547ms 1.182ms 3
|
| 4342 |
+
aten::_flash_attention_forward 0.72% 44.243us 2.07% 126.303us 42.101us 3.547ms 81.45% 3.547ms 1.182ms 3
|
| 4343 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.547ms 81.45% 3.547ms 1.182ms 3
|
| 4344 |
+
aten::contiguous 0.17% 10.559us 31.73% 1.938ms 161.473us 0.000us 0.00% 869.122us 72.427us 12
|
| 4345 |
+
aten::clone 0.47% 28.763us 31.56% 1.927ms 160.593us 0.000us 0.00% 869.122us 72.427us 12
|
| 4346 |
+
aten::copy_ 1.36% 83.033us 30.01% 1.832ms 152.707us 807.906us 18.55% 869.122us 72.427us 12
|
| 4347 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 807.906us 18.55% 807.906us 67.326us 12
|
| 4348 |
+
Activity Buffer Request 24.51% 1.497ms 24.51% 1.497ms 1.497ms 61.216us 1.41% 61.216us 61.216us 1
|
| 4349 |
+
aten::transpose 0.85% 52.195us 1.14% 69.864us 2.911us 0.000us 0.00% 0.000us 0.000us 24
|
| 4350 |
+
aten::as_strided 0.29% 17.669us 0.29% 17.669us 0.736us 0.000us 0.00% 0.000us 0.000us 24
|
| 4351 |
+
aten::empty_like 0.34% 20.921us 1.44% 87.791us 5.853us 0.000us 0.00% 0.000us 0.000us 15
|
| 4352 |
+
aten::empty 1.30% 79.270us 1.30% 79.270us 3.303us 0.000us 0.00% 0.000us 0.000us 24
|
| 4353 |
+
cudaLaunchKernel 4.55% 277.575us 4.55% 277.575us 18.505us 0.000us 0.00% 0.000us 0.000us 15
|
| 4354 |
+
aten::empty_strided 0.27% 16.520us 0.27% 16.520us 5.507us 0.000us 0.00% 0.000us 0.000us 3
|
| 4355 |
+
cudaDeviceGetAttribute 0.03% 1.960us 0.03% 1.960us 0.327us 0.000us 0.00% 0.000us 0.000us 6
|
| 4356 |
+
cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3
|
| 4357 |
+
cudaDeviceSynchronize 59.18% 3.614ms 59.18% 3.614ms 3.614ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4358 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4359 |
+
Self CPU time total: 6.107ms
|
| 4360 |
+
Self CUDA time total: 4.355ms
|
| 4361 |
|
| 4362 |
|
| 4363 |
|
|
|
|
| 4367 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4368 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4369 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4370 |
+
torch_flash_ma 3.85% 236.256us 38.02% 2.335ms 2.335ms 0.000us 0.00% 4.535ms 4.535ms 1
|
| 4371 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.485ms 100.25% 4.485ms 4.485ms 1
|
| 4372 |
+
aten::scaled_dot_product_attention 0.43% 26.452us 2.98% 183.275us 61.092us 0.000us 0.00% 3.655ms 1.218ms 3
|
| 4373 |
+
aten::_scaled_dot_product_flash_attention 0.30% 18.620us 2.55% 156.823us 52.274us 0.000us 0.00% 3.655ms 1.218ms 3
|
| 4374 |
+
aten::_flash_attention_forward 0.59% 36.060us 1.88% 115.323us 38.441us 3.655ms 81.69% 3.655ms 1.218ms 3
|
| 4375 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 81.69% 3.655ms 1.218ms 3
|
| 4376 |
+
aten::contiguous 0.16% 9.770us 30.40% 1.867ms 155.567us 0.000us 0.00% 880.065us 73.339us 12
|
| 4377 |
+
aten::clone 0.46% 28.179us 30.24% 1.857ms 154.753us 0.000us 0.00% 880.065us 73.339us 12
|
| 4378 |
+
aten::copy_ 1.36% 83.563us 28.74% 1.765ms 147.054us 819.137us 18.31% 880.065us 73.339us 12
|
| 4379 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 819.137us 18.31% 819.137us 68.261us 12
|
| 4380 |
+
Activity Buffer Request 23.24% 1.427ms 23.24% 1.427ms 1.427ms 60.928us 1.36% 60.928us 60.928us 1
|
| 4381 |
+
aten::transpose 0.86% 52.980us 1.16% 71.060us 2.961us 0.000us 0.00% 0.000us 0.000us 24
|
| 4382 |
+
aten::as_strided 0.29% 18.080us 0.29% 18.080us 0.753us 0.000us 0.00% 0.000us 0.000us 24
|
| 4383 |
+
aten::empty_like 0.34% 20.930us 1.37% 83.913us 5.594us 0.000us 0.00% 0.000us 0.000us 15
|
| 4384 |
+
aten::empty 1.25% 77.043us 1.25% 77.043us 3.210us 0.000us 0.00% 0.000us 0.000us 24
|
| 4385 |
+
cudaLaunchKernel 4.54% 278.990us 4.54% 278.990us 18.599us 0.000us 0.00% 0.000us 0.000us 15
|
| 4386 |
+
aten::empty_strided 0.24% 14.661us 0.24% 14.661us 4.887us 0.000us 0.00% 0.000us 0.000us 3
|
| 4387 |
+
cudaDeviceGetAttribute 0.03% 1.978us 0.03% 1.978us 0.330us 0.000us 0.00% 0.000us 0.000us 6
|
| 4388 |
+
cudaFuncSetAttribute 0.06% 3.901us 0.06% 3.901us 1.300us 0.000us 0.00% 0.000us 0.000us 3
|
| 4389 |
+
cudaDeviceSynchronize 61.98% 3.806ms 61.98% 3.806ms 3.806ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4390 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4391 |
+
Self CPU time total: 6.141ms
|
| 4392 |
+
Self CUDA time total: 4.474ms
|
| 4393 |
|
| 4394 |
|
| 4395 |
impl wl p50(ms) ok
|
| 4396 |
torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
|
| 4397 |
+
torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
|
| 4398 |
+
torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
|
| 4399 |
+
torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
|
| 4400 |
+
torch_flash_ma cuda_attn_L448_bfloat16 1.50 True
|
| 4401 |
+
torch_flash_ma cuda_attn_L512_bfloat16 1.51 True
|
| 4402 |
</pre></div>
|
| 4403 |
<div class="cell-artifacts">
|
| 4404 |
<h4>Artifacts:</h4>
|
| 4405 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
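Note on the torch_flash_ma traces in this file: they are ordinary torch.profiler tables wrapped around PyTorch's scaled_dot_product_attention. A minimal sketch that produces a trace of the same shape; tensor sizes are illustrative and not taken from the benchmark, whose real harness lives in cells/benchmark.py:

    import torch
    from torch.profiler import profile, ProfilerActivity

    # Illustrative bfloat16 attention inputs; the benchmark sweeps seq_len 128..512.
    q = torch.randn(1, 16, 512, 64, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for _ in range(3):  # the tables above report "# of Calls" = 3 per op
            out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
        torch.cuda.synchronize()  # appears as cudaDeviceSynchronize in the trace

    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
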
flash_attn/impls/hf_kernels_flash_attn.html
CHANGED
|
@@ -4104,14 +4104,14 @@ body[data-tool="eraser"] .main-content {
|
|
| 4104 |
<span class="collapse-indicators">
|
| 4105 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 4106 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 4107 |
-
<span id="uv-indicator-benchmark" …
|
| 4108 |
</span> |
|
| 4109 |
-
Cell: benchmark | …
|
| 4110 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 4111 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 4112 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 4113 |
-
<a href="https://github.com/huggingface/kernels-…
|
| 4114 |
-
<a href="https://huggingface.co/kernels-community/flash-…
|
| 4115 |
</div>
|
| 4116 |
<div id="code-benchmark" class="cell-code" data-lines="32">
|
| 4117 |
<div class="code-wrap">
|
|
@@ -4161,21 +4161,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
|
|
| 4161 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4162 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4163 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4164-4178 | 
- 
[old hf_kernels_flash_attn profiler table for cuda_attn_L128_bfloat16 (per-op timings and Self CPU/CUDA totals); the removed values are truncated in this diff view, and the full updated table appears in the added lines below] 
| 
| 4179 |
|
| 4180 |
|
| 4181 |
|
|
@@ -4185,21 +4185,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
|
|
| 4185 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4186 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4187 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4188-4202 | 
- 
[old hf_kernels_flash_attn profiler table for cuda_attn_L256_bfloat16 (per-op timings and Self CPU/CUDA totals); the removed values are truncated in this diff view, and the full updated table appears in the added lines below] 
| 
| 4203 |
|
| 4204 |
|
| 4205 |
|
|
@@ -4209,21 +4209,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
|
|
| 4209 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4210 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4211 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4212-4226 | 
- 
[old hf_kernels_flash_attn profiler table for cuda_attn_L320_bfloat16 (per-op timings and Self CPU/CUDA totals); the removed values are truncated in this diff view, and the full updated table appears in the added lines below] 
| 
| 4227 |
|
| 4228 |
|
| 4229 |
|
|
@@ -4233,21 +4233,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
|
|
| 4233 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4234 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4235 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4236-4250 | 
- 
[old hf_kernels_flash_attn profiler table for cuda_attn_L384_bfloat16 (per-op timings and Self CPU/CUDA totals); the removed values are truncated in this diff view, and the full updated table appears in the added lines below] 
| 
| 4251 |
|
| 4252 |
|
| 4253 |
|
|
@@ -4257,21 +4257,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
|
|
| 4257 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4258 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4259 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4260-4274 | 
- 
[old hf_kernels_flash_attn profiler table for cuda_attn_L448_bfloat16 (per-op timings and Self CPU/CUDA totals); the removed values are truncated in this diff view, and the full updated table appears in the added lines below] 
| 
| 4275 |
|
| 4276 |
|
| 4277 |
|
|
@@ -4281,41 +4281,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
|
|
| 4281 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4282 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4283 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4284-4298 | 
- 
[old hf_kernels_flash_attn profiler table for cuda_attn_L512_bfloat16 (per-op timings and Self CPU/CUDA totals); the removed values are truncated in this diff view, and the full updated table appears in the added lines below] 
| 
| 4299 |
|
| 4300 |
|
| 4301 |
impl wl p50(ms) ok
|
| 4302-4306 | 
- 
[old p50(ms) summary rows for cuda_attn_L128..L448; the removed values are truncated in this diff view, and the updated rows appear in the added lines below] 
| 
| 4307 |
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
|
| 4308 |
</pre></div>
|
| 4309 |
- 
<div class="…
|
| 4310 |
-
|
| 4311 |
-
|
| 4312 |
-
|
| 4313 |
</div>
|
| 4314 |
-
</div>
|
| 4315 |
-
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4316 |
-
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:02, 8.29it/s]
|
| 4317 |
-
Fetching 20 files: 10%|█ | 2/20 [00:06<01:08, 3.82s/it]
|
| 4318 |
-
Fetching 20 files: 100%|██████████| 20/20 [00:06<00:00, 3.06it/s]</div>
|
| 4319 |
<div class="cell-artifacts">
|
| 4320 |
<h4>Artifacts:</h4>
|
| 4321 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 4104 |
<span class="collapse-indicators">
|
| 4105 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 4106 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 4107 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 4108 |
</span> |
|
| 4109 |
+
Cell: benchmark | 5.83s
|
| 4110 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 4111 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 4112 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 4113 |
+
<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/hf_kernels_flash_attn.md" target="_blank" class="github-btn">GitHub</a>
|
| 4114 |
+
<a href="https://huggingface.co/kernels-community/flash-attn" target="_blank" class="hf-btn">🤗 HF</a>
|
| 4115 |
</div>
|
| 4116 |
<div id="code-benchmark" class="cell-code" data-lines="32">
|
| 4117 |
<div class="code-wrap">
|
|
|
|
| 4161 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4162 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4163 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4164 |
+
hf_kernels_flash_attn 3.51% 153.413us 41.11% 1.797ms 1.797ms 0.000us 0.00% 3.733ms 3.733ms 1
|
| 4165 |
+
_flash_attn_9e27194::fwd 1.62% 70.702us 37.60% 1.644ms 547.894us 2.785ms 100.00% 3.733ms 1.244ms 3
|
| 4166 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.786ms 100.05% 2.786ms 2.786ms 1
|
| 4167 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.785ms 100.00% 2.785ms 928.303us 3
|
| 4168 |
+
Activity Buffer Request 32.92% 1.439ms 32.92% 1.439ms 1.439ms 947.706us 34.03% 947.706us 947.706us 1
|
| 4169 |
+
cudaDeviceGetAttribute 0.11% 4.891us 0.11% 4.891us 0.326us 0.000us 0.00% 0.000us 0.000us 15
|
| 4170 |
+
aten::empty_like 0.37% 16.181us 1.17% 51.061us 17.020us 0.000us 0.00% 0.000us 0.000us 3
|
| 4171 |
+
aten::empty_strided 0.80% 34.880us 0.80% 34.880us 11.627us 0.000us 0.00% 0.000us 0.000us 3
|
| 4172 |
+
aten::empty 0.59% 25.681us 0.59% 25.681us 2.853us 0.000us 0.00% 0.000us 0.000us 9
|
| 4173 |
+
cudaFuncSetAttribute 0.26% 11.340us 0.26% 11.340us 3.780us 0.000us 0.00% 0.000us 0.000us 3
|
| 4174 |
+
cudaLaunchKernel 0.93% 40.731us 0.93% 40.731us 13.577us 0.000us 0.00% 0.000us 0.000us 3
|
| 4175 |
+
cudaDeviceSynchronize 58.89% 2.575ms 58.89% 2.575ms 2.575ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4176 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4177 |
+
Self CPU time total: 4.372ms
|
| 4178 |
+
Self CUDA time total: 2.785ms
|
| 4179 |
|
| 4180 |
|
| 4181 |
|
|
|
|
| 4185 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4186 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4187 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4188 |
+
hf_kernels_flash_attn 1.94% 86.682us 37.50% 1.676ms 1.676ms 0.000us 0.00% 3.929ms 3.929ms 1
|
| 4189 |
+
_flash_attn_9e27194::fwd 1.06% 47.570us 35.56% 1.589ms 529.734us 2.938ms 100.00% 3.929ms 1.310ms 3
|
| 4190 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.939ms 100.05% 2.939ms 2.939ms 1
|
| 4191 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.938ms 100.00% 2.938ms 979.209us 3
|
| 4192 |
+
Activity Buffer Request 32.66% 1.460ms 32.66% 1.460ms 1.460ms 991.166us 33.74% 991.166us 991.166us 1
|
| 4193 |
+
cudaDeviceGetAttribute 0.10% 4.450us 0.10% 4.450us 0.297us 0.000us 0.00% 0.000us 0.000us 15
|
| 4194 |
+
aten::empty_like 0.19% 8.440us 0.55% 24.690us 8.230us 0.000us 0.00% 0.000us 0.000us 3
|
| 4195 |
+
aten::empty_strided 0.36% 16.250us 0.36% 16.250us 5.417us 0.000us 0.00% 0.000us 0.000us 3
|
| 4196 |
+
aten::empty 0.51% 22.872us 0.51% 22.872us 2.541us 0.000us 0.00% 0.000us 0.000us 9
|
| 4197 |
+
cudaFuncSetAttribute 0.07% 3.350us 0.07% 3.350us 1.117us 0.000us 0.00% 0.000us 0.000us 3
|
| 4198 |
+
cudaLaunchKernel 0.60% 26.611us 0.60% 26.611us 8.870us 0.000us 0.00% 0.000us 0.000us 3
|
| 4199 |
+
cudaDeviceSynchronize 62.50% 2.794ms 62.50% 2.794ms 2.794ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4200 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4201 |
+
Self CPU time total: 4.469ms
|
| 4202 |
+
Self CUDA time total: 2.938ms
|
| 4203 |
|
| 4204 |
|
| 4205 |
|
|
|
|
| 4209 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4210 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4211 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4212 |
+
hf_kernels_flash_attn 2.38% 109.313us 36.70% 1.683ms 1.683ms 0.000us 0.00% 4.081ms 4.081ms 1
|
| 4213 |
+
_flash_attn_9e27194::fwd 1.05% 48.167us 34.31% 1.574ms 524.567us 3.048ms 100.00% 4.081ms 1.360ms 3
|
| 4214 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.049ms 100.05% 3.049ms 3.049ms 1
|
| 4215 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.048ms 100.00% 3.048ms 1.016ms 3
|
| 4216 |
+
Activity Buffer Request 31.46% 1.443ms 31.46% 1.443ms 1.443ms 1.033ms 33.90% 1.033ms 1.033ms 1
|
| 4217 |
+
cudaDeviceGetAttribute 0.09% 4.231us 0.09% 4.231us 0.282us 0.000us 0.00% 0.000us 0.000us 15
|
| 4218 |
+
aten::empty_like 0.16% 7.250us 0.52% 23.960us 7.987us 0.000us 0.00% 0.000us 0.000us 3
|
| 4219 |
+
aten::empty_strided 0.36% 16.710us 0.36% 16.710us 5.570us 0.000us 0.00% 0.000us 0.000us 3
|
| 4220 |
+
aten::empty 0.46% 21.300us 0.46% 21.300us 2.367us 0.000us 0.00% 0.000us 0.000us 9
|
| 4221 |
+
cudaFuncSetAttribute 0.08% 3.561us 0.08% 3.561us 1.187us 0.000us 0.00% 0.000us 0.000us 3
|
| 4222 |
+
cudaLaunchKernel 0.64% 29.473us 0.64% 29.473us 9.824us 0.000us 0.00% 0.000us 0.000us 3
|
| 4223 |
+
cudaDeviceSynchronize 63.30% 2.903ms 63.30% 2.903ms 2.903ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4224 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4225 |
+
Self CPU time total: 4.586ms
|
| 4226 |
+
Self CUDA time total: 3.048ms
|
| 4227 |
|
| 4228 |
|
| 4229 |
|
|
|
|
| 4233 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4234 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4235 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4236 |
+
hf_kernels_flash_attn 2.13% 103.094us 38.83% 1.884ms 1.884ms 0.000us 0.00% 4.165ms 4.165ms 1
|
| 4237 |
+
_flash_attn_9e27194::fwd 0.99% 47.838us 36.71% 1.781ms 593.521us 3.114ms 100.00% 4.165ms 1.388ms 3
|
| 4238 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.116ms 100.05% 3.116ms 3.116ms 1
|
| 4239 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.114ms 100.00% 3.114ms 1.038ms 3
|
| 4240 |
+
Activity Buffer Request 29.59% 1.435ms 29.59% 1.435ms 1.435ms 1.051ms 33.75% 1.051ms 1.051ms 1
|
| 4241 |
+
cudaDeviceGetAttribute 0.08% 3.800us 0.08% 3.800us 0.253us 0.000us 0.00% 0.000us 0.000us 15
|
| 4242 |
+
aten::empty_like 0.16% 7.891us 0.53% 25.811us 8.604us 0.000us 0.00% 0.000us 0.000us 3
|
| 4243 |
+
aten::empty_strided 0.37% 17.920us 0.37% 17.920us 5.973us 0.000us 0.00% 0.000us 0.000us 3
|
| 4244 |
+
aten::empty 0.45% 21.731us 0.45% 21.731us 2.415us 0.000us 0.00% 0.000us 0.000us 9
|
| 4245 |
+
cudaFuncSetAttribute 0.08% 3.740us 0.08% 3.740us 1.247us 0.000us 0.00% 0.000us 0.000us 3
|
| 4246 |
+
cudaLaunchKernel 4.99% 242.187us 4.99% 242.187us 80.729us 0.000us 0.00% 0.000us 0.000us 3
|
| 4247 |
+
cudaDeviceSynchronize 61.17% 2.967ms 61.17% 2.967ms 2.967ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4248 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4249 |
+
Self CPU time total: 4.851ms
|
| 4250 |
+
Self CUDA time total: 3.114ms
|
| 4251 |
|
| 4252 |
|
| 4253 |
|
|
|
|
| 4257 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4258 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4259 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4260 |
+
hf_kernels_flash_attn 2.00% 105.522us 34.61% 1.828ms 1.828ms 0.000us 0.00% 4.806ms 4.806ms 1
|
| 4261 |
+
_flash_attn_9e27194::fwd 0.94% 49.622us 32.62% 1.723ms 574.192us 3.597ms 100.00% 4.806ms 1.602ms 3
|
| 4262 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.599ms 100.05% 3.599ms 3.599ms 1
|
| 4263 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.597ms 100.00% 3.597ms 1.199ms 3
|
| 4264 |
+
Activity Buffer Request 27.37% 1.446ms 27.37% 1.446ms 1.446ms 1.209ms 33.59% 1.209ms 1.209ms 1
|
| 4265 |
+
cudaDeviceGetAttribute 0.08% 3.991us 0.08% 3.991us 0.266us 0.000us 0.00% 0.000us 0.000us 15
|
| 4266 |
+
aten::empty_like 0.14% 7.250us 0.47% 24.620us 8.207us 0.000us 0.00% 0.000us 0.000us 3
|
| 4267 |
+
aten::empty_strided 0.33% 17.370us 0.33% 17.370us 5.790us 0.000us 0.00% 0.000us 0.000us 3
|
| 4268 |
+
aten::empty 0.41% 21.681us 0.41% 21.681us 2.409us 0.000us 0.00% 0.000us 0.000us 9
|
| 4269 |
+
cudaFuncSetAttribute 0.07% 3.770us 0.07% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3
|
| 4270 |
+
cudaLaunchKernel 3.28% 173.384us 3.28% 173.384us 57.795us 0.000us 0.00% 0.000us 0.000us 3
|
| 4271 |
+
cudaDeviceSynchronize 65.39% 3.453ms 65.39% 3.453ms 3.453ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4272 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4273 |
+
Self CPU time total: 5.281ms
|
| 4274 |
+
Self CUDA time total: 3.597ms
|
| 4275 |
|
| 4276 |
|
| 4277 |
|
|
|
|
| 4281 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4282 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4283 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4284 |
+
hf_kernels_flash_attn 2.02% 107.892us 33.82% 1.810ms 1.810ms 0.000us 0.00% 4.930ms 4.930ms 1
|
| 4285 |
+
_flash_attn_9e27194::fwd 0.91% 48.918us 31.80% 1.702ms 567.268us 3.687ms 100.00% 4.930ms 1.643ms 3
|
| 4286 |
+
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.689ms 100.04% 3.689ms 3.689ms 1
|
| 4287 |
+
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.687ms 100.00% 3.687ms 1.229ms 3
|
| 4288 |
+
Activity Buffer Request 26.86% 1.437ms 26.86% 1.437ms 1.437ms 1.242ms 33.69% 1.242ms 1.242ms 1
|
| 4289 |
+
cudaDeviceGetAttribute 0.07% 3.881us 0.07% 3.881us 0.259us 0.000us 0.00% 0.000us 0.000us 15
|
| 4290 |
+
aten::empty_like 0.14% 7.591us 0.49% 26.111us 8.704us 0.000us 0.00% 0.000us 0.000us 3
|
| 4291 |
+
aten::empty_strided 0.35% 18.520us 0.35% 18.520us 6.173us 0.000us 0.00% 0.000us 0.000us 3
|
| 4292 |
+
aten::empty 0.39% 20.640us 0.39% 20.640us 2.293us 0.000us 0.00% 0.000us 0.000us 9
|
| 4293 |
+
cudaFuncSetAttribute 0.07% 3.561us 0.07% 3.561us 1.187us 0.000us 0.00% 0.000us 0.000us 3
|
| 4294 |
+
cudaLaunchKernel 3.01% 161.306us 3.01% 161.306us 53.769us 0.000us 0.00% 0.000us 0.000us 3
|
| 4295 |
+
cudaDeviceSynchronize 66.18% 3.542ms 66.18% 3.542ms 3.542ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4296 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4297 |
+
Self CPU time total: 5.351ms
|
| 4298 |
+
Self CUDA time total: 3.687ms
|
| 4299 |
|
| 4300 |
|
| 4301 |
impl wl p50(ms) ok
|
| 4302 |
+
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
|
| 4303 |
+
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
|
| 4304 |
+
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
|
| 4305 |
+
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
|
| 4306 |
+
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.23 True
|
| 4307 |
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
|
| 4308 |
</pre></div>
|
| 4309 |
+
<div class="cell-stderr">
|
| 4310 |
+
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4311 |
+
Fetching 20 files: 10%|█ | 2/20 [00:01<00:15, 1.19it/s]
|
| 4312 |
+
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 11.87it/s]
|
| 4313 |
</div>
|
| 4314 |
<div class="cell-artifacts">
|
| 4315 |
<h4>Artifacts:</h4>
|
| 4316 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
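Note on hf_kernels_flash_attn: this variant fetches a prebuilt kernel from the Hub, which is what the "Fetching 20 files" stderr above records. A hedged sketch of the load path, assuming the kernels library's get_kernel API and the repo id from the cell's HF link; the exact entry point used by cells/benchmark.py may differ, and flash_attn_func is an assumed name:

    import torch
    from kernels import get_kernel

    # Downloads and caches the kernel repo; this is the "Fetching 20 files" step.
    flash = get_kernel("kernels-community/flash-attn")

    # flash_attn_func is assumed to mirror the upstream flash-attn API; the
    # profiler trace shows the underlying op as _flash_attn_9e27194::fwd.
    q = torch.randn(1, 512, 16, 64, device="cuda", dtype=torch.bfloat16)  # (B, S, H, D)
    k, v = torch.randn_like(q), torch.randn_like(q)
    out = flash.flash_attn_func(q, k, v)
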
flash_attn/impls/hf_kernels_flash_attn3.html
CHANGED
|
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
|
|
| 4106 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 4107 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 4108 |
</span> |
|
| 4109 |
- 
Cell: benchmark | 5.…
|
| 4110 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 4111 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 4112 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 4113 |
- 
<a href="https://github.com/huggingface/kernels-…
|
| 4114 |
<a href="https://huggingface.co/kernels-community/flash-attn3" target="_blank" class="hf-btn">🤗 HF</a>
|
| 4115 |
</div>
|
| 4116 |
<div id="code-benchmark" class="cell-code" data-lines="31">
|
|
@@ -4160,19 +4160,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
|
|
| 4160 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4161 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4162 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4163-4175 | 
- 
[old hf_kernels_flash_attn3 profiler table for cuda_attn_L128_bfloat16 (per-op timings and Self CPU/CUDA totals); the removed values are truncated in this diff view] 
| 
| 4176 |
|
| 4177 |
|
| 4178 |
|
|
@@ -4182,19 +4182,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
|
|
| 4182 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4183 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4184 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4185 |
-
hf_kernels_flash_attn3 2.
|
| 4186 |
-
FlashAttnFunc 2.
|
| 4187 |
-
_flash_attn3_48fe103_dirty::fwd 1.
|
| 4188 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4189 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4190 |
-
Activity Buffer Request
|
| 4191 |
-
aten::empty 0.
|
| 4192 |
-
cudaFuncSetAttribute 0.
|
| 4193 |
-
cudaLaunchKernel 0.
|
| 4194 |
-
cudaDeviceSynchronize 58.
|
| 4195 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4196 |
-
Self CPU time total: 4.
|
| 4197 |
-
Self CUDA time total: 2.
|
| 4198 |
|
| 4199 |
|
| 4200 |
|
|
@@ -4204,19 +4204,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
|
|
| 4204 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4205 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4206 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4207 |
-
hf_kernels_flash_attn3 2.
|
| 4208 |
-
FlashAttnFunc 2.
|
| 4209 |
-
_flash_attn3_48fe103_dirty::fwd 1.
|
| 4210 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4211 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4212 |
-
Activity Buffer Request
|
| 4213 |
-
aten::empty 0.
|
| 4214 |
-
cudaFuncSetAttribute 0.
|
| 4215 |
-
cudaLaunchKernel 0.
|
| 4216 |
-
cudaDeviceSynchronize
|
| 4217 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4218 |
-
Self CPU time total: 4.
|
| 4219 |
-
Self CUDA time total: 2.
|
| 4220 |
|
| 4221 |
|
| 4222 |
|
|
@@ -4226,19 +4226,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
|
|
| 4226 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4227 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4228 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4229 |
-
hf_kernels_flash_attn3 2.
|
| 4230 |
-
FlashAttnFunc
|
| 4231 |
-
_flash_attn3_48fe103_dirty::fwd 1.14%
|
| 4232 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4233 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 4234 |
-
Activity Buffer Request 30.
|
| 4235 |
-
aten::empty 0.
|
| 4236 |
-
cudaFuncSetAttribute 0.11% 5.
|
| 4237 |
-
cudaLaunchKernel
|
| 4238 |
-
cudaDeviceSynchronize 57.
|
| 4239 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4240 |
-
Self CPU time total: 4.
|
| 4241 |
-
Self CUDA time total: 2.
|
| 4242 |
|
| 4243 |
|
| 4244 |
|
|
@@ -4248,19 +4248,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
|
|
| 4248 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4249 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4250 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4251 |
-
hf_kernels_flash_attn3 2.
|
| 4252 |
-
FlashAttnFunc 1.
|
| 4253 |
-
_flash_attn3_48fe103_dirty::fwd
|
| 4254 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4255 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
|
| 4256 |
-
Activity Buffer Request 27.
|
| 4257 |
-
aten::empty 0.
|
| 4258 |
-
cudaFuncSetAttribute 0.10% 5.
|
| 4259 |
-
cudaLaunchKernel
|
| 4260 |
-
cudaDeviceSynchronize 62.
|
| 4261 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4262 |
-
Self CPU time total: 5.
|
| 4263 |
-
Self CUDA time total: 3.
|
| 4264 |
|
| 4265 |
|
| 4266 |
|
|
@@ -4270,33 +4270,33 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
|
|
| 4270 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4271 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4272 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4273 |
-
hf_kernels_flash_attn3 2.
|
| 4274 |
-
FlashAttnFunc 1.
|
| 4275 |
-
_flash_attn3_48fe103_dirty::fwd
|
| 4276 |
-
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.
|
| 4277 |
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.468ms 100.00% 3.468ms 1.156ms 3
|
| 4278 |
-
Activity Buffer Request 27.
|
| 4279 |
-
aten::empty 0.
|
| 4280 |
-
cudaFuncSetAttribute 0.10% 5.
|
| 4281 |
-
cudaLaunchKernel
|
| 4282 |
-
cudaDeviceSynchronize 62.
|
| 4283 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4284 |
-
Self CPU time total: 5.
|
| 4285 |
Self CUDA time total: 3.468ms
|
| 4286 |
|
| 4287 |
|
| 4288 |
impl wl p50(ms) ok
|
| 4289 |
-
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.
|
| 4290 |
-
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.
|
| 4291 |
-
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.
|
| 4292 |
-
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.
|
| 4293 |
-
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.
|
| 4294 |
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
|
| 4295 |
</pre></div>
|
| 4296 |
<div class="cell-stderr">
|
| 4297 |
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4298 |
-
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.
|
| 4299 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.
|
| 4300 |
</div>
|
| 4301 |
<div class="cell-artifacts">
|
| 4302 |
<h4>Artifacts:</h4>
|
| 4106 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 4107 |
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 4108 |
</span> |
|
| 4109 |
+
Cell: benchmark | 5.53s
|
| 4110 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 4111 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 4112 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 4113 |
+
<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/hf_kernels_flash_attn3.md" target="_blank" class="github-btn">GitHub</a>
|
| 4114 |
<a href="https://huggingface.co/kernels-community/flash-attn3" target="_blank" class="hf-btn">🤗 HF</a>
|
| 4115 |
</div>
|
| 4116 |
<div id="code-benchmark" class="cell-code" data-lines="31">
|
|
|
|
| 4160 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4161 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4162 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4163 |
+
hf_kernels_flash_attn3 3.85% 171.193us 46.01% 2.045ms 2.045ms 0.000us 0.00% 3.614ms 3.614ms 1
|
| 4164 |
+
FlashAttnFunc 3.07% 136.295us 42.15% 1.874ms 624.570us 0.000us 0.00% 3.614ms 1.205ms 3
|
| 4165 |
+
_flash_attn3_48fe103_dirty::fwd 1.94% 86.341us 39.09% 1.737ms 579.138us 2.720ms 100.00% 3.614ms 1.205ms 3
|
| 4166 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.722ms 100.05% 2.722ms 2.722ms 1
|
| 4167 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.720ms 100.00% 2.720ms 906.698us 3
|
| 4168 |
+
Activity Buffer Request 34.72% 1.543ms 34.72% 1.543ms 1.543ms 893.600us 32.85% 893.600us 893.600us 1
|
| 4169 |
+
aten::empty 1.07% 47.441us 1.07% 47.441us 7.907us 0.000us 0.00% 0.000us 0.000us 6
|
| 4170 |
+
cudaFuncSetAttribute 0.31% 13.761us 0.31% 13.761us 4.587us 0.000us 0.00% 0.000us 0.000us 3
|
| 4171 |
+
cudaLaunchKernel 1.05% 46.772us 1.05% 46.772us 15.591us 0.000us 0.00% 0.000us 0.000us 3
|
| 4172 |
+
cudaDeviceSynchronize 53.99% 2.400ms 53.99% 2.400ms 2.400ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4173 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4174 |
+
Self CPU time total: 4.445ms
|
| 4175 |
+
Self CUDA time total: 2.720ms
|
| 4176 |
|
| 4177 |
|
| 4178 |
|
|
|
|
| 4182 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4183 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4184 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4185 |
+
hf_kernels_flash_attn3 2.41% 104.370us 41.13% 1.784ms 1.784ms 0.000us 0.00% 3.700ms 3.700ms 1
|
| 4186 |
+
FlashAttnFunc 2.00% 86.685us 38.73% 1.679ms 559.738us 0.000us 0.00% 3.700ms 1.233ms 3
|
| 4187 |
+
_flash_attn3_48fe103_dirty::fwd 1.21% 52.631us 36.73% 1.593ms 530.843us 2.768ms 100.00% 3.700ms 1.233ms 3
|
| 4188 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.769ms 100.06% 2.769ms 2.769ms 1
|
| 4189 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.768ms 100.00% 2.768ms 922.559us 3
|
| 4190 |
+
Activity Buffer Request 34.10% 1.479ms 34.10% 1.479ms 1.479ms 932.127us 33.68% 932.127us 932.127us 1
|
| 4191 |
+
aten::empty 0.60% 25.981us 0.60% 25.981us 4.330us 0.000us 0.00% 0.000us 0.000us 6
|
| 4192 |
+
cudaFuncSetAttribute 0.12% 5.050us 0.12% 5.050us 1.683us 0.000us 0.00% 0.000us 0.000us 3
|
| 4193 |
+
cudaLaunchKernel 0.70% 30.140us 0.70% 30.140us 10.047us 0.000us 0.00% 0.000us 0.000us 3
|
| 4194 |
+
cudaDeviceSynchronize 58.87% 2.553ms 58.87% 2.553ms 2.553ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4195 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4196 |
+
Self CPU time total: 4.336ms
|
| 4197 |
+
Self CUDA time total: 2.768ms
|
| 4198 |
|
| 4199 |
|
| 4200 |
|
|
|
|
| 4204 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4205 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4206 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4207 |
+
hf_kernels_flash_attn3 2.29% 102.411us 40.10% 1.791ms 1.791ms 0.000us 0.00% 3.875ms 3.875ms 1
|
| 4208 |
+
FlashAttnFunc 2.01% 89.903us 37.81% 1.688ms 562.801us 0.000us 0.00% 3.875ms 1.292ms 3
|
| 4209 |
+
_flash_attn3_48fe103_dirty::fwd 1.18% 52.613us 35.79% 1.599ms 532.834us 2.892ms 100.00% 3.875ms 1.292ms 3
|
| 4210 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.893ms 100.05% 2.893ms 2.893ms 1
|
| 4211 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.00% 2.892ms 963.972us 3
|
| 4212 |
+
Activity Buffer Request 33.24% 1.485ms 33.24% 1.485ms 1.485ms 983.097us 33.99% 983.097us 983.097us 1
|
| 4213 |
+
aten::empty 0.58% 25.770us 0.58% 25.770us 4.295us 0.000us 0.00% 0.000us 0.000us 6
|
| 4214 |
+
cudaFuncSetAttribute 0.11% 4.820us 0.11% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3
|
| 4215 |
+
cudaLaunchKernel 0.69% 30.740us 0.69% 30.740us 10.247us 0.000us 0.00% 0.000us 0.000us 3
|
| 4216 |
+
cudaDeviceSynchronize 59.90% 2.675ms 59.90% 2.675ms 2.675ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4217 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4218 |
+
Self CPU time total: 4.466ms
|
| 4219 |
+
Self CUDA time total: 2.892ms
|
| 4220 |
|
| 4221 |
|
| 4222 |
|
|
|
|
| 4226 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4227 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4228 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4229 |
+
hf_kernels_flash_attn3 2.68% 125.944us 42.11% 1.982ms 1.982ms 0.000us 0.00% 3.932ms 3.932ms 1
|
| 4230 |
+
FlashAttnFunc 1.98% 92.983us 39.44% 1.856ms 618.639us 0.000us 0.00% 3.932ms 1.311ms 3
|
| 4231 |
+
_flash_attn3_48fe103_dirty::fwd 1.14% 53.661us 37.46% 1.763ms 587.645us 2.953ms 100.00% 3.932ms 1.311ms 3
|
| 4232 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.954ms 100.06% 2.954ms 2.954ms 1
|
| 4233 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.953ms 100.00% 2.953ms 984.176us 3
|
| 4234 |
+
Activity Buffer Request 30.48% 1.434ms 30.48% 1.434ms 1.434ms 979.803us 33.19% 979.803us 979.803us 1
|
| 4235 |
+
aten::empty 0.58% 27.450us 0.58% 27.450us 4.575us 0.000us 0.00% 0.000us 0.000us 6
|
| 4236 |
+
cudaFuncSetAttribute 0.11% 5.150us 0.11% 5.150us 1.717us 0.000us 0.00% 0.000us 0.000us 3
|
| 4237 |
+
cudaLaunchKernel 5.15% 242.396us 5.15% 242.396us 80.799us 0.000us 0.00% 0.000us 0.000us 3
|
| 4238 |
+
cudaDeviceSynchronize 57.89% 2.724ms 57.89% 2.724ms 2.724ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4239 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4240 |
+
Self CPU time total: 4.706ms
|
| 4241 |
+
Self CUDA time total: 2.953ms
|
| 4242 |
|
| 4243 |
|
| 4244 |
|
|
|
|
| 4248 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4249 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4250 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4251 |
+
hf_kernels_flash_attn3 2.36% 122.892us 37.59% 1.960ms 1.960ms 0.000us 0.00% 4.622ms 4.622ms 1
|
| 4252 |
+
FlashAttnFunc 1.74% 90.533us 35.23% 1.837ms 612.429us 0.000us 0.00% 4.622ms 1.541ms 3
|
| 4253 |
+
_flash_attn3_48fe103_dirty::fwd 0.97% 50.750us 33.49% 1.747ms 582.252us 3.470ms 100.00% 4.622ms 1.541ms 3
|
| 4254 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.472ms 100.05% 3.472ms 3.472ms 1
|
| 4255 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.470ms 100.00% 3.470ms 1.157ms 3
|
| 4256 |
+
Activity Buffer Request 27.49% 1.433ms 27.49% 1.433ms 1.433ms 1.152ms 33.20% 1.152ms 1.152ms 1
|
| 4257 |
+
aten::empty 0.51% 26.592us 0.51% 26.592us 4.432us 0.000us 0.00% 0.000us 0.000us 6
|
| 4258 |
+
cudaFuncSetAttribute 0.10% 5.060us 0.10% 5.060us 1.687us 0.000us 0.00% 0.000us 0.000us 3
|
| 4259 |
+
cudaLaunchKernel 4.43% 230.856us 4.43% 230.856us 76.952us 0.000us 0.00% 0.000us 0.000us 3
|
| 4260 |
+
cudaDeviceSynchronize 62.41% 3.255ms 62.41% 3.255ms 3.255ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4261 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4262 |
+
Self CPU time total: 5.215ms
|
| 4263 |
+
Self CUDA time total: 3.470ms
|
| 4264 |
|
| 4265 |
|
| 4266 |
|
|
|
|
| 4270 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4271 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4272 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4273 |
+
hf_kernels_flash_attn3 2.32% 120.892us 37.51% 1.951ms 1.951ms 0.000us 0.00% 4.639ms 4.639ms 1
|
| 4274 |
+
FlashAttnFunc 1.74% 90.773us 35.18% 1.830ms 610.133us 0.000us 0.00% 4.639ms 1.546ms 3
|
| 4275 |
+
_flash_attn3_48fe103_dirty::fwd 0.99% 51.351us 33.44% 1.740ms 579.875us 3.468ms 100.00% 4.639ms 1.546ms 3
|
| 4276 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.05% 3.469ms 3.469ms 1
|
| 4277 |
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.468ms 100.00% 3.468ms 1.156ms 3
|
| 4278 |
+
Activity Buffer Request 27.26% 1.418ms 27.26% 1.418ms 1.418ms 1.172ms 33.79% 1.172ms 1.172ms 1
|
| 4279 |
+
aten::empty 0.51% 26.560us 0.51% 26.560us 4.427us 0.000us 0.00% 0.000us 0.000us 6
|
| 4280 |
+
cudaFuncSetAttribute 0.10% 5.101us 0.10% 5.101us 1.700us 0.000us 0.00% 0.000us 0.000us 3
|
| 4281 |
+
cudaLaunchKernel 4.58% 238.367us 4.58% 238.367us 79.456us 0.000us 0.00% 0.000us 0.000us 3
|
| 4282 |
+
cudaDeviceSynchronize 62.49% 3.251ms 62.49% 3.251ms 3.251ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4283 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4284 |
+
Self CPU time total: 5.202ms
|
| 4285 |
Self CUDA time total: 3.468ms
|
| 4286 |
|
| 4287 |
|
| 4288 |
impl wl p50(ms) ok
|
| 4289 |
+
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
|
| 4290 |
+
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
|
| 4291 |
+
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
|
| 4292 |
+
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.03 True
|
| 4293 |
+
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
|
| 4294 |
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
|
| 4295 |
</pre></div>
|
| 4296 |
<div class="cell-stderr">
|
| 4297 |
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4298 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.42it/s]
|
| 4299 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.84it/s]
|
| 4300 |
</div>
|
| 4301 |
<div class="cell-artifacts">
|
| 4302 |
<h4>Artifacts:</h4>
|
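
[Editorial note] The hf_kernels_flash_attn3 rows above (FlashAttnFunc, _flash_attn3_48fe103_dirty::fwd) come from a kernel fetched off the Hub; the "Fetching 4 files" stderr is that download. A minimal sketch of loading it with the kernels package; the flash_attn_func attribute name and its signature are assumptions inferred from the trace rows, not a documented contract:

    import torch
    from kernels import get_kernel

    flash_attn3 = get_kernel("kernels-community/flash-attn3")

    q = torch.randn(1, 512, 16, 64, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)
    # assumed entry point; the module may also return auxiliary outputs (e.g. LSE)
    out = flash_attn3.flash_attn_func(q, k, v, causal=False)
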
flash_attn/impls/mem_efficient_attention.html
CHANGED
|
@@ -4110,7 +4110,7 @@ Cell: benchmark | 3.94s
|
|
| 4110 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 4111 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 4112 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 4113 |
-
<a href="https://github.com/huggingface/kernels-
|
| 4114 |
</div>
|
| 4115 |
<div id="code-benchmark" class="cell-code" data-lines="31">
|
| 4116 |
<div class="code-wrap">
|
|
@@ -4159,28 +4159,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
|
|
| 4159 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4160 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4161 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4162 |
-
torch_mem_eff 5.
|
| 4163 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4164 |
-
aten::scaled_dot_product_attention 0.
|
| 4165 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4166 |
-
aten::_efficient_attention_forward 0.
|
| 4167 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4168 |
-
aten::contiguous 0.18% 12.
|
| 4169 |
-
aten::clone 0.46%
|
| 4170 |
-
aten::copy_ 1.
|
| 4171 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4172 |
-
Activity Buffer Request 20.
|
| 4173 |
-
aten::transpose 0.
|
| 4174 |
-
aten::as_strided 0.
|
| 4175 |
-
aten::empty_like 0.25% 17.
|
| 4176 |
-
aten::empty 1.
|
| 4177 |
-
cudaLaunchKernel 1.
|
| 4178 |
-
cudaStreamIsCapturing 0.
|
| 4179 |
-
cudaFuncSetAttribute 0.
|
| 4180 |
-
cudaDeviceSynchronize
|
| 4181 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4182 |
-
Self CPU time total:
|
| 4183 |
-
Self CUDA time total: 5.
|
| 4184 |
|
| 4185 |
|
| 4186 |
|
|
@@ -4190,28 +4190,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
|
|
| 4190 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4191 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4192 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4193 |
-
torch_mem_eff 3.
|
| 4194 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4195 |
-
aten::scaled_dot_product_attention 0.
|
| 4196 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4197 |
-
aten::_efficient_attention_forward 0.
|
| 4198 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4199 |
-
aten::contiguous 0.
|
| 4200 |
-
aten::clone 0.
|
| 4201 |
-
aten::copy_ 0.
|
| 4202 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.
|
| 4203 |
-
Activity Buffer Request 19.
|
| 4204 |
-
aten::transpose 0.
|
| 4205 |
-
aten::as_strided 0.
|
| 4206 |
-
aten::empty_like 0.
|
| 4207 |
-
aten::empty
|
| 4208 |
-
cudaLaunchKernel 1.
|
| 4209 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4210 |
-
cudaFuncSetAttribute 0.04% 3.
|
| 4211 |
-
cudaDeviceSynchronize
|
| 4212 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4213 |
-
Self CPU time total: 7.
|
| 4214 |
-
Self CUDA time total: 5.
|
| 4215 |
|
| 4216 |
|
| 4217 |
|
|
@@ -4221,28 +4221,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
|
|
| 4221 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4222 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4223 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4224 |
-
torch_mem_eff 3.
|
| 4225 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4226 |
-
aten::scaled_dot_product_attention 0.
|
| 4227 |
-
aten::_scaled_dot_product_efficient_attention 0.25%
|
| 4228 |
-
aten::_efficient_attention_forward 0.
|
| 4229 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4230 |
-
aten::contiguous 0.
|
| 4231 |
-
aten::clone 0.29%
|
| 4232 |
-
aten::copy_ 0.
|
| 4233 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4234 |
-
Activity Buffer Request 19.
|
| 4235 |
-
aten::transpose 0.
|
| 4236 |
-
aten::as_strided 0.23% 17.
|
| 4237 |
-
aten::empty_like 0.
|
| 4238 |
-
aten::empty 0.
|
| 4239 |
-
cudaLaunchKernel 1.
|
| 4240 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4241 |
-
cudaFuncSetAttribute 0.04% 3.
|
| 4242 |
-
cudaDeviceSynchronize
|
| 4243 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4244 |
-
Self CPU time total: 7.
|
| 4245 |
-
Self CUDA time total:
|
| 4246 |
|
| 4247 |
|
| 4248 |
|
|
@@ -4252,28 +4252,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
|
|
| 4252 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4253 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4254 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4255 |
-
torch_mem_eff 3.
|
| 4256 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4257 |
-
aten::scaled_dot_product_attention 0.
|
| 4258 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4259 |
-
aten::_efficient_attention_forward 0.
|
| 4260 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4261 |
-
aten::contiguous 0.10% 7.
|
| 4262 |
-
aten::clone 0.
|
| 4263 |
-
aten::copy_ 0.
|
| 4264 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 653.
|
| 4265 |
-
Activity Buffer Request 18.
|
| 4266 |
-
aten::transpose 0.
|
| 4267 |
-
aten::as_strided 0.
|
| 4268 |
-
aten::empty_like 0.16% 12.
|
| 4269 |
-
aten::empty 0.
|
| 4270 |
-
cudaLaunchKernel
|
| 4271 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4272 |
-
cudaFuncSetAttribute 0.
|
| 4273 |
-
cudaDeviceSynchronize
|
| 4274 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4275 |
-
Self CPU time total: 7.
|
| 4276 |
-
Self CUDA time total:
|
| 4277 |
|
| 4278 |
|
| 4279 |
|
|
@@ -4283,28 +4283,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
|
|
| 4283 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4284 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4285 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4286 |
-
torch_mem_eff 3.
|
| 4287 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4288 |
-
aten::scaled_dot_product_attention 0.
|
| 4289 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4290 |
-
aten::_efficient_attention_forward 0.
|
| 4291 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4292 |
-
aten::contiguous 0.10% 7.
|
| 4293 |
-
aten::clone 0.
|
| 4294 |
-
aten::copy_ 0.
|
| 4295 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 664.
|
| 4296 |
-
Activity Buffer Request
|
| 4297 |
-
aten::transpose 0.
|
| 4298 |
-
aten::as_strided 0.
|
| 4299 |
-
aten::empty_like 0.15%
|
| 4300 |
-
aten::empty 0.
|
| 4301 |
-
cudaLaunchKernel 3.
|
| 4302 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4303 |
-
cudaFuncSetAttribute 0.
|
| 4304 |
-
cudaDeviceSynchronize
|
| 4305 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4306 |
-
Self CPU time total:
|
| 4307 |
-
Self CUDA time total: 6.
|
| 4308 |
|
| 4309 |
|
| 4310 |
|
|
@@ -4314,37 +4314,37 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
|
|
| 4314 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4315 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4316 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4317 |
-
torch_mem_eff
|
| 4318 |
-
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4319 |
-
aten::scaled_dot_product_attention 0.
|
| 4320 |
-
aten::_scaled_dot_product_efficient_attention 0.
|
| 4321 |
-
aten::_efficient_attention_forward 0.
|
| 4322 |
-
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4323 |
-
aten::contiguous 0.
|
| 4324 |
-
aten::clone 0.
|
| 4325 |
-
aten::copy_ 0.
|
| 4326 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4327 |
-
Activity Buffer Request 17.
|
| 4328 |
-
aten::transpose 0.
|
| 4329 |
-
aten::as_strided 0.
|
| 4330 |
-
aten::empty_like 0.15% 12.
|
| 4331 |
-
aten::empty 0.
|
| 4332 |
-
cudaLaunchKernel
|
| 4333 |
-
cudaStreamIsCapturing 0.03% 2.
|
| 4334 |
-
cudaFuncSetAttribute 0.04% 3.
|
| 4335 |
-
cudaDeviceSynchronize
|
| 4336 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4337 |
-
Self CPU time total: 8.
|
| 4338 |
-
Self CUDA time total: 6.
|
| 4339 |
|
| 4340 |
|
| 4341 |
impl wl p50(ms) ok
|
| 4342 |
-
torch_mem_eff cuda_attn_L128_bfloat16 1.
|
| 4343 |
-
torch_mem_eff cuda_attn_L256_bfloat16 1.
|
| 4344 |
-
torch_mem_eff cuda_attn_L320_bfloat16 1.
|
| 4345 |
-
torch_mem_eff cuda_attn_L384_bfloat16
|
| 4346 |
-
torch_mem_eff cuda_attn_L448_bfloat16 2.
|
| 4347 |
-
torch_mem_eff cuda_attn_L512_bfloat16 2.
|
| 4348 |
</pre></div>
|
| 4349 |
<div class="cell-artifacts">
|
| 4350 |
<h4>Artifacts:</h4>
|
|
|
|
| 4110 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 4111 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 4112 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 4113 |
+
<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/mem_efficient_attention.md" target="_blank" class="github-btn">GitHub</a>
|
| 4114 |
</div>
|
| 4115 |
<div id="code-benchmark" class="cell-code" data-lines="31">
|
| 4116 |
<div class="code-wrap">
|
|
|
|
| 4159 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4160 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4161 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4162 |
+
torch_mem_eff 5.14% 365.276us 32.53% 2.313ms 2.313ms 0.000us 0.00% 5.511ms 5.511ms 1
|
| 4163 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.492ms 100.58% 5.492ms 5.492ms 1
|
| 4164 |
+
aten::scaled_dot_product_attention 0.43% 30.401us 2.47% 175.534us 58.511us 0.000us 0.00% 4.841ms 1.614ms 3
|
| 4165 |
+
aten::_scaled_dot_product_efficient_attention 0.33% 23.489us 2.04% 145.133us 48.378us 0.000us 0.00% 4.841ms 1.614ms 3
|
| 4166 |
+
aten::_efficient_attention_forward 0.51% 36.572us 1.40% 99.733us 33.244us 4.841ms 88.65% 4.841ms 1.614ms 3
|
| 4167 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.841ms 88.65% 4.841ms 1.614ms 3
|
| 4168 |
+
aten::contiguous 0.18% 12.851us 23.99% 1.706ms 189.523us 0.000us 0.00% 670.241us 74.471us 9
|
| 4169 |
+
aten::clone 0.46% 32.742us 23.80% 1.693ms 188.095us 0.000us 0.00% 670.241us 74.471us 9
|
| 4170 |
+
aten::copy_ 1.05% 74.801us 22.33% 1.588ms 176.415us 619.776us 11.35% 670.241us 74.471us 9
|
| 4171 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 619.776us 11.35% 619.776us 68.864us 9
|
| 4172 |
+
Activity Buffer Request 20.17% 1.434ms 20.17% 1.434ms 1.434ms 50.465us 0.92% 50.465us 50.465us 1
|
| 4173 |
+
aten::transpose 0.93% 66.224us 1.25% 88.644us 3.693us 0.000us 0.00% 0.000us 0.000us 24
|
| 4174 |
+
aten::as_strided 0.32% 22.420us 0.32% 22.420us 0.934us 0.000us 0.00% 0.000us 0.000us 24
|
| 4175 |
+
aten::empty_like 0.25% 17.919us 1.02% 72.382us 8.042us 0.000us 0.00% 0.000us 0.000us 9
|
| 4176 |
+
aten::empty 1.14% 81.114us 1.14% 81.114us 3.863us 0.000us 0.00% 0.000us 0.000us 21
|
| 4177 |
+
cudaLaunchKernel 1.46% 103.973us 1.46% 103.973us 8.664us 0.000us 0.00% 0.000us 0.000us 12
|
| 4178 |
+
cudaStreamIsCapturing 0.04% 2.960us 0.04% 2.960us 0.987us 0.000us 0.00% 0.000us 0.000us 3
|
| 4179 |
+
cudaFuncSetAttribute 0.12% 8.310us 0.12% 8.310us 2.770us 0.000us 0.00% 0.000us 0.000us 3
|
| 4180 |
+
cudaDeviceSynchronize 67.47% 4.798ms 67.47% 4.798ms 4.798ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4181 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4182 |
+
Self CPU time total: 7.111ms
|
| 4183 |
+
Self CUDA time total: 5.460ms
|
| 4184 |
|
| 4185 |
|
| 4186 |
|
|
|
|
| 4190 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4191 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4192 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4193 |
+
torch_mem_eff 3.28% 242.746us 28.00% 2.075ms 2.075ms 0.000us 0.00% 5.933ms 5.933ms 1
|
| 4194 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.886ms 100.14% 5.886ms 5.886ms 1
|
| 4195 |
+
aten::scaled_dot_product_attention 0.25% 18.240us 1.89% 140.073us 46.691us 0.000us 0.00% 5.241ms 1.747ms 3
|
| 4196 |
+
aten::_scaled_dot_product_efficient_attention 0.25% 18.689us 1.64% 121.833us 40.611us 0.000us 0.00% 5.241ms 1.747ms 3
|
| 4197 |
+
aten::_efficient_attention_forward 0.38% 28.462us 1.09% 81.063us 27.021us 5.241ms 89.17% 5.241ms 1.747ms 3
|
| 4198 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.241ms 89.17% 5.241ms 1.747ms 3
|
| 4199 |
+
aten::contiguous 0.10% 7.041us 22.26% 1.650ms 183.285us 0.000us 0.00% 691.103us 76.789us 9
|
| 4200 |
+
aten::clone 0.29% 21.342us 22.17% 1.643ms 182.503us 0.000us 0.00% 691.103us 76.789us 9
|
| 4201 |
+
aten::copy_ 0.86% 63.451us 21.24% 1.574ms 174.872us 636.671us 10.83% 691.103us 76.789us 9
|
| 4202 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.671us 10.83% 636.671us 70.741us 9
|
| 4203 |
+
Activity Buffer Request 19.50% 1.445ms 19.50% 1.445ms 1.445ms 54.432us 0.93% 54.432us 54.432us 1
|
| 4204 |
+
aten::transpose 0.64% 47.650us 0.87% 64.701us 2.696us 0.000us 0.00% 0.000us 0.000us 24
|
| 4205 |
+
aten::as_strided 0.23% 17.051us 0.23% 17.051us 0.710us 0.000us 0.00% 0.000us 0.000us 24
|
| 4206 |
+
aten::empty_like 0.16% 11.589us 0.64% 47.330us 5.259us 0.000us 0.00% 0.000us 0.000us 9
|
| 4207 |
+
aten::empty 0.82% 60.521us 0.82% 60.521us 2.882us 0.000us 0.00% 0.000us 0.000us 21
|
| 4208 |
+
cudaLaunchKernel 1.19% 88.044us 1.19% 88.044us 7.337us 0.000us 0.00% 0.000us 0.000us 12
|
| 4209 |
+
cudaStreamIsCapturing 0.03% 2.420us 0.03% 2.420us 0.807us 0.000us 0.00% 0.000us 0.000us 3
|
| 4210 |
+
cudaFuncSetAttribute 0.04% 3.030us 0.04% 3.030us 1.010us 0.000us 0.00% 0.000us 0.000us 3
|
| 4211 |
+
cudaDeviceSynchronize 72.00% 5.335ms 72.00% 5.335ms 5.335ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4212 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4213 |
+
Self CPU time total: 7.410ms
|
| 4214 |
+
Self CUDA time total: 5.878ms
|
| 4215 |
|
| 4216 |
|
| 4217 |
|
|
|
|
| 4221 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4222 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4223 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4224 |
+
torch_mem_eff 3.21% 244.055us 27.47% 2.092ms 2.092ms 0.000us 0.00% 6.130ms 6.130ms 1
|
| 4225 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.080ms 100.14% 6.080ms 6.080ms 1
|
| 4226 |
+
aten::scaled_dot_product_attention 0.23% 17.641us 1.86% 141.944us 47.315us 0.000us 0.00% 5.414ms 1.805ms 3
|
| 4227 |
+
aten::_scaled_dot_product_efficient_attention 0.25% 19.359us 1.63% 124.303us 41.434us 0.000us 0.00% 5.414ms 1.805ms 3
|
| 4228 |
+
aten::_efficient_attention_forward 0.37% 28.219us 1.06% 80.592us 26.864us 5.414ms 89.17% 5.414ms 1.805ms 3
|
| 4229 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.414ms 89.17% 5.414ms 1.805ms 3
|
| 4230 |
+
aten::contiguous 0.11% 8.060us 21.81% 1.661ms 184.510us 0.000us 0.00% 716.192us 79.577us 9
|
| 4231 |
+
aten::clone 0.29% 22.431us 21.70% 1.653ms 183.615us 0.000us 0.00% 716.192us 79.577us 9
|
| 4232 |
+
aten::copy_ 0.81% 61.641us 20.75% 1.580ms 175.564us 657.728us 10.83% 716.192us 79.577us 9
|
| 4233 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 657.728us 10.83% 657.728us 73.081us 9
|
| 4234 |
+
Activity Buffer Request 19.08% 1.453ms 19.08% 1.453ms 1.453ms 58.464us 0.96% 58.464us 58.464us 1
|
| 4235 |
+
aten::transpose 0.69% 52.203us 0.92% 69.763us 2.907us 0.000us 0.00% 0.000us 0.000us 24
|
| 4236 |
+
aten::as_strided 0.23% 17.560us 0.23% 17.560us 0.732us 0.000us 0.00% 0.000us 0.000us 24
|
| 4237 |
+
aten::empty_like 0.15% 11.581us 0.66% 50.023us 5.558us 0.000us 0.00% 0.000us 0.000us 9
|
| 4238 |
+
aten::empty 0.84% 63.785us 0.84% 63.785us 3.037us 0.000us 0.00% 0.000us 0.000us 21
|
| 4239 |
+
cudaLaunchKernel 1.14% 86.832us 1.14% 86.832us 7.236us 0.000us 0.00% 0.000us 0.000us 12
|
| 4240 |
+
cudaStreamIsCapturing 0.03% 2.250us 0.03% 2.250us 0.750us 0.000us 0.00% 0.000us 0.000us 3
|
| 4241 |
+
cudaFuncSetAttribute 0.04% 3.260us 0.04% 3.260us 1.087us 0.000us 0.00% 0.000us 0.000us 3
|
| 4242 |
+
cudaDeviceSynchronize 72.53% 5.522ms 72.53% 5.522ms 5.522ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4243 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4244 |
+
Self CPU time total: 7.614ms
|
| 4245 |
+
Self CUDA time total: 6.072ms
|
| 4246 |
|
| 4247 |
|
| 4248 |
|
|
|
|
| 4252 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4253 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4254 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4255 |
+
torch_mem_eff 3.16% 248.365us 29.29% 2.300ms 2.300ms 0.000us 0.00% 6.163ms 6.163ms 1
|
| 4256 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.114ms 100.14% 6.114ms 6.114ms 1
|
| 4257 |
+
aten::scaled_dot_product_attention 0.24% 19.232us 1.82% 142.774us 47.591us 0.000us 0.00% 5.452ms 1.817ms 3
|
| 4258 |
+
aten::_scaled_dot_product_efficient_attention 0.25% 19.461us 1.57% 123.542us 41.181us 0.000us 0.00% 5.452ms 1.817ms 3
|
| 4259 |
+
aten::_efficient_attention_forward 0.37% 29.029us 1.03% 80.672us 26.891us 5.452ms 89.29% 5.452ms 1.817ms 3
|
| 4260 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.452ms 89.29% 5.452ms 1.817ms 3
|
| 4261 |
+
aten::contiguous 0.10% 7.931us 23.78% 1.867ms 207.435us 0.000us 0.00% 711.072us 79.008us 9
|
| 4262 |
+
aten::clone 0.30% 23.532us 23.68% 1.859ms 206.554us 0.000us 0.00% 711.072us 79.008us 9
|
| 4263 |
+
aten::copy_ 0.81% 63.779us 22.73% 1.785ms 198.306us 653.792us 10.71% 711.072us 79.008us 9
|
| 4264 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 653.792us 10.71% 653.792us 72.644us 9
|
| 4265 |
+
Activity Buffer Request 18.59% 1.459ms 18.59% 1.459ms 1.459ms 57.280us 0.94% 57.280us 57.280us 1
|
| 4266 |
+
aten::transpose 0.62% 48.610us 0.83% 65.130us 2.714us 0.000us 0.00% 0.000us 0.000us 24
|
| 4267 |
+
aten::as_strided 0.21% 16.520us 0.21% 16.520us 0.688us 0.000us 0.00% 0.000us 0.000us 24
|
| 4268 |
+
aten::empty_like 0.16% 12.281us 0.65% 50.702us 5.634us 0.000us 0.00% 0.000us 0.000us 9
|
| 4269 |
+
aten::empty 0.80% 62.502us 0.80% 62.502us 2.976us 0.000us 0.00% 0.000us 0.000us 21
|
| 4270 |
+
cudaLaunchKernel 3.60% 282.729us 3.60% 282.729us 23.561us 0.000us 0.00% 0.000us 0.000us 12
|
| 4271 |
+
cudaStreamIsCapturing 0.03% 2.471us 0.03% 2.471us 0.824us 0.000us 0.00% 0.000us 0.000us 3
|
| 4272 |
+
cudaFuncSetAttribute 0.05% 4.120us 0.05% 4.120us 1.373us 0.000us 0.00% 0.000us 0.000us 3
|
| 4273 |
+
cudaDeviceSynchronize 70.71% 5.551ms 70.71% 5.551ms 5.551ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4274 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4275 |
+
Self CPU time total: 7.851ms
|
| 4276 |
+
Self CUDA time total: 6.106ms
|
| 4277 |
|
| 4278 |
|
| 4279 |
|
|
|
|
| 4283 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4284 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4285 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4286 |
+
torch_mem_eff 3.01% 243.675us 28.03% 2.272ms 2.272ms 0.000us 0.00% 6.451ms 6.451ms 1
|
| 4287 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.399ms 100.13% 6.399ms 6.399ms 1
|
| 4288 |
+
aten::scaled_dot_product_attention 0.23% 18.671us 1.77% 143.224us 47.741us 0.000us 0.00% 5.726ms 1.909ms 3
|
| 4289 |
+
aten::_scaled_dot_product_efficient_attention 0.24% 19.652us 1.54% 124.553us 41.518us 0.000us 0.00% 5.726ms 1.909ms 3
|
| 4290 |
+
aten::_efficient_attention_forward 0.35% 28.317us 0.99% 80.642us 26.881us 5.726ms 89.60% 5.726ms 1.909ms 3
|
| 4291 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.726ms 89.60% 5.726ms 1.909ms 3
|
| 4292 |
+
aten::contiguous 0.10% 7.791us 22.70% 1.840ms 204.460us 0.000us 0.00% 725.025us 80.558us 9
|
| 4293 |
+
aten::clone 0.29% 23.489us 22.61% 1.832ms 203.594us 0.000us 0.00% 725.025us 80.558us 9
|
| 4294 |
+
aten::copy_ 0.81% 65.293us 21.68% 1.757ms 195.223us 664.641us 10.40% 725.025us 80.558us 9
|
| 4295 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 664.641us 10.40% 664.641us 73.849us 9
|
| 4296 |
+
Activity Buffer Request 17.77% 1.440ms 17.77% 1.440ms 1.440ms 60.384us 0.94% 60.384us 60.384us 1
|
| 4297 |
+
aten::transpose 0.63% 51.151us 0.85% 69.251us 2.885us 0.000us 0.00% 0.000us 0.000us 24
|
| 4298 |
+
aten::as_strided 0.22% 18.100us 0.22% 18.100us 0.754us 0.000us 0.00% 0.000us 0.000us 24
|
| 4299 |
+
aten::empty_like 0.15% 11.960us 0.64% 51.852us 5.761us 0.000us 0.00% 0.000us 0.000us 9
|
| 4300 |
+
aten::empty 0.79% 64.314us 0.79% 64.314us 3.063us 0.000us 0.00% 0.000us 0.000us 21
|
| 4301 |
+
cudaLaunchKernel 3.36% 272.117us 3.36% 272.117us 22.676us 0.000us 0.00% 0.000us 0.000us 12
|
| 4302 |
+
cudaStreamIsCapturing 0.03% 2.500us 0.03% 2.500us 0.833us 0.000us 0.00% 0.000us 0.000us 3
|
| 4303 |
+
cudaFuncSetAttribute 0.06% 4.532us 0.06% 4.532us 1.511us 0.000us 0.00% 0.000us 0.000us 3
|
| 4304 |
+
cudaDeviceSynchronize 71.97% 5.833ms 71.97% 5.833ms 5.833ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4305 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4306 |
+
Self CPU time total: 8.105ms
|
| 4307 |
+
Self CUDA time total: 6.391ms
|
| 4308 |
|
| 4309 |
|
| 4310 |
|
|
|
|
| 4314 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4315 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4316 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4317 |
+
torch_mem_eff 2.88% 242.135us 27.00% 2.269ms 2.269ms 0.000us 0.00% 6.759ms 6.759ms 1
|
| 4318 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.705ms 100.12% 6.705ms 6.705ms 1
|
| 4319 |
+
aten::scaled_dot_product_attention 0.21% 17.851us 1.72% 144.884us 48.295us 0.000us 0.00% 6.024ms 2.008ms 3
|
| 4320 |
+
aten::_scaled_dot_product_efficient_attention 0.23% 19.591us 1.51% 127.033us 42.344us 0.000us 0.00% 6.024ms 2.008ms 3
|
| 4321 |
+
aten::_efficient_attention_forward 0.34% 28.520us 0.97% 81.532us 27.177us 6.024ms 89.96% 6.024ms 2.008ms 3
|
| 4322 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.024ms 89.96% 6.024ms 2.008ms 3
|
| 4323 |
+
aten::contiguous 0.10% 8.099us 21.87% 1.838ms 204.242us 0.000us 0.00% 734.178us 81.575us 9
|
| 4324 |
+
aten::clone 0.28% 23.122us 21.78% 1.830ms 203.342us 0.000us 0.00% 734.178us 81.575us 9
|
| 4325 |
+
aten::copy_ 0.74% 62.180us 20.86% 1.753ms 194.799us 672.322us 10.04% 734.178us 81.575us 9
|
| 4326 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 672.322us 10.04% 672.322us 74.702us 9
|
| 4327 |
+
Activity Buffer Request 17.19% 1.445ms 17.19% 1.445ms 1.445ms 61.856us 0.92% 61.856us 61.856us 1
|
| 4328 |
+
aten::transpose 0.62% 52.351us 0.83% 70.022us 2.918us 0.000us 0.00% 0.000us 0.000us 24
|
| 4329 |
+
aten::as_strided 0.21% 17.671us 0.21% 17.671us 0.736us 0.000us 0.00% 0.000us 0.000us 24
|
| 4330 |
+
aten::empty_like 0.15% 12.653us 0.64% 53.763us 5.974us 0.000us 0.00% 0.000us 0.000us 9
|
| 4331 |
+
aten::empty 0.79% 66.761us 0.79% 66.761us 3.179us 0.000us 0.00% 0.000us 0.000us 21
|
| 4332 |
+
cudaLaunchKernel 3.19% 267.907us 3.19% 267.907us 22.326us 0.000us 0.00% 0.000us 0.000us 12
|
| 4333 |
+
cudaStreamIsCapturing 0.03% 2.430us 0.03% 2.430us 0.810us 0.000us 0.00% 0.000us 0.000us 3
|
| 4334 |
+
cudaFuncSetAttribute 0.04% 3.350us 0.04% 3.350us 1.117us 0.000us 0.00% 0.000us 0.000us 3
|
| 4335 |
+
cudaDeviceSynchronize 73.00% 6.134ms 73.00% 6.134ms 6.134ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4336 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4337 |
+
Self CPU time total: 8.404ms
|
| 4338 |
+
Self CUDA time total: 6.697ms
|
| 4339 |
|
| 4340 |
|
| 4341 |
impl wl p50(ms) ok
|
| 4342 |
+
torch_mem_eff cuda_attn_L128_bfloat16 1.85 True
|
| 4343 |
+
torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
|
| 4344 |
+
torch_mem_eff cuda_attn_L320_bfloat16 1.99 True
|
| 4345 |
+
torch_mem_eff cuda_attn_L384_bfloat16 2.07 True
|
| 4346 |
+
torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
|
| 4347 |
+
torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
|
| 4348 |
</pre></div>
|
| 4349 |
<div class="cell-artifacts">
|
| 4350 |
<h4>Artifacts:</h4>
|
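
[Editorial note] The torch_mem_eff traces above route through aten::_scaled_dot_product_efficient_attention into a fmha_cutlassF kernel, with the aten::contiguous/aten::clone rows reflecting layout fixes on the inputs. A minimal sketch of pinning SDPA to that backend via the torch.nn.attention API (available in recent torch releases); shapes are illustrative:

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import sdpa_kernel, SDPBackend

    q = torch.randn(1, 16, 512, 64, device="cuda", dtype=torch.bfloat16)  # (B, H, S, D)
    k, v = torch.randn_like(q), torch.randn_like(q)

    with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
        out = F.scaled_dot_product_attention(q, k, v)
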
flash_attn/impls/sage_attention.html
CHANGED
|
@@ -4104,13 +4104,14 @@ body[data-tool="eraser"] .main-content {
|
|
| 4104 |
<span class="collapse-indicators">
|
| 4105 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 4106 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 4107 |
-
<span id="uv-indicator-benchmark" style="cursor:
|
| 4108 |
</span> |
|
| 4109 |
-
Cell: benchmark | 4.
|
| 4110 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 4111 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 4112 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 4113 |
-
<a href="https://github.com/huggingface/kernels-
|
| 4114 |
</div>
|
| 4115 |
<div id="code-benchmark" class="cell-code" data-lines="32">
|
| 4116 |
<div class="code-wrap">
|
|
@@ -4155,24 +4156,27 @@ Cell: benchmark | 4.12s
|
|
| 4155 |
<div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
|
| 4156 |
impl wl p50(ms) ok
|
| 4157 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 4158 |
-
Error: module '
|
| 4159 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 4160 |
-
Error: module '
|
| 4161 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 4162 |
-
Error: module '
|
| 4163 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 4164 |
-
Error: module '
|
| 4165 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 4166 |
-
Error: module '
|
| 4167 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 4168 |
-
Error: module '
|
| 4169 |
</pre></div>
|
| 4170 |
-
<div class="
|
| 4171 |
-
|
| 4172 |
-
|
| 4173 |
-
|
| 4174 |
-
Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 21.06it/s]
|
| 4175 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4176 |
<div class="cell-artifacts">
|
| 4177 |
<h4>Artifacts:</h4>
|
| 4178 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 4104 |
<span class="collapse-indicators">
|
| 4105 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 4106 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 4107 |
+
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4108 |
</span> |
|
| 4109 |
+
Cell: benchmark | 4.69s
|
| 4110 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 4111 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 4112 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 4113 |
+
<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/sage_attention.md" target="_blank" class="github-btn">GitHub</a>
|
| 4114 |
+
<a href="https://huggingface.co/kernels-community/sage_attention" target="_blank" class="hf-btn">🤗 HF</a>
|
| 4115 |
</div>
|
| 4116 |
<div id="code-benchmark" class="cell-code" data-lines="32">
|
| 4117 |
<div class="code-wrap">
|
|
|
|
| 4156 |
<div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
|
| 4157 |
impl wl p50(ms) ok
|
| 4158 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 4159 |
+
Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
|
| 4160 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 4161 |
+
Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
|
| 4162 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 4163 |
+
Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
|
| 4164 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 4165 |
+
Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
|
| 4166 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 4167 |
+
Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
|
| 4168 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 4169 |
+
Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
|
| 4170 |
</pre></div>
|
| 4171 |
+
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4172 |
+
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4173 |
+
<div class="uv-logs-content" style="display: none;">
|
| 4174 |
+
Installed 15 packages in 14ms
|
|
|
|
| 4175 |
</div>
|
| 4176 |
+
</div>
|
| 4177 |
+
<div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
|
| 4178 |
+
Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 11.73it/s]
|
| 4179 |
+
Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 16.12it/s]</div>
|
| 4180 |
<div class="cell-artifacts">
|
| 4181 |
<h4>Artifacts:</h4>
|
| 4182 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
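
[Editorial note] Every sage_int8_fp16 workload above fails the same way: the loaded module exposes no 'fwd' attribute, so the harness never reaches the kernel. A minimal sketch (assuming the kernels package and the repo id linked in this cell) of probing the module's public surface before calling into it:

    from kernels import get_kernel

    sage = get_kernel("kernels-community/sage_attention")
    # list the entry points the module actually exposes
    print([name for name in dir(sage) if not name.startswith("_")])

    fwd = getattr(sage, "fwd", None)
    if fwd is None:
        raise AttributeError("kernel module exposes no 'fwd'; use one of the names above")
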
flash_attn/impls/xformers.html
CHANGED
@@ -4106,11 +4106,11 @@ body[data-tool="eraser"] .main-content {
| 4106 |   <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
| 4107 |   <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
| 4108 |   </span> |
| 4109 | - Cell: benchmark |
| 4109 | + Cell: benchmark | 33.71s
| 4110 |   | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
| 4111 |   <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
| 4112 |   <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
| 4113 | - <a href="https://github.com/huggingface/kernels-
| 4113 | + <a href="https://github.com/huggingface/kernels-benchmarks/blob/main/flash_attn/impls/xformers.md" target="_blank" class="github-btn">GitHub</a>
| 4114 |   </div>
| 4115 |   <div id="code-benchmark" class="cell-code" data-lines="30">
| 4116 |   <div class="code-wrap">
@@ -4158,21 +4158,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
| 4158 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4159 |   Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
| 4160 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4161 | + xformers_meff  10.98%  488.134us  52.82%  2.349ms  2.349ms  0.000us  0.00%  3.539ms  3.539ms  1
| 4162 | + xformers_flash3::flash_fwd  4.45%  198.034us  41.02%  1.824ms  608.009us  0.000us  0.00%  3.539ms  1.180ms  3
| 4163 | + flash_attn_3::fwd  1.81%  80.354us  36.57%  1.626ms  541.997us  2.647ms  100.00%  3.539ms  1.180ms  3
| 4164 | + xformers_meff  0.00%  0.000us  0.00%  0.000us  0.000us  2.648ms  100.06%  2.648ms  2.648ms  1
| 4165 | + void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us  2.647ms  100.00%  2.647ms  882.203us  3
| 4166 | + Activity Buffer Request  32.65%  1.452ms  32.65%  1.452ms  1.452ms  892.891us  33.74%  892.891us  892.891us  1
| 4167 | + aten::empty  0.78%  34.470us  0.78%  34.470us  5.745us  0.000us  0.00%  0.000us  0.000us  6
| 4168 | + cudaFuncSetAttribute  0.26%  11.370us  0.26%  11.370us  3.790us  0.000us  0.00%  0.000us  0.000us  3
| 4169 | + cudaLaunchKernel  1.08%  47.851us  1.08%  47.851us  15.950us  0.000us  0.00%  0.000us  0.000us  3
| 4170 | + aten::reshape  0.28%  12.261us  0.82%  36.420us  6.070us  0.000us  0.00%  0.000us  0.000us  6
| 4171 | + aten::view  0.54%  24.159us  0.54%  24.159us  4.026us  0.000us  0.00%  0.000us  0.000us  6
| 4172 | + cudaDeviceSynchronize  47.18%  2.098ms  47.18%  2.098ms  2.098ms  0.000us  0.00%  0.000us  0.000us  1
| 4173 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4174 | + Self CPU time total: 4.447ms
| 4175 | + Self CUDA time total: 2.647ms
@@ -4182,21 +4182,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
| 4182 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4183 |   Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
| 4184 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4185 | + xformers_meff  7.22%  318.208us  46.97%  2.070ms  2.070ms  0.000us  0.00%  3.700ms  3.700ms  1
| 4186 | + xformers_flash3::flash_fwd  3.33%  146.973us  39.20%  1.728ms  575.898us  0.000us  0.00%  3.700ms  1.233ms  3
| 4187 | + flash_attn_3::fwd  1.20%  53.004us  35.87%  1.581ms  526.907us  2.767ms  100.00%  3.700ms  1.233ms  3
| 4188 | + xformers_meff  0.00%  0.000us  0.00%  0.000us  0.000us  2.769ms  100.05%  2.769ms  2.769ms  1
| 4189 | + void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us  2.767ms  100.00%  2.767ms  922.499us  3
| 4190 | + Activity Buffer Request  33.12%  1.459ms  33.12%  1.459ms  1.459ms  932.857us  33.71%  932.857us  932.857us  1
| 4191 | + aten::empty  0.65%  28.790us  0.65%  28.790us  4.798us  0.000us  0.00%  0.000us  0.000us  6
| 4192 | + cudaFuncSetAttribute  0.13%  5.860us  0.13%  5.860us  1.953us  0.000us  0.00%  0.000us  0.000us  3
| 4193 | + cudaLaunchKernel  0.76%  33.580us  0.76%  33.580us  11.193us  0.000us  0.00%  0.000us  0.000us  3
| 4194 | + aten::reshape  0.21%  9.291us  0.54%  23.901us  3.983us  0.000us  0.00%  0.000us  0.000us  6
| 4195 | + aten::view  0.33%  14.610us  0.33%  14.610us  2.435us  0.000us  0.00%  0.000us  0.000us  6
| 4196 | + cudaDeviceSynchronize  53.03%  2.337ms  53.03%  2.337ms  2.337ms  0.000us  0.00%  0.000us  0.000us  1
| 4197 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4198 | + Self CPU time total: 4.407ms
| 4199 | + Self CUDA time total: 2.767ms
@@ -4206,21 +4206,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
| 4206 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4207 |   Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
| 4208 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4209 | + xformers_meff  6.87%  306.279us  45.67%  2.036ms  2.036ms  0.000us  0.00%  3.803ms  3.803ms  1
| 4210 | + xformers_flash3::flash_fwd  3.28%  146.193us  38.29%  1.707ms  568.871us  0.000us  0.00%  3.803ms  1.268ms  3
| 4211 | + flash_attn_3::fwd  1.22%  54.360us  35.01%  1.560ms  520.140us  2.841ms  100.00%  3.803ms  1.268ms  3
| 4212 | + xformers_meff  0.00%  0.000us  0.00%  0.000us  0.000us  2.843ms  100.05%  2.843ms  2.843ms  1
| 4213 | + void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us  2.841ms  100.00%  2.841ms  947.064us  3
| 4214 | + Activity Buffer Request  32.21%  1.435ms  32.21%  1.435ms  1.435ms  961.848us  33.85%  961.848us  961.848us  1
| 4215 | + aten::empty  0.68%  30.200us  0.68%  30.200us  5.033us  0.000us  0.00%  0.000us  0.000us  6
| 4216 | + cudaFuncSetAttribute  0.12%  5.560us  0.12%  5.560us  1.853us  0.000us  0.00%  0.000us  0.000us  3
| 4217 | + cudaLaunchKernel  0.78%  34.863us  0.78%  34.863us  11.621us  0.000us  0.00%  0.000us  0.000us  3
| 4218 | + aten::reshape  0.20%  8.808us  0.51%  22.610us  3.768us  0.000us  0.00%  0.000us  0.000us  6
| 4219 | + aten::view  0.31%  13.802us  0.31%  13.802us  2.300us  0.000us  0.00%  0.000us  0.000us  6
| 4220 | + cudaDeviceSynchronize  54.33%  2.422ms  54.33%  2.422ms  2.422ms  0.000us  0.00%  0.000us  0.000us  1
| 4221 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4222 | + Self CPU time total: 4.457ms
| 4223 | + Self CUDA time total: 2.841ms
@@ -4230,21 +4230,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
| 4230 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4231 |   Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
| 4232 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4233 | + xformers_meff  6.67%  311.798us  48.16%  2.253ms  2.253ms  0.000us  0.00%  3.854ms  3.854ms  1
| 4234 | + xformers_flash3::flash_fwd  3.68%  172.144us  40.98%  1.917ms  638.949us  0.000us  0.00%  3.854ms  1.285ms  3
| 4235 | + flash_attn_3::fwd  1.19%  55.670us  37.30%  1.745ms  581.568us  2.881ms  100.00%  3.854ms  1.285ms  3
| 4236 | + xformers_meff  0.00%  0.000us  0.00%  0.000us  0.000us  2.883ms  100.05%  2.883ms  2.883ms  1
| 4237 | + void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us  2.881ms  100.00%  2.881ms  960.465us  3
| 4238 | + Activity Buffer Request  30.77%  1.440ms  30.77%  1.440ms  1.440ms  972.603us  33.75%  972.603us  972.603us  1
| 4239 | + aten::empty  0.63%  29.580us  0.63%  29.580us  4.930us  0.000us  0.00%  0.000us  0.000us  6
| 4240 | + cudaFuncSetAttribute  0.12%  5.801us  0.12%  5.801us  1.934us  0.000us  0.00%  0.000us  0.000us  3
| 4241 | + cudaLaunchKernel  4.58%  214.036us  4.58%  214.036us  71.345us  0.000us  0.00%  0.000us  0.000us  3
| 4242 | + aten::reshape  0.19%  9.019us  0.51%  24.051us  4.009us  0.000us  0.00%  0.000us  0.000us  6
| 4243 | + aten::view  0.32%  15.032us  0.32%  15.032us  2.505us  0.000us  0.00%  0.000us  0.000us  6
| 4244 | + cudaDeviceSynchronize  51.84%  2.425ms  51.84%  2.425ms  2.425ms  0.000us  0.00%  0.000us  0.000us  1
| 4245 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4246 | + Self CPU time total: 4.678ms
| 4247 | + Self CUDA time total: 2.881ms
@@ -4254,21 +4254,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
| 4254 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4255 |   Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
| 4256 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4257 | + xformers_meff  5.88%  304.576us  42.22%  2.188ms  2.188ms  0.000us  0.00%  4.552ms  4.552ms  1
| 4258 | + xformers_flash3::flash_fwd  2.84%  147.154us  35.91%  1.861ms  620.213us  0.000us  0.00%  4.552ms  1.517ms  3
| 4259 | + flash_attn_3::fwd  1.02%  52.961us  33.07%  1.713ms  571.161us  3.412ms  100.00%  4.552ms  1.517ms  3
| 4260 | + xformers_meff  0.00%  0.000us  0.00%  0.000us  0.000us  3.414ms  100.04%  3.414ms  3.414ms  1
| 4261 | + void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us  3.412ms  100.00%  3.412ms  1.137ms  3
| 4262 | + Activity Buffer Request  27.95%  1.448ms  27.95%  1.448ms  1.448ms  1.140ms  33.41%  1.140ms  1.140ms  1
| 4263 | + aten::empty  0.56%  29.272us  0.56%  29.272us  4.879us  0.000us  0.00%  0.000us  0.000us  6
| 4264 | + cudaFuncSetAttribute  0.12%  6.180us  0.12%  6.180us  2.060us  0.000us  0.00%  0.000us  0.000us  3
| 4265 | + cudaLaunchKernel  3.41%  176.624us  3.41%  176.624us  58.875us  0.000us  0.00%  0.000us  0.000us  3
| 4266 | + aten::reshape  0.17%  9.052us  0.44%  22.882us  3.814us  0.000us  0.00%  0.000us  0.000us  6
| 4267 | + aten::view  0.27%  13.830us  0.27%  13.830us  2.305us  0.000us  0.00%  0.000us  0.000us  6
| 4268 | + cudaDeviceSynchronize  57.78%  2.994ms  57.78%  2.994ms  2.994ms  0.000us  0.00%  0.000us  0.000us  1
| 4269 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4270 | + Self CPU time total: 5.182ms
| 4271 | + Self CUDA time total: 3.412ms
@@ -4278,37 +4278,83 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
| 4278 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4279 |   Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
| 4280 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4281 | + xformers_meff  5.58%  285.697us  41.87%  2.143ms  2.143ms  0.000us  0.00%  4.544ms  4.544ms  1
| 4282 | + xformers_flash3::flash_fwd  2.91%  148.714us  35.83%  1.834ms  611.255us  0.000us  0.00%  4.544ms  1.515ms  3
| 4283 | + flash_attn_3::fwd  1.04%  53.311us  32.92%  1.685ms  561.684us  3.402ms  100.00%  4.544ms  1.515ms  3
| 4284 | + xformers_meff  0.00%  0.000us  0.00%  0.000us  0.000us  3.403ms  100.05%  3.403ms  3.403ms  1
| 4285 | + void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us  3.402ms  100.00%  3.402ms  1.134ms  3
| 4286 | + Activity Buffer Request  27.78%  1.422ms  27.78%  1.422ms  1.422ms  1.142ms  33.57%  1.142ms  1.142ms  1
| 4287 | + aten::empty  0.58%  29.640us  0.58%  29.640us  4.940us  0.000us  0.00%  0.000us  0.000us  6
| 4288 | + cudaFuncSetAttribute  0.12%  5.990us  0.12%  5.990us  1.997us  0.000us  0.00%  0.000us  0.000us  3
| 4289 | + cudaLaunchKernel  3.40%  174.134us  3.40%  174.134us  58.045us  0.000us  0.00%  0.000us  0.000us  3
| 4290 | + aten::reshape  0.17%  8.543us  0.45%  23.191us  3.865us  0.000us  0.00%  0.000us  0.000us  6
| 4291 | + aten::view  0.29%  14.648us  0.29%  14.648us  2.441us  0.000us  0.00%  0.000us  0.000us  6
| 4292 | + cudaDeviceSynchronize  58.13%  2.975ms  58.13%  2.975ms  2.975ms  0.000us  0.00%  0.000us  0.000us  1
| 4293 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4294 | + Self CPU time total: 5.118ms
| 4295 | + Self CUDA time total: 3.402ms
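The per-workload tables above are standard torch.profiler output sorted by CUDA time. A minimal sketch of how such a trace is typically captured; the callable and tensor shapes are illustrative assumptions, and the actual harness lives in cells/benchmark.py:

# Hedged sketch: capture a profiler table like the ones above.
import torch
from torch.profiler import profile, ProfilerActivity

# (batch, heads, seq_len, head_dim) in bfloat16, mirroring the workloads above
q = torch.randn(1, 16, 512, 64, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    torch.nn.functional.scaled_dot_product_attention(q, k, v)
    torch.cuda.synchronize()

# One row per op, with CPU/CUDA self and total times, as printed above.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=12))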
impl wl p50(ms) ok
|
| 4299 |
+
xformers_meff cuda_attn_L128_bfloat16 1.00 True
|
| 4300 |
+
xformers_meff cuda_attn_L256_bfloat16 1.03 True
|
| 4301 |
+
xformers_meff cuda_attn_L320_bfloat16 1.08 True
|
| 4302 |
xformers_meff cuda_attn_L384_bfloat16 1.08 True
|
| 4303 |
+
xformers_meff cuda_attn_L448_bfloat16 1.25 True
|
| 4304 |
xformers_meff cuda_attn_L512_bfloat16 1.23 True
|
| 4305 |
</pre></div>
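Each p50 above is the median over the timed repetitions recorded in attention.jsonl, which also stores p10/p90 and the raw samples. A minimal sketch of recomputing it from the artifact; the field names follow the JSONL records shown earlier in this report, and the path is illustrative:

# Hedged sketch: recompute the reported p50 from a benchmark artifact.
import json
import statistics

with open("artifacts/benchmark/attention.jsonl") as f:
    for line in f:
        rec = json.loads(line)
        raw = rec["lat_ms"]["raw_times"]  # per-repetition latencies in milliseconds
        print(rec["impl"], rec["wl"]["name"], f"p50={statistics.median(raw):.2f} ms")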
| 4306 |   <div class="uv-install-logs" id="uv-logs-benchmark">
| 4307 |   <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
| 4308 |   <div class="uv-logs-content" style="display: none;">
| 4309 | + Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
| 4310 | + Downloading networkx (1.9MiB)
| 4311 | + Downloading nvidia-cudnn-cu12 (674.0MiB)
| 4312 | + Downloading fonttools (4.7MiB)
| 4313 | + Downloading nvidia-cufft-cu12 (184.2MiB)
| 4314 | + Downloading numpy (16.2MiB)
| 4315 | + Downloading torch (846.9MiB)
| 4316 | + Downloading setuptools (1.1MiB)
| 4317 | + Downloading nvidia-cublas-cu12 (566.8MiB)
| 4318 | + Downloading sympy (6.0MiB)
| 4319 | + Downloading nvidia-cufile-cu12 (1.1MiB)
| 4320 | + Downloading nvidia-nccl-cu12 (307.4MiB)
| 4321 | + Downloading kiwisolver (1.4MiB)
| 4322 | + Downloading matplotlib (8.3MiB)
| 4323 | + Downloading triton (148.3MiB)
| 4324 | + Downloading nvidia-nvjitlink-cu12 (37.4MiB)
| 4325 | + Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
| 4326 | + Downloading nvidia-curand-cu12 (60.7MiB)
| 4327 | + Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
| 4328 | + Downloading nvidia-cusolver-cu12 (255.1MiB)
| 4329 | + Downloading nvidia-cusparselt-cu12 (273.9MiB)
| 4330 |   Downloading xformers (111.8MiB)
| 4331 | + Downloading pillow (6.7MiB)
| 4332 | + Downloading nvidia-cusparse-cu12 (274.9MiB)
| 4333 | + Downloading nvidia-cufile-cu12
| 4334 | + Downloading kiwisolver
| 4335 | + Downloading setuptools
| 4336 | + Downloading networkx
| 4337 | + Downloading fonttools
| 4338 | + Downloading pillow
| 4339 | + Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
| 4340 | + Downloading nvidia-cuda-cupti-cu12
| 4341 | + Downloading matplotlib
| 4342 | + Downloading sympy
| 4343 | + Downloading numpy
| 4344 | + Downloading nvidia-nvjitlink-cu12
| 4345 | + Downloading nvidia-curand-cu12
| 4346 | + Downloading nvidia-cuda-nvrtc-cu12
| 4347 |   Downloading xformers
| 4348 | + Downloading triton
| 4349 | + Downloading nvidia-cufft-cu12
| 4350 | + Downloading nvidia-cusolver-cu12
| 4351 | + Downloading nvidia-cusparse-cu12
| 4352 | + Downloading nvidia-cusparselt-cu12
| 4353 | + Downloading nvidia-nccl-cu12
| 4354 | + Downloading nvidia-cublas-cu12
| 4355 | + Downloading nvidia-cudnn-cu12
| 4356 | + Downloading torch
| 4357 | + Installed 38 packages in 236ms
| 4358 |   </div>
| 4359 |   </div>
| 4360 |   <div class="cell-artifacts">
flash_attn/results/artifacts/combine/latency.svg
CHANGED
flash_attn/results/combined_results.html
CHANGED
@@ -4107,7 +4107,7 @@ body[data-tool="eraser"] .main-content {
| 4107 |   <rdf:RDF>
| 4108 |   <ns2:Work>
| 4109 |   <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
| 4110 | - <dc:date>2025-10-
| 4110 | + <dc:date>2025-10-31T20:14:18.946177</dc:date>
| 4111 |   <dc:format>image/svg+xml</dc:format>
| 4112 |   <dc:creator>
| 4113 |   <ns2:Agent>
@@ -4217,96 +4217,96 @@ body[data-tool="eraser"] .main-content {
(Regenerated y-axis of the embedded latency chart: gridline paths, tick marks, and tick labels all moved; the old coordinate values are truncated in this rendering.)

@@ -4314,73 +4314,73 @@ body[data-tool="eraser"] .main-content {
(Regenerated data series: line paths and circle markers for torch-flash-ma, torch-mem-eff, xformers-meff, hf-kernels-flash-attn, and hf-kernels-flash-attn3 moved to the new latencies.)
@@ -4465,7 +4465,7 @@ body[data-tool="eraser"] .main-content {
| 4465 |   <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
| 4466 |   <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
| 4467 |   </span> |
| 4468 | - Cell: combine | 4.
| 4469 |   | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
| 4470 |   <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
| 4471 |   <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4572,47 +4572,47 @@ Summary: 6 found, 0 skipped, 0 missing
| 4572 |   COMBINED BENCHMARK SUMMARY
| 4573 |
| 4574 |   impl  wl  p50(ms)  ok
| 4575 | - hf_kernels_flash_attn  cuda_attn_L128_bfloat16  0.
| 4576 | - hf_kernels_flash_attn  cuda_attn_L256_bfloat16
| 4577 | - hf_kernels_flash_attn  cuda_attn_L320_bfloat16  1.
| 4578 | - hf_kernels_flash_attn  cuda_attn_L384_bfloat16  1.
| 4579 | - hf_kernels_flash_attn  cuda_attn_L448_bfloat16  1.
| 4580 |   hf_kernels_flash_attn  cuda_attn_L512_bfloat16  1.23  True
| 4581 | - hf_kernels_flash_attn3  cuda_attn_L128_bfloat16  0.
| 4582 | - hf_kernels_flash_attn3  cuda_attn_L256_bfloat16  0.
| 4583 | - hf_kernels_flash_attn3  cuda_attn_L320_bfloat16  1.
| 4584 | - hf_kernels_flash_attn3  cuda_attn_L384_bfloat16  1.
| 4585 | - hf_kernels_flash_attn3  cuda_attn_L448_bfloat16  1.
| 4586 |   hf_kernels_flash_attn3  cuda_attn_L512_bfloat16  1.18  True
| 4587 |   sage_int8_fp16  cuda_attn_L128_bfloat16  FAIL  False
| 4588 | - Error: module '
| 4589 |   sage_int8_fp16  cuda_attn_L256_bfloat16  FAIL  False
| 4590 | - Error: module '
| 4591 |   sage_int8_fp16  cuda_attn_L320_bfloat16  FAIL  False
| 4592 | - Error: module '
| 4593 |   sage_int8_fp16  cuda_attn_L384_bfloat16  FAIL  False
| 4594 | - Error: module '
| 4595 |   sage_int8_fp16  cuda_attn_L448_bfloat16  FAIL  False
| 4596 | - Error: module '
| 4597 |   sage_int8_fp16  cuda_attn_L512_bfloat16  FAIL  False
| 4598 | - Error: module '
| 4599 |   torch_flash_ma  cuda_attn_L128_bfloat16  1.22  True
| 4600 | - torch_flash_ma  cuda_attn_L256_bfloat16  1.
| 4601 | - torch_flash_ma  cuda_attn_L320_bfloat16  1.
| 4602 | - torch_flash_ma  cuda_attn_L384_bfloat16  1.
| 4603 | - torch_flash_ma  cuda_attn_L448_bfloat16  1.
| 4604 | - torch_flash_ma  cuda_attn_L512_bfloat16  1.
| 4605 | - torch_mem_eff  cuda_attn_L128_bfloat16  1.
| 4606 | - torch_mem_eff  cuda_attn_L256_bfloat16  1.
| 4607 | - torch_mem_eff  cuda_attn_L320_bfloat16  1.
| 4608 | - torch_mem_eff  cuda_attn_L384_bfloat16
| 4609 | - torch_mem_eff  cuda_attn_L448_bfloat16  2.
| 4610 | - torch_mem_eff  cuda_attn_L512_bfloat16  2.
| 4611 | - xformers_meff  cuda_attn_L128_bfloat16
| 4612 | - xformers_meff  cuda_attn_L256_bfloat16  1.
| 4613 | - xformers_meff  cuda_attn_L320_bfloat16  1.
| 4614 |   xformers_meff  cuda_attn_L384_bfloat16  1.08  True
| 4615 | - xformers_meff  cuda_attn_L448_bfloat16  1.
| 4616 |   xformers_meff  cuda_attn_L512_bfloat16  1.23  True
| 4617 |
| 4618 |   GENERATING COMBINED VISUALIZATION
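The combine step walks every implementation's attention.jsonl artifact and merges the records into the single table above before rendering the chart. A minimal sketch of that aggregation; the glob pattern mirrors this report's artifact layout, and the real logic lives in cells/combine.py:

# Hedged sketch: merge per-implementation benchmark JSONLs into one summary.
import glob
import json

rows = []
for path in glob.glob("*/impls/artifacts/benchmark/attention.jsonl"):
    with open(path) as f:
        for line in f:
            rec = json.loads(line)
            p50 = rec["lat_ms"]["p50"] if rec["ok"] else None
            rows.append((rec["impl"], rec["wl"]["name"], p50, rec["ok"]))

for impl, wl, p50, ok in sorted(rows):
    shown = "FAIL" if p50 is None else f"{p50:.2f}"
    print(f"{impl:>24} {wl:>28} {shown:>8} {ok}")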
@@ -4637,7 +4637,7 @@ Implementations included:
| 4637 |   <div class="uv-install-logs" id="uv-logs-combine">
| 4638 |   <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
| 4639 |   <div class="uv-logs-content" style="display: none;">
| 4640 | - Installed 37 packages in
| 4641 |   </div>
| 4642 |   </div>
| 4643 |   <div class="cell-artifacts">
@@ -4650,7 +4650,7 @@ Installed 37 packages in 190ms
| 4650 |   <rdf:RDF>
| 4651 |   <ns2:Work>
| 4652 |   <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
| 4653 | - <dc:date>2025-10-
| 4654 |   <dc:format>image/svg+xml</dc:format>
| 4655 |   <dc:creator>
| 4656 |   <ns2:Agent>
@@ -4760,96 +4760,96 @@ Installed 37 packages in 190ms
(Second embedded copy of the latency chart: the same y-axis gridlines, ticks, and labels regenerated; the old values are truncated in this rendering.)

@@ -4857,73 +4857,73 @@ Installed 37 packages in 190ms
(Second embedded copy: the same five data series redrawn with the new line paths and marker positions.)
(New-file rendering of the regenerated chart: <dc:date>2025-10-31T20:14:18.946177</dc:date>, y-axis gridlines with tick labels 1.0 through 2.2, and updated line paths and circle markers for the torch-flash-ma, torch-mem-eff, and xformers-meff series.)
|
| 4354 |
+
<use ns4:href="#mc655281e0b" x="656.37271" y="334.388976" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4355 |
+
<use ns4:href="#mc655281e0b" x="799.563935" y="340.779474" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4356 |
</g>
|
| 4357 |
</g>
|
| 4358 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4359 |
+
<path d="M 83.607806 420.013439 L 226.799032 405.003813 L 369.990258 391.079337 L 513.181484 388.024281 L 656.37271 340.106668 L 799.563935 341.194996 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
|
| 4360 |
<defs>
|
| 4361 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4362 |
</defs>
|
| 4363 |
<g clip-path="url(#p09feef2583)">
|
| 4364 |
+
<use ns4:href="#m61c8040d7e" x="83.607806" y="420.013439" style="fill: #d62728; stroke: #d62728" />
|
| 4365 |
+
<use ns4:href="#m61c8040d7e" x="226.799032" y="405.003813" style="fill: #d62728; stroke: #d62728" />
|
| 4366 |
+
<use ns4:href="#m61c8040d7e" x="369.990258" y="391.079337" style="fill: #d62728; stroke: #d62728" />
|
| 4367 |
+
<use ns4:href="#m61c8040d7e" x="513.181484" y="388.024281" style="fill: #d62728; stroke: #d62728" />
|
| 4368 |
+
<use ns4:href="#m61c8040d7e" x="656.37271" y="340.106668" style="fill: #d62728; stroke: #d62728" />
|
| 4369 |
+
<use ns4:href="#m61c8040d7e" x="799.563935" y="341.194996" style="fill: #d62728; stroke: #d62728" />
|
| 4370 |
</g>
|
| 4371 |
</g>
|
| 4372 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4373 |
+
<path d="M 83.607806 428.387702 L 226.799032 418.228917 L 369.990258 402.378716 L 513.181484 397.605262 L 656.37271 348.593258 L 799.563935 355.437105 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
|
| 4374 |
<defs>
|
| 4375 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4376 |
</defs>
|
| 4377 |
<g clip-path="url(#p09feef2583)">
|
| 4378 |
<use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
|
| 4379 |
+
<use ns4:href="#m7cd35be9cc" x="226.799032" y="418.228917" style="fill: #9467bd; stroke: #9467bd" />
|
| 4380 |
+
<use ns4:href="#m7cd35be9cc" x="369.990258" y="402.378716" style="fill: #9467bd; stroke: #9467bd" />
|
| 4381 |
+
<use ns4:href="#m7cd35be9cc" x="513.181484" y="397.605262" style="fill: #9467bd; stroke: #9467bd" />
|
| 4382 |
+
<use ns4:href="#m7cd35be9cc" x="656.37271" y="348.593258" style="fill: #9467bd; stroke: #9467bd" />
|
| 4383 |
+
<use ns4:href="#m7cd35be9cc" x="799.563935" y="355.437105" style="fill: #9467bd; stroke: #9467bd" />
|
| 4384 |
</g>
|
| 4385 |
</g>
|
| 4386 |
<g id="patch_3">
|
|
|
|
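The recoverable content of the plot above is just the per-implementation p50 curves. As a minimal sketch (matplotlib is an assumed dependency; the combine.py cells in this upload may do this differently), the chart can be redrawn from the p50 values in the summary below:

# Sketch: redraw the combined latency plot from the p50 values in the
# summary below. matplotlib is an assumed dependency; the actual
# combine.py cell in this repo may differ.
import matplotlib.pyplot as plt

seq_lens = [128, 256, 320, 384, 448, 512]
p50_ms = {
    "torch_flash_ma":         [1.22, 1.28, 1.30, 1.33, 1.50, 1.51],
    "torch_mem_eff":          [1.85, 1.95, 1.99, 2.07, 2.06, 2.25],
    "xformers_meff":          [1.00, 1.03, 1.08, 1.08, 1.25, 1.23],
    "hf_kernels_flash_attn":  [0.95, 1.00, 1.05, 1.06, 1.23, 1.23],
    "hf_kernels_flash_attn3": [0.92, 0.96, 1.01, 1.03, 1.20, 1.18],
}

fig, ax = plt.subplots(figsize=(8, 4.5))
for impl, lat in p50_ms.items():
    ax.plot(seq_lens, lat, marker="o", label=impl)  # one series per impl
ax.set_xlabel("sequence length")
ax.set_ylabel("p50 latency (ms)")
ax.grid(alpha=0.3)
ax.legend()
fig.savefig("latency.svg")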
Cell: combine | 4.31s
COMBINED BENCHMARK SUMMARY

impl                    wl                        p50(ms)  ok
hf_kernels_flash_attn   cuda_attn_L128_bfloat16      0.95  True
hf_kernels_flash_attn   cuda_attn_L256_bfloat16      1.00  True
hf_kernels_flash_attn   cuda_attn_L320_bfloat16      1.05  True
hf_kernels_flash_attn   cuda_attn_L384_bfloat16      1.06  True
hf_kernels_flash_attn   cuda_attn_L448_bfloat16      1.23  True
hf_kernels_flash_attn   cuda_attn_L512_bfloat16      1.23  True
hf_kernels_flash_attn3  cuda_attn_L128_bfloat16      0.92  True
hf_kernels_flash_attn3  cuda_attn_L256_bfloat16      0.96  True
hf_kernels_flash_attn3  cuda_attn_L320_bfloat16      1.01  True
hf_kernels_flash_attn3  cuda_attn_L384_bfloat16      1.03  True
hf_kernels_flash_attn3  cuda_attn_L448_bfloat16      1.20  True
hf_kernels_flash_attn3  cuda_attn_L512_bfloat16      1.18  True
sage_int8_fp16          cuda_attn_L128_bfloat16      FAIL  False
  Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16          cuda_attn_L256_bfloat16      FAIL  False
  Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16          cuda_attn_L320_bfloat16      FAIL  False
  Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16          cuda_attn_L384_bfloat16      FAIL  False
  Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16          cuda_attn_L448_bfloat16      FAIL  False
  Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
sage_int8_fp16          cuda_attn_L512_bfloat16      FAIL  False
  Error: module 'sage_attention_ef0573391bb63704' has no attribute 'fwd'
torch_flash_ma          cuda_attn_L128_bfloat16      1.22  True
torch_flash_ma          cuda_attn_L256_bfloat16      1.28  True
torch_flash_ma          cuda_attn_L320_bfloat16      1.30  True
torch_flash_ma          cuda_attn_L384_bfloat16      1.33  True
torch_flash_ma          cuda_attn_L448_bfloat16      1.50  True
torch_flash_ma          cuda_attn_L512_bfloat16      1.51  True
torch_mem_eff           cuda_attn_L128_bfloat16      1.85  True
torch_mem_eff           cuda_attn_L256_bfloat16      1.95  True
torch_mem_eff           cuda_attn_L320_bfloat16      1.99  True
torch_mem_eff           cuda_attn_L384_bfloat16      2.07  True
torch_mem_eff           cuda_attn_L448_bfloat16      2.06  True
torch_mem_eff           cuda_attn_L512_bfloat16      2.25  True
xformers_meff           cuda_attn_L128_bfloat16      1.00  True
xformers_meff           cuda_attn_L256_bfloat16      1.03  True
xformers_meff           cuda_attn_L320_bfloat16      1.08  True
xformers_meff           cuda_attn_L384_bfloat16      1.08  True
xformers_meff           cuda_attn_L448_bfloat16      1.25  True
xformers_meff           cuda_attn_L512_bfloat16      1.23  True

GENERATING COMBINED VISUALIZATION
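A hedged sketch of how a summary like the one above can be folded out of the per-impl JSONL records (the field names impl, wl.name, lat_ms.p50, ok, and err match the records elsewhere in this upload; the actual combine cell may differ):

# Sketch: fold benchmark JSONL records into an impl/workload/p50/ok table.
# Field names follow the records in this upload; the real combine.py may differ.
import json
from pathlib import Path

recs = [json.loads(line) for line in Path("attention.jsonl").read_text().splitlines()]
recs.sort(key=lambda r: (r["impl"], r["wl"]["name"]))

print(f"{'impl':<24} {'wl':<26} {'p50(ms)':>8} ok")
for r in recs:
    p50 = f"{r['lat_ms']['p50']:.2f}" if r["ok"] else "FAIL"
    print(f"{r['impl']:<24} {r['wl']['name']:<26} {p50:>8} {r['ok']}")
    if not r["ok"] and r.get("err"):
        print(f"  Error: {r['err']}")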
UV Install Logs: Installed 37 packages in 225ms
index.html
CHANGED
@@ -4097,35 +4097,54 @@ body[data-tool="eraser"] .main-content {
 </div>
 
 <div class="main-content">
-<
+<div class="linkbar">
+<a target="_blank" href="https://github.com/huggingface/kernels">Python Library</a> |
+<a target="_blank" href="https://github.com/huggingface/kernel-builder">Builder</a> |
+<a target="_blank" href="https://github.com/huggingface/kernels-community">Community</a> |
+<a target="_blank" href="https://huggingface.co/kernels-community">Community Hub</a> |
+<a target="_blank" href="https://github.com/huggingface/kernels-benchmarks">Benchmarks</a>
+</div>
+
+<p><br/></p>
+<h1>KERNELS COMMUNITY BENCHMARKS</h1>
 <p>This report aggregates latency and performance benchmarks across core model components.<br />
 Each section includes:<br />
 - A latency visualization<br />
 - Links to detailed implementation benchmarks </p>
 <h2>TABLE OF CONTENTS</h2>
 <ul>
-<li><a href="#
-<li><a href="#layer-normalization">LAYER NORMALIZATION</a></li>
-<li><a href="#rotary-position-embeddings">ROTARY POSITION EMBEDDINGS</a></li>
+<li><a href="#activation-functions">ACTIVATION FUNCTIONS</a></li>
 <li><a href="#flash-attention">FLASH ATTENTION</a></li>
+<li><a href="#deformable-detr">DEFORMABLE DETR</a></li>
+<li><a href="#openai-style-moe">OPENAI-STYLE MOE</a></li>
+<li><a href="#rotary-position-embeddings">ROTARY POSITION EMBEDDINGS</a></li>
 <li><a href="#causal-conv1d">CAUSAL CONV1D</a></li>
-<li><a href="#
-<li><a href="#notes">NOTES</a></li>
+<li><a href="#layer-normalization">LAYER NORMALIZATION</a></li>
 </ul>
+<h2>RUN YOURSELF</h2>
+<p>To run the benchmarks locally, clone the repository and use <code>uvx</code> to build and run the benchmarks:</p>
+<p>Note: benches are made to run on a machine with a compatible NVIDIA GPU and CUDA installed; other hardware may not work as expected.</p>
+<div class="codehilite"><pre><span></span><code>git<span class="w"> </span>clone<span class="w"> </span>https://github.com/huggingface/kernels-benchmarks.git
+<span class="nb">cd</span><span class="w"> </span>kernels-benchmarks
+uvx<span class="w"> </span>https://github.com/drbh/uvnote.git<span class="w"> </span>build<span class="w"> </span>benches
+</code></pre></div>
+
 <h2>METHODOLOGY</h2>
-<p>Each benchmark is run with the
+<p>Each benchmark is run with the
+<a target="_blank" href="https://github.com/huggingface/kernels-benchmarks">Kernels Benchmarking Framework</a> and follows these principles:<br />
 - a reference implementation (usually PyTorch native) is included for baseline comparison<br />
 - multiple input sizes and batch sizes are tested to reflect real-world usage<br />
 - runs are repeatable via python virtual environments and documented dependencies<br />
 - results are collected and visualized using standardized scripts </p>
-<
+<p><br/></p>
+<h2>BENCHMARKS</h2>
 <div class="alert">
 <strong>Note:</strong> Latency values are measured in milliseconds (ms). Lower values indicate better performance.
 </div>
 
-<h2>
+<h2>ACTIVATION FUNCTIONS</h2>
 <div class="artifact-preview">
-<img src="
+<img src="activation/results/artifacts/combine/latency.svg" alt="Activation Latency" width="800">
 </div>
 
 <table>
@@ -4133,32 +4152,40 @@ Each section includes:<br />
 <tr>
 <th>Implementation</th>
 <th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
 </tr>
 </thead>
 <tbody>
 <tr>
-<td>HF Kernels
-<td>HuggingFace kernels implementation</td>
+<td>HF Kernels SwiGLU</td>
+<td>HuggingFace kernels SwiGLU implementation</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/activation">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/activation">HF</a></td>
+<td><a href="activation/impls/hf_kernels_swiglu.html">Bench</a></td>
 </tr>
 <tr>
-<td>PyTorch
-<td>PyTorch native implementation</td>
+<td>PyTorch SwiGLU</td>
+<td>PyTorch native SwiGLU implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="activation/impls/torch_swiglu.html">Bench</a></td>
 </tr>
 </tbody>
 </table>
 <p align="center">
-
-
-onclick="window.location.href='layer_norm/'"
+<button
+onclick="window.location.href='/#/activation/'"
 class="btn">
 Explore Full Bench
 </button>
 </p>
 
 <hr />
-<h2>
+<h2>FLASH ATTENTION</h2>
 <div class="artifact-preview">
-<img src="
+<img src="flash_attn/results/artifacts/combine/latency.svg" alt="Flash Attention Latency" width="800">
 </div>
 
 <table>
@@ -4166,31 +4193,68 @@ Each section includes:<br />
 <tr>
 <th>Implementation</th>
 <th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
 </tr>
 </thead>
 <tbody>
 <tr>
-<td>
-<td>
+<td>Flash Attention</td>
+<td>Torch SDPA Flash Attention implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="flash_attn/impls/flash_attention.html">Bench</a></td>
 </tr>
 <tr>
-<td>
-<td>
+<td>HF Kernels Flash Attention 2</td>
+<td>HuggingFace kernels Flash Attention</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/flash-attn2">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/flash-attn2">HF</a></td>
+<td><a href="flash_attn/impls/hf_kernels_flash_attn.html">Bench</a></td>
+</tr>
+<tr>
+<td>HF Kernels Flash Attention 3</td>
+<td>HuggingFace kernels Flash Attention 3</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/flash-attn3">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/flash-attn3">HF</a></td>
+<td><a href="flash_attn/impls/hf_kernels_flash_attn3.html">Bench</a></td>
+</tr>
+<tr>
+<td>Memory Efficient Attention</td>
+<td>Memory efficient attention implementation</td>
+<td></td>
+<td>-</td>
+<td><a href="flash_attn/impls/mem_efficient_attention.html">Bench</a></td>
+</tr>
+<tr>
+<td>Sage Attention</td>
+<td>Sage attention implementation</td>
+<td></td>
+<td><a href="https://huggingface.co/kernels-community/sage_attention">HF</a></td>
+<td><a href="flash_attn/impls/sage_attention.html">Bench</a></td>
+</tr>
+<tr>
+<td>xFormers</td>
+<td>xFormers attention implementation</td>
+<td><a href="https://github.com/facebookresearch/xformers">GitHub</a></td>
+<td>-</td>
+<td><a href="flash_attn/impls/xformers.html">Bench</a></td>
 </tr>
 </tbody>
 </table>
 <p align="center">
 <button
-onclick="window.location.href='
+onclick="window.location.href='flash_attn/'"
 class="btn">
 Explore Full Bench
 </button>
 </p>
 
 <hr />
-<h2>
+<h2>DEFORMABLE DETR</h2>
 <div class="artifact-preview">
-<img src="
+<img src="deformable_detr/results/artifacts/combine/latency.svg" alt="Deformable DETR Latency" width="800">
 </div>
 
 <table>
@@ -4198,38 +4262,72 @@ Each section includes:<br />
 <tr>
 <th>Implementation</th>
 <th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
 </tr>
 </thead>
 <tbody>
 <tr>
-<td>
-<td>
-</
-<
-<td>
-<td>HuggingFace kernels Flash Attention</td>
+<td>HF Kernels Deformable DETR</td>
+<td>HuggingFace kernels Deformable DETR implementation</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/deformable-detr">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/deformable-detr">HF</a></td>
+<td><a href="deformable_detr/impls/hf_kernels_deformable_detr.html">Bench</a></td>
 </tr>
 <tr>
-<td>
-<td>
+<td>PyTorch Deformable DETR</td>
+<td>PyTorch native Deformable DETR implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="deformable_detr/impls/torch_deformable_detr.html">Bench</a></td>
 </tr>
+</tbody>
+</table>
+<p align="center">
+<button
+onclick="window.location.href='deformable_detr/'"
+class="btn">
+Explore Full Bench
+</button>
+</p>
+
+<hr />
+<h2>OPENAI-STYLE MOE</h2>
+<div class="artifact-preview">
+<img src="openai_moe/results/artifacts/combine/latency.svg" alt="OpenAI MoE Latency" width="800">
+</div>
+
+<table>
+<thead>
 <tr>
-<
-<
+<th>Implementation</th>
+<th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
 </tr>
+</thead>
+<tbody>
 <tr>
-<td>
-<td>
+<td>GptOssExperts</td>
+<td>GPT OSS reference OpenAI-style MoE</td>
+<td></td>
+<td></td>
+<td><a href="openai_moe/impls/gpt_oss_moe.html">Bench</a></td>
 </tr>
 <tr>
-<td>
-<td>
+<td>Binned PyTorch</td>
+<td>Binned PyTorch OpenAI-style MoE implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="openai_moe/impls/binned_torch.html">Bench</a></td>
 </tr>
 </tbody>
 </table>
 <p align="center">
 <button
-onclick="window.location.href='
+onclick="window.location.href='openai_moe/'"
 class="btn">
 Explore Full Bench
 </button>
@@ -4246,16 +4344,25 @@ Each section includes:<br />
 <tr>
 <th>Implementation</th>
 <th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td>HF Kernels Causal Conv1D</td>
 <td>HuggingFace kernels implementation</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/causal-conv1d">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/causal-conv1d">HF</a></td>
+<td><a href="causal_conv1d/impls/hf_kernels_causal_conv1d.html">Bench</a></td>
 </tr>
 <tr>
 <td>PyTorch Causal Conv1D</td>
 <td>PyTorch native implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="causal_conv1d/impls/torch_causal_conv1d.html">Bench</a></td>
 </tr>
 </tbody>
 </table>
@@ -4268,9 +4375,9 @@ Each section includes:<br />
 </p>
 
 <hr />
-<h2>
+<h2>ROTARY POSITION EMBEDDINGS</h2>
 <div class="artifact-preview">
-<img src="
+<img src="rotary/results/artifacts/combine/latency.svg" alt="Rotary Position Embeddings Latency" width="800">
 </div>
 
 <table>
@@ -4278,28 +4385,77 @@ Each section includes:<br />
 <tr>
 <th>Implementation</th>
 <th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
 </tr>
 </thead>
 <tbody>
 <tr>
-<td>HF Kernels
-<td>HuggingFace kernels
+<td>HF Kernels Rotary</td>
+<td>HuggingFace kernels implementation</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/rotary">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/rotary">HF</a></td>
+<td><a href="rotary/impls/hf_kernels_rotary.html">Bench</a></td>
 </tr>
 <tr>
-<td>PyTorch
-<td>PyTorch native
+<td>PyTorch Rotary</td>
+<td>PyTorch native implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="rotary/impls/torch_rotary.html">Bench</a></td>
 </tr>
 </tbody>
 </table>
 <p align="center">
 <button
-onclick="window.location.href='
+onclick="window.location.href='rotary/'"
 class="btn">
 Explore Full Bench
 </button>
 </p>
 
 <hr />
+<h2>LAYER NORMALIZATION</h2>
+<div class="artifact-preview">
+<img src="layer_norm/results/artifacts/combine/latency.svg" alt="Layer Norm Latency" width="800">
+</div>
+
+<table>
+<thead>
+<tr>
+<th>Implementation</th>
+<th>Description</th>
+<th>Source</th>
+<th>HF</th>
+<th>Bench</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>HF Kernels Layer Norm</td>
+<td>HuggingFace kernels implementation</td>
+<td><a href="https://github.com/huggingface/kernels-community/tree/main/layer-norm">GitHub</a></td>
+<td><a href="https://huggingface.co/kernels-community/layer-norm">HF</a></td>
+<td><a href="layer_norm/impls/hf_kernels_layer_norm.html">Bench</a></td>
+</tr>
+<tr>
+<td>PyTorch Layer Norm</td>
+<td>PyTorch native implementation</td>
+<td>-</td>
+<td>-</td>
+<td><a href="layer_norm/impls/torch_layer_norm.html">Bench</a></td>
+</tr>
+</tbody>
+</table>
+<p align="center">
+<button
+onclick="window.location.href='layer_norm/'"
+class="btn">
+Explore Full Bench
+</button>
+</p>
+
 <style>
 .controls {
 display: none !important;
@@ -4343,12 +4499,10 @@ Each section includes:<br />
 }
 :root {
 --bg-alert: #0069cbff;
--border-alert: #001628ff;
 }
 .alert {
-padding: 5px;
+padding: 5px 10px;
 background-color: var(--bg-alert);
-border-left: 6px solid var(--border-alert);
 margin-bottom: 10px;
 border-radius: 6px;
 }
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
CHANGED
@@ -1,4 +1,4 @@
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
-{"ts": "2025-10-
+{"ts": "2025-10-31T20:00:11Z", "run": "f2cf664f6646484f88815be637f5bc9d", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.807951000012963, "p50": 0.8174310000299556, "p90": 0.8198709999760467, "mean": 0.8162470000002031, "iqr": 0.0038399999766625115, "raw_times": [0.8160309999993842, 0.8198709999760467, 0.8174310000299556, 0.807951000012963, 0.819950999982666], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8318710000025931, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-31T20:00:11Z", "run": "f2cf664f6646484f88815be637f5bc9d", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6729929999996784, "p50": 1.6790130000003956, "p90": 1.685203000022284, "mean": 1.6802827999867986, "iqr": 0.007120000077520672, "raw_times": [1.685203000022284, 1.6790130000003956, 1.6729929999996784, 1.686121999966872, 1.6780829999447633], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6821429999822612, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-31T20:00:12Z", "run": "f2cf664f6646484f88815be637f5bc9d", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6009309999844845, "p50": 1.6056009999942944, "p90": 1.611341000000266, "mean": 1.606853000009778, "iqr": 0.008409999963987502, "raw_times": [1.6009309999844845, 1.6056009999942944, 1.613461000033567, 1.6029310000362784, 1.611341000000266], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6386120000220217, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
+{"ts": "2025-10-31T20:00:12Z", "run": "f2cf664f6646484f88815be637f5bc9d", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.3123249999675863, "p50": 3.327974000001177, "p90": 3.3289149999973233, "mean": 3.3240905999946335, "iqr": 0.010180999993281148, "raw_times": [3.3325050000030387, 3.3289149999973233, 3.3123249999675863, 3.318734000004042, 3.327974000001177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.335275000040383, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
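Each record stores the five raw_times next to the derived stats, so the summary numbers can be re-checked. A small sketch that reproduces p10/p50/p90/mean/iqr for these records (the floor-indexed percentile is inferred from the stored values and is an assumption about kernels-benchmark-tools, not a documented detail):

# Sketch: recompute the lat_ms summary stats from one record's raw_times.
# Floor-indexed percentiles reproduce the stored p10/p50/p90/iqr exactly
# for these records; the tool's actual estimator is an assumption.
import json
import statistics

with open("layer_norm.jsonl") as f:
    rec = json.loads(f.readline())

times = sorted(rec["lat_ms"]["raw_times"])

def pct(vals, q):
    return vals[int((len(vals) - 1) * q)]  # floor interpolation

assert abs(pct(times, 0.50) - rec["lat_ms"]["p50"]) < 1e-12
print("p10 ", pct(times, 0.10))
print("p50 ", pct(times, 0.50))
print("p90 ", pct(times, 0.90))
print("mean", statistics.mean(times))
print("iqr ", pct(times, 0.75) - pct(times, 0.25))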
layer_norm/impls/cells/benchmark.py
CHANGED
@@ -3,7 +3,6 @@
 # dependencies = [
 # "numpy",
 # "torch==2.8.0",
-# "kernels",
 # "kernels-benchmark-tools",
 # ]
 #
@@ -13,37 +12,15 @@
 import torch
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-from kernels import get_kernel
 
-# Load the layer norm kernel
-layer_norm_kernel = get_kernel("kernels-community/layer-norm")
 
-
-
-    B, S, D = x.shape
-    # The kernel expects [N, D] input; support beta (bias) if provided.
-    out = layer_norm_kernel.dropout_add_ln_fwd(
-        input=x.view(-1, D),
-        gamma=weight,
-        beta=bias,
-        rowscale=None,
-        colscale=None,
-        x0_subset=None,
-        z_subset=None,
-        dropout_p=0.0,
-        epsilon=eps,
-        rowscale_const=1.0,
-        z_numrows=S,
-        gen=None,
-        residual_in_fp32=False,
-        is_rms_norm=False,
-    )[0].view(B, S, D)
-    return out
+def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
+    return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.LAYER_NORM,
-    impl_name="
-    impl_tags={"family": "
-    impl_func=
+    impl_name="torch_layer_norm",
+    impl_tags={"family": "torch", "op": "layer_norm"},
+    impl_func=torch_layer_norm,
 )
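The replacement reference path is plain torch.nn.functional.layer_norm over the last dimension. A quick sanity sketch of what that computes, checked against an explicit mean/variance normalization (shapes and tolerances here are illustrative, not the benchmark's):

# Sketch: torch_layer_norm above is F.layer_norm over the last dim;
# compare it against a hand-rolled normalization.
import torch
import torch.nn.functional as F

x = torch.randn(4, 128, 4096)
w, b = torch.randn(4096), torch.randn(4096)
eps = 1e-5

ref = F.layer_norm(x, (x.shape[-1],), w, b, eps)

mu = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, unbiased=False, keepdim=True)  # population variance, as layer_norm uses
manual = (x - mu) / torch.sqrt(var + eps) * w + b

print(torch.allclose(ref, manual, rtol=1e-4, atol=1e-5))  # expect True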
layer_norm/impls/hf_kernels_layer_norm.html
CHANGED
|
@@ -4107,11 +4107,12 @@ body[data-tool="eraser"] .main-content {
|
|
| 4107 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 4108 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4109 |
</span> |
|
| 4110 |
-
Cell: benchmark |
|
| 4111 |
[layer_norm/impls/hf_kernels_layer_norm.html, removed side: the benchmark cell header (old run time, old GitHub link), the four PROFILE TRACE tables for hf_kernels_layer_norm (LN_B16_S2048_D4096, LN_B16_S2048_D8192, LN_B16_S4096_D4096, LN_B16_S4096_D8192), the old p50 summary rows, and the old install/fetch logs. Every removed line is truncated in this view, so the previous values are unrecoverable; the updated cell, which adds a 🤗 HF link next to the GitHub link, follows in full.]
layer_norm/impls/hf_kernels_layer_norm.html, updated side:

<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
</span> |
Cell: benchmark | 10.09s
<button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/layer_norm/impls/hf_kernels_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
<a href="https://huggingface.co/kernels-community/layer-norm" target="_blank" class="hf-btn">🤗 HF</a>
</div>
<div id="code-benchmark" class="cell-code" data-lines="49">
<div class="code-wrap">
[…]

PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 5.01% 203.177us 46.78% 1.895ms 1.895ms 0.000us 0.00% 3.141ms 3.141ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 1.71% 69.312us 41.16% 1.668ms 555.914us 2.399ms 100.00% 3.141ms 1.047ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.401ms 100.06% 2.401ms 2.401ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.399ms 100.00% 2.399ms 799.825us 3
Activity Buffer Request 36.95% 1.497ms 36.95% 1.497ms 1.497ms 742.012us 30.92% 742.012us 742.012us 1
aten::view 0.61% 24.559us 0.61% 24.559us 4.093us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.20% 48.622us 1.20% 48.622us 5.402us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.23% 9.170us 0.23% 9.170us 3.057us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.07% 43.390us 1.07% 43.390us 14.463us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 53.22% 2.156ms 53.22% 2.156ms 2.156ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.052ms
Self CUDA time total: 2.399ms

PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 1.88% 119.443us 26.75% 1.701ms 1.701ms 0.000us 0.00% 6.407ms 6.407ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.71% 45.121us 24.67% 1.568ms 522.677us 4.827ms 100.00% 6.407ms 2.136ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.829ms 100.03% 4.829ms 4.829ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.827ms 100.00% 4.827ms 1.609ms 3
Activity Buffer Request 22.91% 1.456ms 22.91% 1.456ms 1.456ms 1.580ms 32.72% 1.580ms 1.580ms 1
aten::view 0.21% 13.200us 0.21% 13.200us 2.200us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.51% 32.711us 0.51% 32.711us 3.635us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.289us 0.08% 5.289us 1.763us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.45% 28.522us 0.45% 28.522us 9.507us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 73.25% 4.656ms 73.25% 4.656ms 4.656ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.357ms
Self CUDA time total: 4.827ms

PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 1.89% 118.801us 26.85% 1.686ms 1.686ms 0.000us 0.00% 6.309ms 6.309ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.78% 49.183us 24.77% 1.555ms 518.493us 4.763ms 100.00% 6.309ms 2.103ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.765ms 100.03% 4.765ms 4.765ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.763ms 100.00% 4.763ms 1.588ms 3
Activity Buffer Request 22.96% 1.442ms 22.96% 1.442ms 1.442ms 1.546ms 32.46% 1.546ms 1.546ms 1
aten::view 0.19% 11.741us 0.19% 11.741us 1.957us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.49% 30.460us 0.49% 30.460us 3.384us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.920us 0.08% 4.920us 1.640us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.46% 29.050us 0.46% 29.050us 9.683us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 73.15% 4.593ms 73.15% 4.593ms 4.593ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.279ms
Self CUDA time total: 4.763ms

PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 1.11% 112.814us 7.31% 743.908us 743.908us 0.000us 0.00% 12.737ms 12.737ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.47% 47.722us 6.09% 619.105us 206.368us 9.594ms 100.00% 12.737ms 4.246ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.595ms 100.02% 9.595ms 9.595ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.594ms 100.00% 9.594ms 3.198ms 3
Activity Buffer Request 2.50% 254.176us 2.50% 254.176us 254.176us 3.143ms 32.76% 3.143ms 3.143ms 1
aten::view 0.12% 11.989us 0.12% 11.989us 1.998us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.30% 30.280us 0.30% 30.280us 3.364us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.000us 0.05% 5.000us 1.667us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 2.77% 281.927us 2.77% 281.927us 93.976us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 92.69% 9.430ms 92.69% 9.430ms 9.430ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 10.174ms
Self CUDA time total: 9.594ms

impl wl p50(ms) ok
hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
hf_kernels_layer_norm LN_B16_S4096_D4096 1.66 True
hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True
</pre></div>
<div class="uv-install-logs" id="uv-logs-benchmark">
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
<div class="uv-logs-content" style="display: none;">
Downloading hf-xet (3.2MiB)
Downloading hf-xet
Installed 52 packages in 218ms
</div>
</div>
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.22it/s]
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.45it/s]</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
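To make the cell above easier to follow, here is a minimal sketch of the measurement loop behind the p50(ms) column, timed with CUDA events. It is not the exact harness: it benchmarks the eager PyTorch op, and only notes in a comment where the hf_kernels variant would instead time a callable fetched from the Hub via kernels.get_kernel("kernels-community/layer-norm") (the 🤗 HF link above); the attribute that kernel module exposes is not shown on this page, so it is left as an assumption.

import torch
import torch.nn.functional as F

# Median-latency helper in the spirit of this benchmark cell: warm up,
# then time each rep with CUDA events and report the median in ms.
def bench_p50_ms(fn, *args, reps=5, warmup=2):
    for _ in range(warmup):
        fn(*args)
    torch.cuda.synchronize()
    times = []
    for _ in range(reps):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn(*args)
        end.record()
        torch.cuda.synchronize()
        times.append(start.elapsed_time(end))  # milliseconds
    return sorted(times)[len(times) // 2]

# LN_B16_S2048_D4096 workload: batch 16, seq 2048, hidden 4096, bf16 on CUDA.
x = torch.randn(16, 2048, 4096, device="cuda", dtype=torch.bfloat16)
w = torch.ones(4096, device="cuda", dtype=torch.bfloat16)
b = torch.zeros(4096, device="cuda", dtype=torch.bfloat16)
# hf_kernels run would swap in the Hub kernel, e.g.:
#   ln = kernels.get_kernel("kernels-community/layer-norm")  # exposed entry point is an assumption
print(f"p50: {bench_p50_ms(lambda t: F.layer_norm(t, (4096,), w, b), x):.2f} ms")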
layer_norm/impls/torch_layer_norm.html
CHANGED
|
[layer_norm/impls/torch_layer_norm.html, removed side: the nv cell header (old run time, old GitHub link) and the old fan/temp/power row of the nvidia-smi table; the benchmark cell header (old run time, old uv-logs indicator, old GitHub link); the four PROFILE TRACE tables for torch_layer_norm (LN_B16_S2048_D4096, LN_B16_S2048_D8192, LN_B16_S4096_D4096, LN_B16_S4096_D8192) and the old LN_B16_S4096_D8192 summary row; plus a UV Install Logs block ("Installed 37 packages in 236ms") that was dropped entirely in the update. The removed lines are truncated in this view; the updated cells follow in full.]
layer_norm/impls/torch_layer_norm.html, updated side:

<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
</span> |
Cell: nv | 0.23s
<button class="run-btn" onclick="runCell('nv')">▶ run</button>
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
</div>
<div id="code-nv" class="cell-code" data-lines="2">
<div class="code-wrap">
[…]
</div>
</div>
<div id="output-nv" class="cell-output">
<div class="cell-stdout"><pre class="stdout-text">Fri Oct 31 20:00:08 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
[…]
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
| N/A 32C P0 85W / 350W | 0MiB / 46068MiB | 22% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
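The nv cell that produced this table is essentially a one-liner around nvidia-smi; a minimal sketch, assuming the binary is on PATH:

import subprocess

# Print the same device report shown above (driver, CUDA version, GPU state).
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True, check=True).stdout)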
<span class="collapse-indicators">
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
</span> |
Cell: benchmark | 3.89s
<button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
<a href="https://github.com/huggingface/kernels-benchmarks/blob/main/layer_norm/impls/torch_layer_norm.md" target="_blank" class="github-btn">GitHub</a>
</div>
<div id="code-benchmark" class="cell-code" data-lines="26">
<div class="code-wrap">
[…]

PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_layer_norm 3.88% 150.743us 46.08% 1.790ms 1.790ms 0.000us 0.00% 3.031ms 3.031ms 1
aten::layer_norm 0.46% 17.882us 42.20% 1.639ms 546.344us 0.000us 0.00% 3.031ms 1.010ms 3
aten::native_layer_norm 2.05% 79.451us 41.74% 1.621ms 540.384us 2.322ms 100.00% 3.031ms 1.010ms 3
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.323ms 100.06% 2.323ms 2.323ms 1
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.322ms 100.00% 2.322ms 773.873us 3
Activity Buffer Request 37.13% 1.442ms 37.13% 1.442ms 1.442ms 709.660us 30.57% 709.660us 709.660us 1
aten::empty 1.23% 47.623us 1.23% 47.623us 5.291us 0.000us 0.00% 0.000us 0.000us 9
cudaLaunchKernel 1.17% 45.281us 1.17% 45.281us 15.094us 0.000us 0.00% 0.000us 0.000us 3
aten::view 0.17% 6.710us 0.17% 6.710us 1.118us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 53.92% 2.094ms 53.92% 2.094ms 2.094ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.884ms
Self CUDA time total: 2.322ms

PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_layer_norm 1.99% 129.362us 27.22% 1.769ms 1.769ms 0.000us 0.00% 6.490ms 6.490ms 1
aten::layer_norm 0.17% 10.831us 25.23% 1.640ms 546.698us 0.000us 0.00% 6.490ms 2.163ms 3
aten::native_layer_norm 0.91% 59.414us 25.06% 1.629ms 543.087us 4.900ms 100.00% 6.490ms 2.163ms 3
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.901ms 100.03% 4.901ms 4.901ms 1
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.900ms 100.00% 4.900ms 1.633ms 3
Activity Buffer Request 23.14% 1.504ms 23.14% 1.504ms 1.504ms 1.590ms 32.46% 1.590ms 1.590ms 1
aten::empty 0.46% 29.779us 0.46% 29.779us 3.309us 0.000us 0.00% 0.000us 0.000us 9
cudaLaunchKernel 0.49% 31.860us 0.49% 31.860us 10.620us 0.000us 0.00% 0.000us 0.000us 3
aten::view 0.06% 3.750us 0.06% 3.750us 0.625us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 72.78% 4.732ms 72.78% 4.732ms 4.732ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.501ms
Self CUDA time total: 4.900ms

PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_layer_norm 1.73% 108.072us 26.73% 1.674ms 1.674ms 0.000us 0.00% 6.258ms 6.258ms 1
aten::layer_norm 0.14% 8.910us 25.01% 1.566ms 522.010us 0.000us 0.00% 6.258ms 2.086ms 3
aten::native_layer_norm 0.87% 54.314us 24.86% 1.557ms 519.040us 4.736ms 100.00% 6.258ms 2.086ms 3
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.737ms 100.03% 4.737ms 4.737ms 1
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.736ms 100.00% 4.736ms 1.579ms 3
Activity Buffer Request 23.05% 1.444ms 23.05% 1.444ms 1.444ms 1.522ms 32.13% 1.522ms 1.522ms 1
aten::empty 0.46% 28.531us 0.46% 28.531us 3.170us 0.000us 0.00% 0.000us 0.000us 9
cudaLaunchKernel 0.43% 26.620us 0.43% 26.620us 8.873us 0.000us 0.00% 0.000us 0.000us 3
aten::view 0.06% 4.039us 0.06% 4.039us 0.673us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 73.27% 4.589ms 73.27% 4.589ms 4.589ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.263ms
Self CUDA time total: 4.736ms

PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_layer_norm 0.85% 101.562us 19.08% 2.285ms 2.285ms 0.000us 0.00% 13.093ms 13.093ms 1
aten::layer_norm 0.08% 9.511us 18.23% 2.184ms 727.942us 0.000us 0.00% 13.093ms 4.364ms 3
aten::native_layer_norm 0.48% 57.051us 18.15% 2.174ms 724.772us 9.846ms 100.00% 13.093ms 4.364ms 3
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.848ms 100.01% 9.848ms 9.848ms 1
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.846ms 100.00% 9.846ms 3.282ms 3
Activity Buffer Request 11.95% 1.431ms 11.95% 1.431ms 1.431ms 3.247ms 32.97% 3.247ms 3.247ms 1
aten::empty 0.24% 29.142us 0.24% 29.142us 3.238us 0.000us 0.00% 0.000us 0.000us 9
cudaLaunchKernel 5.45% 653.217us 5.45% 653.217us 217.739us 0.000us 0.00% 0.000us 0.000us 3
aten::view 0.03% 3.890us 0.03% 3.890us 0.648us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 80.92% 9.693ms 80.92% 9.693ms 9.693ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 11.979ms
Self CUDA time total: 9.846ms

impl wl p50(ms) ok
torch_layer_norm LN_B16_S2048_D4096 0.82 True
torch_layer_norm LN_B16_S2048_D8192 1.68 True
torch_layer_norm LN_B16_S4096_D4096 1.61 True
torch_layer_norm LN_B16_S4096_D8192 3.33 True
</pre></div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
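The PROFILE TRACE tables above are torch.profiler output; here is a minimal sketch that reproduces their shape for the LN_B16_S2048_D4096 workload. The record_function label yields the torch_layer_norm umbrella row, the eager op contributes the aten::layer_norm / aten::native_layer_norm rows, and the three timed calls explain the "# of Calls" = 3 entries:

import torch
import torch.nn.functional as F
from torch.profiler import ProfilerActivity, profile, record_function

x = torch.randn(16, 2048, 4096, device="cuda", dtype=torch.bfloat16)
w = torch.ones(4096, device="cuda", dtype=torch.bfloat16)
b = torch.zeros(4096, device="cuda", dtype=torch.bfloat16)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("torch_layer_norm"):
        for _ in range(3):  # three calls -> "# of Calls" = 3 on the op rows
            F.layer_norm(x, (4096,), w, b)
    torch.cuda.synchronize()  # shows up as the cudaDeviceSynchronize row

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))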
layer_norm/results/artifacts/combine/latency.svg
CHANGED
(Binary asset tracked with Git LFS; both revisions are shown only as "Git LFS Details" pointers, so no inline diff is available.)
layer_norm/results/combined_results.html
CHANGED
|
[layer_norm/results/combined_results.html, removed side: the previous SVG date stamp, the previous y-axis tick positions/labels and series point coordinates of the embedded latency plot (appearing twice, since the SVG is embedded twice in the page), the previous hf_kernels_layer_norm summary rows and torch_layer_norm LN_B16_S4096_D8192 row, and the previous "Installed 37 packages in ..." log line. All removed lines are truncated in this view; the updated content follows.]
layer_norm/results/combined_results.html, updated side:

<rdf:RDF>
<ns2:Work>
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:date>2025-10-31T20:13:56.885734</dc:date>
<dc:format>image/svg+xml</dc:format>
<dc:creator>
<ns2:Agent>
[…]
[matplotlib SVG body of the combined latency plot: y-axis grid and ticks at 1.0, 1.5, 2.0, 2.5, 3.0 (p50 ms), plus two four-point marked line series, series--torch-layer-norm and series--hf-kernels-layer-norm; the raw path coordinates and tick markup are omitted here.]
COMBINED BENCHMARK SUMMARY

impl wl p50(ms) ok
hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
hf_kernels_layer_norm LN_B16_S4096_D4096 1.66 True
hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True
torch_layer_norm LN_B16_S2048_D4096 0.82 True
torch_layer_norm LN_B16_S2048_D8192 1.68 True
torch_layer_norm LN_B16_S4096_D4096 1.61 True
torch_layer_norm LN_B16_S4096_D8192 3.33 True

GENERATING COMBINED VISUALIZATION
[…]
Implementations included:
[…]
<div class="uv-install-logs" id="uv-logs-combine">
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
<div class="uv-logs-content" style="display: none;">
Installed 37 packages in 216ms
</div>
</div>
<div class="cell-artifacts">
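The combine step reduces to reading each implementation's JSONL artifact, pulling the per-workload p50 latencies, and drawing one line per implementation into latency.svg. A minimal sketch: the field names mirror the layer_norm.jsonl records, and the input paths are placeholders for wherever the per-impl artifacts land.

import collections
import json

import matplotlib
matplotlib.use("Agg")  # headless rendering, as in a batch cell
import matplotlib.pyplot as plt

# One point per (impl, workload): x = workload name, y = p50 latency in ms.
series = collections.defaultdict(list)
for path in ["hf_kernels_layer_norm.jsonl", "torch_layer_norm.jsonl"]:  # placeholder paths
    with open(path) as f:
        for line in f:
            rec = json.loads(line)
            series[rec["impl"]].append((rec["wl"]["name"], rec["lat_ms"]["p50"]))

fig, ax = plt.subplots(figsize=(9, 5))
for impl, pts in sorted(series.items()):
    names, p50s = zip(*sorted(pts))
    ax.plot(names, p50s, marker="o", label=impl)
ax.set_ylabel("p50 latency (ms)")
ax.set_title("layer_norm: p50 latency per workload")
ax.legend()
fig.savefig("latency.svg")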
[Second embedded copy of the same combined latency SVG (identical 2025-10-31T20:13:56.885734 date stamp, tick labels, and series markup as above); omitted as a verbatim duplicate.]
openai_moe/impls/artifacts/benchmark/openai_moe.jsonl
ADDED
|
@@ -0,0 +1,8 @@
{"ts": "2025-10-31T20:01:48Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E2", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 149.85902099999748, "p50": 150.05062800003088, "p90": 150.2997029999733, "mean": 150.08009959999526, "iqr": 0.4259410000031494, "raw_times": [149.85902099999748, 150.3173840000045, 150.2997029999733, 149.87376199997016, 150.05062800003088], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 150.9511389999716, "peak_bytes": 416866816, "ok": true, "absmax": 2.765655517578125e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.765655517578125e-05, "mae": 2.0696452338597737e-06, "mse": 7.332408985538663e-12, "ref": "naive_moe"}, "err": null}
{"ts": "2025-10-31T20:02:12Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_E4", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 199.76808500001653, "p50": 200.257487999977, "p90": 201.3672960000008, "mean": 200.6008808000047, "iqr": 1.3947150000035435, "raw_times": [200.257487999977, 201.63895400003184, 201.3672960000008, 199.97258099999726, 199.76808500001653], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 200.2076969999962, "peak_bytes": 632035840, "ok": true, "absmax": 1.621246337890625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.621246337890625e-05, "mae": 9.61917862696282e-07, "mse": 1.59423277530657e-12, "ref": "naive_moe"}, "err": null}
{"ts": "2025-10-31T20:02:55Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E2", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 370.4508769999961, "p50": 372.7904090000038, "p90": 374.84007900002325, "mean": 372.8004498000132, "iqr": 3.7740770000027624, "raw_times": [374.84007900002325, 371.0660020000205, 370.4508769999961, 374.85488200002237, 372.7904090000038], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 371.103493000021, "peak_bytes": 643844608, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.0501920516835526e-06, "mse": 7.1848811622476916e-12, "ref": "naive_moe"}, "err": null}
{"ts": "2025-10-31T20:03:43Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S1024_E4", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 381.2919249999709, "p50": 382.6824700000202, "p90": 382.6975609999863, "mean": 382.48455139998896, "iqr": 0.3518089999943186, "raw_times": [382.345751999992, 381.2919249999709, 383.4050489999754, 382.6975609999863, 382.6824700000202], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 384.12325699999883, "peak_bytes": 823386112, "ok": true, "absmax": 1.3589859008789062e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.3589859008789062e-05, "mae": 9.400179123986163e-07, "mse": 1.5130355735665235e-12, "ref": "naive_moe"}, "err": null}
{"ts": "2025-10-31T20:05:12Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E2", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 735.1488859999336, "p50": 742.0204380000541, "p90": 746.9078719999516, "mean": 742.4016768000001, "iqr": 5.8942259998957525, "raw_times": [746.9175420000056, 746.9078719999516, 742.0204380000541, 735.1488859999336, 741.0136460000558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 715.4345070000545, "peak_bytes": 1036112384, "ok": true, "absmax": 3.2901763916015625e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 3.2901763916015625e-05, "mae": 2.0572656467265915e-06, "mse": 7.247809123700488e-12, "ref": "naive_moe"}, "err": null}
{"ts": "2025-10-31T20:06:54Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S512_E4", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 799.7175619999553, "p50": 801.8970370000034, "p90": 803.0568570000014, "mean": 801.7179149999947, "iqr": 2.358569999955762, "raw_times": [799.7175619999553, 800.6982870000456, 803.2198319999679, 803.0568570000014, 801.8970370000034], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 797.9236759999822, "peak_bytes": 1235263488, "ok": true, "absmax": 1.430511474609375e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.430511474609375e-05, "mae": 9.400343401466671e-07, "mse": 1.5107844445957919e-12, "ref": "naive_moe"}, "err": null}
{"ts": "2025-10-31T20:09:51Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E2", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1484.2085319999114, "p50": 1486.4837999999736, "p90": 1487.529773999995, "mean": 1488.3352192000075, "iqr": 2.3281069999256943, "raw_times": [1498.252323000088, 1486.4837999999736, 1484.2085319999114, 1485.2016670000694, 1487.529773999995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1502.5766269999394, "peak_bytes": 1861947904, "ok": true, "absmax": 2.6226043701171875e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 2.6226043701171875e-05, "mae": 2.060702854578267e-06, "mse": 7.262949790198814e-12, "ref": "naive_moe"}, "err": null}
{"ts": "2025-10-31T20:13:14Z", "run": "cee70b6f35064c71bc12a633683f7c01", "impl": "binned_torch", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_S1024_E4", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1520.7084719999102, "p50": 1524.500331000013, "p90": 1525.4868470000247, "mean": 1524.7435091999705, "iqr": 1.6920530000561484, "raw_times": [1529.2271019999362, 1524.500331000013, 1523.7947939999685, 1525.4868470000247, 1520.7084719999102], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1532.9394789999924, "peak_bytes": 2062163968, "ok": true, "absmax": 1.5974044799804688e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.5974044799804688e-05, "mae": 9.529014732834185e-07, "mse": 1.5621694476192216e-12, "ref": "naive_moe"}, "err": null}
|
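These records are plain JSONL, one benchmark run per line, so they can be inspected without the benchmark tooling. A minimal reading sketch (illustrative, not part of the upload; the path is the artifact location above):

import json, pathlib

# Print p50 latency and peak memory per workload from the records above.
path = pathlib.Path("openai_moe/impls/artifacts/benchmark/openai_moe.jsonl")
for line in path.read_text().splitlines():
    rec = json.loads(line)
    print(f'{rec["wl"]["name"]}: p50={rec["lat_ms"]["p50"]:.1f} ms, '
          f'peak={rec["peak_bytes"] / 2**20:.0f} MiB')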
openai_moe/impls/binned_torch.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
openai_moe/impls/cells/benchmark.py
ADDED
|
@@ -0,0 +1,136 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark


def binned_gather(x, indices, bins, expert_capacity, top_k):
    E, H = bins.shape[0], x.shape[1]
    out = torch.zeros((E, expert_capacity, H), device=x.device, dtype=x.dtype)
    for e in range(E):
        start = 0 if e == 0 else bins[e - 1]
        end = bins[e]
        n = min(end - start, expert_capacity)
        for i in range(n):
            flat_pos = indices[start + i]
            tok = flat_pos // top_k
            out[e, i] = x[tok]
    return out


def binned_scatter(x, indices, weights, bins, expert_capacity, top_k):
    E, C, H = x.shape
    N = indices.shape[0] // top_k
    out = torch.zeros((N, top_k, H), dtype=x.dtype, device=x.device)
    for e in range(E):
        start = 0 if e == 0 else bins[e - 1]
        end = bins[e]
        n = end - start
        if n == 0:
            continue
        take = min(n, expert_capacity)
        for i in range(take):
            flat_pos = indices[start + i]  # flattened (token, slot)
            tok = flat_pos // top_k
            slot = flat_pos % top_k
            scale = weights[flat_pos] if weights is not None else 1.0
            out[tok, slot] = x[e, i] * scale
    return out.sum(dim=1)


def sort_tokens_by_expert(router_indices, num_experts):
    flat_indices = router_indices.flatten()
    sorted_values, sorted_indices = torch.sort(flat_indices)
    tokens_per_expert = torch.bincount(sorted_values, minlength=num_experts)
    bins = torch.cumsum(tokens_per_expert, dim=0)
    return sorted_indices, sorted_values, bins, tokens_per_expert


def binned_experts_ref(
    hidden_states,
    router_indices,
    routing_weights,
    gate_up_proj,
    gate_up_proj_bias,
    down_proj,
    down_proj_bias,
    expert_capacity,
):
    B, S, H = hidden_states.shape
    E, K = routing_weights.shape[2], router_indices.shape[1]

    indices, _, bins, _ = sort_tokens_by_expert(router_indices, E)
    x = binned_gather(hidden_states.view(-1, H), indices, bins, expert_capacity, K)

    gate_up = torch.bmm(x, gate_up_proj) + gate_up_proj_bias[..., None, :]
    gate, up = gate_up[..., ::2], gate_up[..., 1::2]

    # clamp to limit
    limit = 7.0
    gate = gate.clamp(min=None, max=limit)
    up = up.clamp(min=-limit, max=limit)

    glu = gate * torch.sigmoid(gate * 1.702)
    x = (up + 1) * glu
    x = torch.bmm(x, down_proj) + down_proj_bias[..., None, :]

    # build routing weights aligned to (token, slot)
    flat_dense = routing_weights.view(-1, E)  # [B*S, E]
    flat_router = router_indices.view(-1, K)  # [B*S, K]
    selected = torch.gather(flat_dense, 1, flat_router).reshape(-1)  # [B*S*K]

    # scatter back
    y = binned_scatter(x, indices, selected, bins, expert_capacity, K)  # [B*S, H]

    return y.view(B, S, H)


def binned_torch_openai_moe(
    hidden_states,
    router_indices,
    routing_weights,
    gate_up_proj,
    gate_up_proj_bias,
    down_proj,
    down_proj_bias,
):
    """
    Binned PyTorch implementation of OpenAI-style MoE.
    Sorts tokens by expert assignment for more efficient batched processing.
    """
    B, S = hidden_states.shape[0], hidden_states.shape[1]
    K = router_indices.shape[1]

    # Set expert_capacity to a reasonable value (max tokens per expert)
    # Use 2x the average to handle imbalance
    expert_capacity = (B * S * K * 2) // routing_weights.shape[2]

    return binned_experts_ref(
        hidden_states,
        router_indices,
        routing_weights,
        gate_up_proj,
        gate_up_proj_bias,
        down_proj,
        down_proj_bias,
        expert_capacity,
    )


run_benchmark(
    kernel_type=KernelTypeEnum.OPENAI_MOE,
    impl_name="binned_torch",
    impl_tags={"family": "pytorch", "backend": "eager"},
    impl_func=binned_torch_openai_moe,
    dtype="float32",
)
|
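A quick sanity check of the gather/scatter pair above (a sketch, not part of the benchmark cell; it assumes binned_gather, binned_scatter, and sort_tokens_by_expert from this file are in scope): with top_k=1, unit routing weights, and sufficient expert capacity, scattering what was gathered must reproduce the input tokens exactly.

import torch

# Round-trip check: gather tokens into per-expert bins, then scatter them back.
N, H, E, K = 8, 4, 2, 1                       # tokens, hidden dim, experts, top_k
x = torch.randn(N, H)
router_indices = torch.randint(0, E, (N, K))  # one expert per token
indices, _, bins, _ = sort_tokens_by_expert(router_indices, E)
gathered = binned_gather(x, indices, bins, expert_capacity=N, top_k=K)
ones = torch.ones(N * K)                      # unit routing weights
y = binned_scatter(gathered, indices, ones, bins, expert_capacity=N, top_k=K)
assert torch.equal(y, x)                      # every token survives the round trip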
openai_moe/impls/cells/nv.py
ADDED
|
@@ -0,0 +1,2 @@
import subprocess
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
|
openai_moe/impls/gpt_oss_moe.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
openai_moe/impls/index.html
ADDED
|
@@ -0,0 +1,89 @@
<!DOCTYPE html>
<html>
<head>
    <meta charset='UTF-8'>
    <meta name='viewport' content='width=device-width, initial-scale=1.0'>
    <title>Index of /openai_moe/impls</title>
    <style>
        :root {
            --bg-primary: #0a0a0a;
            --bg-secondary: #121212;
            --bg-tertiary: #181818;
            --text-primary: #e0e0e0;
            --text-secondary: #888888;
            --text-link: #64b5f6;
            --border-primary: #2a2a2a;
        }
        body {
            font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
            background: var(--bg-primary);
            color: var(--text-primary);
            margin: 0;
            padding: 16px;
            max-width: 900px;
            margin: 0 auto;
        }
        .controls {
            display: flex;
            justify-content: flex-end;
            margin-bottom: 1rem;
        }
        .back-button {
            background: var(--bg-secondary);
            border: 1px solid var(--border-primary);
            padding: 8px 12px;
            border-radius: 4px;
            color: var(--text-secondary);
            cursor: pointer;
            font-size: 0.9rem;
            text-decoration: none;
            display: inline-block;
        }
        .back-button:hover {
            color: var(--text-primary);
            background: var(--bg-tertiary);
        }
        h1 {
            font-size: 1.5em;
            margin: 1rem 0;
            color: var(--text-primary);
            border-bottom: 1px solid var(--border-primary);
            padding-bottom: 0.5rem;
        }
        ul {
            list-style-type: none;
            padding: 0;
        }
        li {
            margin: 0;
            border-bottom: 1px solid var(--border-primary);
        }
        li:last-child {
            border-bottom: none;
        }
        a {
            display: block;
            padding: 0.75rem 0.5rem;
            text-decoration: none;
            color: var(--text-link);
            transition: background 0.2s ease;
        }
        a:hover {
            background: var(--bg-secondary);
        }
        .dir {
            font-weight: 500;
        }
    </style>
</head>
<body>
    <div class='controls'>
        <a href='../index.html' class='back-button'>← back</a>
    </div>
    <h1>Index of /openai_moe/impls</h1>
    <ul>
        <li><a href='binned_torch.html' class='file'>binned_torch.html</a></li>
        <li><a href='gpt_oss_moe.html' class='file'>gpt_oss_moe.html</a></li>
    </ul>
</body>
</html>
|
openai_moe/index.html
ADDED
|
@@ -0,0 +1,89 @@
(Same page skeleton and stylesheet as openai_moe/impls/index.html above; only the title, heading, and links differ.)
<title>Index of /openai_moe</title>
<h1>Index of /openai_moe</h1>
<ul>
    <li><a href='impls/index.html' class='dir'>impls/</a></li>
    <li><a href='results/index.html' class='dir'>results/</a></li>
</ul>
|
openai_moe/results/artifacts/combine/latency.svg
ADDED
|
|
Git LFS Details
|
openai_moe/results/cells/combine.py
ADDED
|
@@ -0,0 +1,27 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels-benchmark-tools",
#     "matplotlib",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
from kernels_benchmark_tools.core.visuals import generate_combined_results

# Map display names to uvnote environment variables
cache_env_map = {
    # "PyTorch OpenAI MoE": "UVNOTE_FILE_TORCH_OPENAI_MOE_BENCHMARK",
    "Binned PyTorch": "UVNOTE_FILE_BINNED_TORCH_BENCHMARK",
    "GptOssExperts": "UVNOTE_FILE_GPT_OSS_MOE_BENCHMARK",
}

# Generate combined results with visualization
generate_combined_results(
    cache_env_map=cache_env_map,
    output_filename="openai_moe.jsonl",
    svg_filename="latency.svg"
)
|
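Conceptually, the combine step concatenates each implementation's cached JSONL records into one file before plotting. A rough standalone sketch of that idea (hypothetical: it assumes each UVNOTE_FILE_* variable points at the implementation's JSONL artifact; the real logic, including the SVG rendering, lives in generate_combined_results):

import json, os, pathlib

# Concatenate cached per-implementation records into a single JSONL file.
records = []
for env_var in ("UVNOTE_FILE_BINNED_TORCH_BENCHMARK", "UVNOTE_FILE_GPT_OSS_MOE_BENCHMARK"):
    path = os.environ.get(env_var)
    if not path:
        continue  # cell not cached/run yet
    for line in pathlib.Path(path).read_text().splitlines():
        records.append(json.loads(line))
pathlib.Path("openai_moe.jsonl").write_text(
    "\n".join(json.dumps(r) for r in records) + "\n"
)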
openai_moe/results/combined_results.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
openai_moe/results/index.html
ADDED
|
@@ -0,0 +1,88 @@
(Same page skeleton and stylesheet as openai_moe/impls/index.html above; only the title, heading, and link differ.)
<title>Index of /openai_moe/results</title>
<h1>Index of /openai_moe/results</h1>
<ul>
    <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
</ul>
|