Upload folder using huggingface_hub

Browse files

- activation/impls/artifacts/benchmark/activation.jsonl +9 -9
- activation/impls/cells/benchmark.py +13 -7
- activation/impls/hf_kernels_swiglu.html +101 -162
- activation/impls/torch_swiglu.html +127 -185
- activation/results/artifacts/combine/latency.svg +2 -2
- activation/results/combined_results.html +98 -144
- flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
- flash_attn/impls/cells/benchmark.py +10 -9
- flash_attn/impls/flash_attention.html +147 -211
- flash_attn/impls/hf_kernels_flash_attn.html +117 -118
- flash_attn/impls/hf_kernels_flash_attn3.html +85 -138
- flash_attn/impls/mem_efficient_attention.html +135 -187
- flash_attn/impls/sage_attention.html +12 -60
- flash_attn/impls/xformers.html +93 -139
- flash_attn/results/artifacts/combine/latency.svg +2 -2
- flash_attn/results/combined_results.html +293 -313
- layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +0 -0
- layer_norm/impls/cells/benchmark.py +28 -5
- layer_norm/impls/hf_kernels_layer_norm.html +0 -0
- layer_norm/impls/torch_layer_norm.html +0 -0
- layer_norm/results/artifacts/combine/latency.svg +1 -1
- layer_norm/results/combined_results.html +71 -117
activation/impls/artifacts/benchmark/activation.jsonl
CHANGED

@@ -1,9 +1,9 @@
-{"ts": "2025-10-…
-{"ts": "2025-10-…
-{"ts": "2025-10-…
-{"ts": "2025-10-…
-{"ts": "2025-10-…
-{"ts": "2025-10-…
-{"ts": "2025-10-…
-{"ts": "2025-10-…
-{"ts": "2025-10-…
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024320999955307343, "p50": 0.025090999997701147, "p90": 0.02569000002949906, "mean": 0.026606800020090304, "iqr": 0.0010690000635804608, "raw_times": [0.03331100015202537, 0.025090999997701147, 0.024320999955307343, 0.02569000002949906, 0.0246209999659186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03336100007800269, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028640999971685233, "p50": 0.02958999994007172, "p90": 0.030561000130546745, "mean": 0.02986059994327661, "iqr": 0.0012610003068402875, "raw_times": [0.029299999823706457, 0.028640999971685233, 0.02958999994007172, 0.0312109998503729, 0.030561000130546745], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03354099999341997, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02880000010918593, "p50": 0.030331000061778468, "p90": 0.030401000003621448, "mean": 0.030208600037440192, "iqr": 0.0004209998678561533, "raw_times": [0.02880000010918593, 0.03153099987684982, 0.029980000135765295, 0.030331000061778468, 0.030401000003621448], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03317000005154114, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02921000009337149, "p50": 0.0294310000299447, "p90": 0.029789999871354667, "mean": 0.029938399984530406, "iqr": 0.0004489997991186101, "raw_times": [0.0294310000299447, 0.02921000009337149, 0.03191999985574512, 0.029341000072236056, 0.029789999871354667], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03343000003042107, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.029799999992974335, "p50": 0.031021000040709623, "p90": 0.031239999998433632, "mean": 0.03210639997632825, "iqr": 0.0009289999525208259, "raw_times": [0.038159999803610845, 0.031021000040709623, 0.029799999992974335, 0.030311000045912806, 0.031239999998433632], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03207100007784902, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0278800000614865, "p50": 0.028550999786602915, "p90": 0.029250000125102815, "mean": 0.02903839999817137, "iqr": 0.0010100000054080738, "raw_times": [0.0278800000614865, 0.02824000011969474, 0.028550999786602915, 0.029250000125102815, 0.03127099989796989], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03262000018366962, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02945000005638576, "p50": 0.029881000045861583, "p90": 0.03017099993485317, "mean": 0.03019639998456114, "iqr": 0.0005509998572961194, "raw_times": [0.029881000045861583, 0.03185999980814813, 0.03017099993485317, 0.02945000005638576, 0.02962000007755705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031610000178261544, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02807000009852345, "p50": 0.028989999918849207, "p90": 0.02929000015683414, "mean": 0.028920200020365883, "iqr": 0.0003590002961573191, "raw_times": [0.029320000066945795, 0.02929000015683414, 0.02807000009852345, 0.028989999918849207, 0.02893099986067682], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.033219999977518455, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
+{"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.029301000040504732, "p50": 0.03090099994551565, "p90": 0.03149000008306757, "mean": 0.03127060003862425, "iqr": 0.0014889999420120148, "raw_times": [0.029301000040504732, 0.030001000141055556, 0.03149000008306757, 0.03465999998297775, 0.03090099994551565], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03197000000909611, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
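Each line of activation.jsonl is one self-contained benchmark record: workload shape under "wl", environment under "env", latency percentiles under "lat_ms", plus "compile_ms", "peak_bytes", and a correctness block under "corr". A minimal sketch for rebuilding the impl / wl / p50(ms) / ok summary that the report prints from this artifact (field names are taken from the records above; the path assumes the repo root):

import json

# Rebuild the "impl  wl  p50(ms)  ok" summary from the committed artifact.
with open("activation/impls/artifacts/benchmark/activation.jsonl") as f:
    records = [json.loads(line) for line in f if line.strip()]

for rec in records:
    # lat_ms values are milliseconds, so p50 prints directly as p50(ms).
    print(f"{rec['impl']:<20} {rec['wl']['name']:<16} "
          f"{rec['lat_ms']['p50']:.2f} {rec['ok']}")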
activation/impls/cells/benchmark.py
CHANGED

@@ -4,6 +4,7 @@
 # "numpy",
 # "torch==2.8.0",
 # "kernels-benchmark-tools",
+# "kernels",
 # ]
 #
 # [tool.uv.sources]
@@ -12,17 +13,22 @@
 import torch
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-…
+from kernels import get_kernel

+# Load the activation kernel
+activation = get_kernel("kernels-community/activation")

-…
-…
-…
+
+def hf_kernels_swiglu(input_tensor):
+    hidden_dim = input_tensor.shape[-1] // 2
+    out_shape = input_tensor.shape[:-1] + (hidden_dim,)
+    out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
+    return activation.silu_and_mul(out, input_tensor)


 run_benchmark(
     kernel_type=KernelTypeEnum.ACTIVATION,
-    impl_name="…
-    impl_tags={"family":"hf-kernels", "backend":"…
-    impl_func=…
+    impl_name="hf_kernels_swiglu",
+    impl_tags={"family": "hf-kernels", "backend": "cuda"},
+    impl_func=hf_kernels_swiglu,
 )
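The wrapper above allocates the output buffer and hands it to the kernel's silu_and_mul, which splits the last dimension of the input in half and computes silu(gate) * up elementwise. A minimal eager-PyTorch sketch of the same computation, shown only as a readable reference; the gate-first layout is an assumption based on the vLLM-style act_and_mul kernel visible in the profile traces below:

import torch
import torch.nn.functional as F

def swiglu_reference(input_tensor: torch.Tensor) -> torch.Tensor:
    # Split the concatenated projection into gate and up halves,
    # then gate the up half with SiLU (assumed gate-first layout).
    d = input_tensor.shape[-1] // 2
    gate, up = input_tensor[..., :d], input_tensor[..., d:]
    return F.silu(gate) * up

# Matches the cuda_T128_D768 workload: num_tokens=128, hidden_dim=768.
x = torch.randn(128, 2 * 768, dtype=torch.bfloat16)
print(swiglu_reference(x).shape)  # torch.Size([128, 768])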
activation/impls/hf_kernels_swiglu.html
CHANGED

@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
-Linux x86_64 | Linux-5.…
+Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
 </div>
 </div>

@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: nv | 4.02s
+Cell: nv | 0.26s
 | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>

@@ -3887,34 +3887,22 @@ Cell: nv | 4.02s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">…
+<div class="cell-stdout"><pre class="stdout-text">Mon Oct 27 14:46:00 2025
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI …
+| NVIDIA-SMI 570.195.03              Driver Version: 570.195.03      CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
-|   0  NVIDIA …
-| N/A …
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   1  NVIDIA L4                      Off |   00000000:3A:00.0 Off |                    0 |
-| N/A   32C    P0             28W /   72W |       1MiB /  23034MiB |      2%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   2  NVIDIA L4                      Off |   00000000:3C:00.0 Off |                    0 |
-| N/A   34C    P0             27W /   72W |       1MiB /  23034MiB |      1%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   3  NVIDIA L4                      Off |   00000000:3E:00.0 Off |                    0 |
-| N/A   32C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
+|   0  NVIDIA L40S                     On |   00000000:4D:00.0 Off |                    0 |
+| N/A   32C    P0            153W /  350W |       0MiB /  46068MiB |     75%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+

 +-----------------------------------------------------------------------------------------+
 | Processes:                                                                              |
-|  GPU   GI   CI …
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
 |        ID   ID                                                                    Usage |
 |=========================================================================================|
 |  No running processes found                                                             |

@@ -3932,7 +3920,7 @@ Cell: nv | 4.02s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | …
+Cell: benchmark | 4.32s
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3988,17 +3976,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us …
-hf_kernels_swiglu …
-_activation_beeaae6::silu_and_mul 1.…
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request 82.…
-aten::empty 2.…
-cudaLaunchKernel 2.…
-cudaDeviceSynchronize 0.…
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 80.128us 1940.62% 80.128us 80.128us 1
+hf_kernels_swiglu 11.19% 199.383us 99.56% 1.774ms 1.774ms 0.000us 0.00% 5.634us 5.634us 1
+_activation_beeaae6::silu_and_mul 1.10% 19.601us 85.64% 1.526ms 508.618us 4.129us 100.00% 5.634us 1.878us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.129us 100.00% 4.129us 1.376us 3
+Activity Buffer Request 82.30% 1.466ms 82.30% 1.466ms 1.466ms 1.505us 36.45% 1.505us 1.505us 1
+aten::empty 2.73% 48.641us 2.73% 48.641us 16.214us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 2.24% 39.931us 2.24% 39.931us 13.310us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.44% 7.891us 0.44% 7.891us 7.891us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.…
-Self CUDA time total: …
+Self CPU time total: 1.782ms
+Self CUDA time total: 4.129us

@@ -4008,17 +3996,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us …
-hf_kernels_swiglu …
-_activation_beeaae6::silu_and_mul 1.…
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request …
-aten::empty 1.24% 20.…
-cudaLaunchKernel 1.…
-cudaDeviceSynchronize 0.…
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 77.823us 1961.76% 77.823us 77.823us 1
+hf_kernels_swiglu 7.28% 119.722us 99.70% 1.640ms 1.640ms 0.000us 0.00% 5.311us 5.311us 1
+_activation_beeaae6::silu_and_mul 1.57% 25.841us 91.18% 1.500ms 499.858us 3.967us 100.00% 5.311us 1.770us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.967us 100.00% 3.967us 1.322us 3
+Activity Buffer Request 87.74% 1.443ms 87.74% 1.443ms 1.443ms 1.344us 33.88% 1.344us 1.344us 1
+aten::empty 1.24% 20.410us 1.24% 20.410us 6.803us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 1.86% 30.650us 1.86% 30.650us 10.217us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.30% 4.930us 0.30% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.…
-Self CUDA time total: …
+Self CPU time total: 1.645ms
+Self CUDA time total: 3.967us

@@ -4028,17 +4016,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.…
-hf_kernels_swiglu …
-_activation_beeaae6::silu_and_mul 1.…
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request …
-aten::empty 1.…
-cudaLaunchKernel 1.…
-cudaDeviceSynchronize 0.…
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.487us 1369.46% 67.487us 67.487us 1
+hf_kernels_swiglu 6.70% 107.400us 99.69% 1.598ms 1.598ms 0.000us 0.00% 6.592us 6.592us 1
+_activation_beeaae6::silu_and_mul 1.32% 21.191us 91.79% 1.471ms 490.438us 4.928us 100.00% 6.592us 2.197us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.928us 100.00% 4.928us 1.643us 3
+Activity Buffer Request 88.89% 1.425ms 88.89% 1.425ms 1.425ms 1.664us 33.77% 1.664us 1.664us 1
+aten::empty 1.20% 19.281us 1.20% 19.281us 6.427us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 1.57% 25.210us 1.57% 25.210us 8.403us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.31% 4.970us 0.31% 4.970us 4.970us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.…
-Self CUDA time total: …
+Self CPU time total: 1.603ms
+Self CUDA time total: 4.928us

@@ -4048,17 +4036,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us …
-hf_kernels_swiglu …
-_activation_beeaae6::silu_and_mul 1.…
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request …
-aten::empty 1.…
-cudaLaunchKernel …
-cudaDeviceSynchronize 0.…
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 75.265us 1768.03% 75.265us 75.265us 1
+hf_kernels_swiglu 6.51% 118.032us 99.70% 1.807ms 1.807ms 0.000us 0.00% 5.697us 5.697us 1
+_activation_beeaae6::silu_and_mul 1.22% 22.071us 92.05% 1.668ms 556.119us 4.257us 100.00% 5.697us 1.899us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.257us 100.00% 4.257us 1.419us 3
+Activity Buffer Request 79.39% 1.439ms 79.39% 1.439ms 1.439ms 1.440us 33.83% 1.440us 1.440us 1
+aten::empty 1.14% 20.640us 1.14% 20.640us 6.880us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 11.45% 207.513us 11.45% 207.513us 69.171us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.30% 5.350us 0.30% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.…
-Self CUDA time total: …
+Self CPU time total: 1.812ms
+Self CUDA time total: 4.257us

@@ -4068,17 +4056,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us …
-hf_kernels_swiglu …
-_activation_beeaae6::silu_and_mul …
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request …
-aten::empty …
-cudaLaunchKernel …
-cudaDeviceSynchronize …
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.471us 1111.94% 65.471us 65.471us 1
+hf_kernels_swiglu 19.52% 89.390us 98.84% 452.537us 452.537us 0.000us 0.00% 7.872us 7.872us 1
+_activation_beeaae6::silu_and_mul 5.02% 23.003us 75.04% 343.547us 114.516us 5.888us 100.00% 7.872us 2.624us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 100.00% 5.888us 1.963us 3
+Activity Buffer Request 33.89% 155.152us 33.89% 155.152us 155.152us 1.984us 33.70% 1.984us 1.984us 1
+aten::empty 4.28% 19.600us 4.28% 19.600us 6.533us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 36.13% 165.392us 36.13% 165.392us 55.131us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 1.16% 5.290us 1.16% 5.290us 5.290us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: …
-Self CUDA time total: …
+Self CPU time total: 457.827us
+Self CUDA time total: 5.888us

@@ -4088,17 +4076,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.…
-hf_kernels_swiglu …
-_activation_beeaae6::silu_and_mul …
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request …
-aten::empty …
-cudaLaunchKernel …
-cudaDeviceSynchronize 0.…
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.383us 879.52% 68.383us 68.383us 1
+hf_kernels_swiglu 6.83% 118.711us 99.72% 1.734ms 1.734ms 0.000us 0.00% 10.367us 10.367us 1
+_activation_beeaae6::silu_and_mul 1.25% 21.741us 91.78% 1.596ms 531.855us 7.775us 100.00% 10.367us 3.456us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 100.00% 7.775us 2.592us 3
+Activity Buffer Request 81.74% 1.421ms 81.74% 1.421ms 1.421ms 2.592us 33.34% 2.592us 2.592us 1
+aten::empty 1.11% 19.311us 1.11% 19.311us 6.437us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 8.79% 152.752us 8.79% 152.752us 50.917us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.28% 4.930us 0.28% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: …
-Self CUDA time total: …
+Self CPU time total: 1.739ms
+Self CUDA time total: 7.775us

@@ -4108,17 +4096,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us …
-hf_kernels_swiglu …
-_activation_beeaae6::silu_and_mul …
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request …
-aten::empty …
-cudaLaunchKernel …
-cudaDeviceSynchronize 0.…
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.527us 1069.89% 70.527us 70.527us 1
+hf_kernels_swiglu 6.20% 108.691us 99.73% 1.749ms 1.749ms 0.000us 0.00% 8.800us 8.800us 1
+_activation_beeaae6::silu_and_mul 1.29% 22.622us 92.35% 1.619ms 539.785us 6.592us 100.00% 8.800us 2.933us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 100.00% 6.592us 2.197us 3
+Activity Buffer Request 82.48% 1.446ms 82.48% 1.446ms 1.446ms 2.208us 33.50% 2.208us 2.208us 1
+aten::empty 1.18% 20.650us 1.18% 20.650us 6.883us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 8.58% 150.492us 8.58% 150.492us 50.164us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.27% 4.780us 0.27% 4.780us 4.780us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: …
-Self CUDA time total: …
+Self CPU time total: 1.753ms
+Self CUDA time total: 6.592us

@@ -4128,17 +4116,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us …
-hf_kernels_swiglu …
-_activation_beeaae6::silu_and_mul …
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request …
-aten::empty …
-cudaLaunchKernel …
-cudaDeviceSynchronize …
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.591us 703.03% 66.591us 66.591us 1
+hf_kernels_swiglu 22.91% 88.512us 98.75% 381.506us 381.506us 0.000us 0.00% 12.640us 12.640us 1
+_activation_beeaae6::silu_and_mul 5.22% 20.151us 70.42% 272.064us 90.688us 9.472us 100.00% 12.640us 4.213us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.472us 100.00% 9.472us 3.157us 3
+Activity Buffer Request 26.21% 101.241us 26.21% 101.241us 101.241us 3.168us 33.45% 3.168us 3.168us 1
+aten::empty 5.42% 20.930us 5.42% 20.930us 6.977us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 39.00% 150.672us 39.00% 150.672us 50.224us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 1.25% 4.820us 1.25% 4.820us 4.820us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: …
-Self CUDA time total: …
+Self CPU time total: 386.326us
+Self CUDA time total: 9.472us

@@ -4148,17 +4136,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us …
-hf_kernels_swiglu …
-_activation_beeaae6::silu_and_mul …
-void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request …
-aten::empty …
-cudaLaunchKernel …
-cudaDeviceSynchronize …
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.295us 514.21% 67.295us 67.295us 1
+hf_kernels_swiglu 24.05% 101.492us 98.90% 417.266us 417.266us 0.000us 0.00% 17.503us 17.503us 1
+_activation_beeaae6::silu_and_mul 5.33% 22.480us 70.08% 295.684us 98.561us 13.087us 100.00% 17.503us 5.834us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.087us 100.00% 13.087us 4.362us 3
+Activity Buffer Request 28.92% 122.012us 28.92% 122.012us 122.012us 4.416us 33.74% 4.416us 4.416us 1
+aten::empty 4.76% 20.090us 4.76% 20.090us 6.697us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 35.83% 151.192us 35.83% 151.192us 50.397us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 1.10% 4.660us 1.10% 4.660us 4.660us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: …
-Self CUDA time total: …
+Self CPU time total: 421.926us
+Self CUDA time total: 13.087us

 impl wl p50(ms) ok

@@ -4175,61 +4163,12 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-…
-Downloading numpy (15.9MiB)
-Downloading setuptools (1.1MiB)
-Downloading sympy (6.0MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading triton (148.4MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading pillow (6.7MiB)
-Downloading networkx (1.9MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading hf-xet (3.2MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading torch (846.8MiB)
-Downloading nvidia-cufile-cu12
-Downloading kiwisolver
-Downloading hf-xet
-Downloading setuptools
-Downloading fonttools
-Downloading networkx
-Downloading pillow
-Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading nvidia-cuda-cupti-cu12
-Downloading matplotlib
-Downloading numpy
-Downloading nvidia-nvjitlink-cu12
-Downloading sympy
-Downloading nvidia-curand-cu12
-Downloading nvidia-cuda-nvrtc-cu12
-Downloading triton
-Downloading nvidia-cufft-cu12
-Downloading nvidia-cusolver-cu12
-Downloading nvidia-cusparse-cu12
-Downloading nvidia-cusparselt-cu12
-Downloading nvidia-nccl-cu12
-Downloading nvidia-cublas-cu12
-Downloading nvidia-cudnn-cu12
-Downloading torch
-Installed 47 packages in 234ms
+Installed 15 packages in 15ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
-Fetching 7 files: …
-Fetching 7 files: …
-Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 12.20it/s]</div>
+Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 13.68it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 19.14it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
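The PROFILE TRACE tables in these reports are standard torch.profiler key-averages summaries (the column set matches its table() output). A minimal sketch of how such a trace can be captured around one benchmarked call; the label string and sort key here are illustrative choices, not taken from the harness:

import torch
from torch.profiler import profile, record_function, ProfilerActivity

def profile_once(fn, *args):
    # Record CPU and CUDA activity around a single labeled invocation.
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        with record_function("hf_kernels_swiglu"):
            fn(*args)
        torch.cuda.synchronize()
    # Print a table like the ones above, heaviest CUDA ops first.
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))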
activation/impls/torch_swiglu.html
CHANGED

(Only the old side of this diff survives in the page extract; the new lines fall outside this section.)

@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
-Linux x86_64 | Linux-5.…
 </div>
 </div>

@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: nv | 4.02s
 | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>

@@ -3887,34 +3887,22 @@ Cell: nv | 4.02s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">…
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI …
 |-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
-|   0  NVIDIA …
-| N/A …
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   1  NVIDIA L4                      Off |   00000000:3A:00.0 Off |                    0 |
-| N/A   33C    P0             28W /   72W |       1MiB /  23034MiB |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   2  NVIDIA L4                      Off |   00000000:3C:00.0 Off |                    0 |
-| N/A   34C    P0             27W /   72W |       1MiB /  23034MiB |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-|   3  NVIDIA L4                      Off |   00000000:3E:00.0 Off |                    0 |
-| N/A   33C    P0             27W /   72W |       1MiB /  23034MiB |      2%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+

 +-----------------------------------------------------------------------------------------+
 | Processes:                                                                              |
-|  GPU   GI   CI …
 |        ID   ID                                                                    Usage |
 |=========================================================================================|
 |  No running processes found                                                             |

@@ -3932,7 +3920,7 @@ Cell: nv | 4.02s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | …
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3982,20 +3970,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us …
-torch_eager 11.…
-aten::silu 3.…
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us …
-aten::mul …
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request …
-aten::slice …
-aten::as_strided 0.…
-cudaLaunchKernel …
-cudaDeviceSynchronize 0.…
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: …
-Self CUDA time total: …

@@ -4005,20 +3993,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us …
-torch_eager 6.…
-aten::silu 2.…
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us …
-aten::mul 1.…
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request …
-aten::slice 1.…
-aten::as_strided 0.36% 6.…
-cudaLaunchKernel 2.…
-cudaDeviceSynchronize 0.31% 5.…
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.…
-Self CUDA time total: …

@@ -4028,20 +4016,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us …
-torch_eager 6.…
-aten::silu 2.…
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us …
-aten::mul 1.…
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request 85.…
-aten::slice 1.…
-aten::as_strided 0.…
-cudaLaunchKernel 2.…
-cudaDeviceSynchronize 0.…
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.…
-Self CUDA time total: …

@@ -4051,20 +4039,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us …
-torch_eager 7.…
-aten::silu …
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us …
-aten::mul 1.…
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request …
-aten::slice 1.…
-aten::as_strided 0.…
-cudaLaunchKernel …
-cudaDeviceSynchronize 0.…
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.…
-Self CUDA time total: …

@@ -4074,20 +4062,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us …
-torch_eager 5.…
-aten::silu 2.…
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us …
-aten::mul 1.…
-void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us …
-Activity Buffer Request 78.…
-aten::slice 1.…
-aten::as_strided 0.…
-cudaLaunchKernel 10.…
-cudaDeviceSynchronize 0.…
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total: 1.…
-Self CUDA time total: …

@@ -4097,20 +4085,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4099 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4100 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4101 |
-
torch_eager
|
| 4102 |
-
aten::silu
|
| 4103 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4104 |
-
aten::mul
|
| 4105 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4106 |
-
Activity Buffer Request
|
| 4107 |
-
aten::slice
|
| 4108 |
-
aten::as_strided 1.
|
| 4109 |
-
cudaLaunchKernel
|
| 4110 |
-
cudaDeviceSynchronize
|
| 4111 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4112 |
-
Self CPU time total:
|
| 4113 |
-
Self CUDA time total:
|
| 4114 |
|
| 4115 |
|
| 4116 |
|
|
@@ -4120,20 +4108,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
|
|
| 4120 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4121 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4122 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4123 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4124 |
-
torch_eager 5.
|
| 4125 |
-
aten::silu 2.
|
| 4126 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4127 |
-
aten::mul 1.
|
| 4128 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4129 |
-
Activity Buffer Request
|
| 4130 |
-
aten::slice 1.
|
| 4131 |
-
aten::as_strided 0.
|
| 4132 |
-
cudaLaunchKernel 9.
|
| 4133 |
-
cudaDeviceSynchronize 0.
|
| 4134 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4135 |
-
Self CPU time total: 1.
|
| 4136 |
-
Self CUDA time total:
|
| 4137 |
|
| 4138 |
|
| 4139 |
|
|
@@ -4143,20 +4131,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
|
|
| 4143 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4144 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4145 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4146 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4147 |
-
torch_eager
|
| 4148 |
-
aten::silu
|
| 4149 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4150 |
-
aten::mul
|
| 4151 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4152 |
-
Activity Buffer Request
|
| 4153 |
-
aten::slice
|
| 4154 |
-
aten::as_strided
|
| 4155 |
-
cudaLaunchKernel
|
| 4156 |
-
cudaDeviceSynchronize
|
| 4157 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4158 |
-
Self CPU time total:
|
| 4159 |
-
Self CUDA time total:
|
| 4160 |
|
| 4161 |
|
| 4162 |
|
|
@@ -4166,26 +4154,26 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
|
|
| 4166 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4167 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4168 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4169 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4170 |
-
torch_eager 5.
|
| 4171 |
-
aten::silu 2.
|
| 4172 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4173 |
-
aten::mul 1.
|
| 4174 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4175 |
-
Activity Buffer Request 79.
|
| 4176 |
-
aten::slice 1.
|
| 4177 |
-
aten::as_strided 0.
|
| 4178 |
-
cudaLaunchKernel 9.
|
| 4179 |
-
cudaDeviceSynchronize 0.
|
| 4180 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4181 |
-
Self CPU time total: 1.
|
| 4182 |
-
Self CUDA time total:
|
| 4183 |
|
| 4184 |
|
| 4185 |
 impl         wl               p50(ms)  ok
 torch_eager  cuda_T128_D1024  0.05     True
 torch_eager  cuda_T128_D2048  0.05     True
-torch_eager  cuda_T128_D768   0.       (old value truncated in this diff view)
 torch_eager  cuda_T256_D1024  0.05     True
 torch_eager  cuda_T256_D2048  0.05     True
 torch_eager  cuda_T256_D768   0.05     True
@@ -4196,53 +4184,7 @@ torch_eager cuda_T512_D768 0.05 True

 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-(verbose download log removed: setuptools, sympy, networkx, numpy, matplotlib, pillow,
- fonttools, kiwisolver, triton, torch, and the nvidia-*-cu12 wheels, plus the local
- build of kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools)
-Installed 37 packages in 214ms
 </div>
 </div>
 <div class="cell-artifacts">
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
+Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
 </div>
 </div>
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: nv | 0.26s
 | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 </div>
 </div>
 <div id="output-nv" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Mon Oct 27 14:46:00 2025
 +-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   32C    P0            153W /  350W |       0MiB /  46068MiB |     75%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+

 +-----------------------------------------------------------------------------------------+
 | Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
 |        ID   ID                                                               Usage      |
 |=========================================================================================|
 |  No running processes found                                                             |
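The nv cell simply captures nvidia-smi output; the same device facts (GPU name, compute capability, CUDA build) can also be read from inside Python when tagging benchmark records with environment metadata. A hedged sketch of that, not the repo's nv.py:

import platform
import torch

props = torch.cuda.get_device_properties(0)
env = {
    "torch": torch.__version__,            # e.g. "2.8.0+cu128"
    "cuda": torch.version.cuda,            # CUDA toolkit torch was built against
    "gpu": props.name,                     # e.g. "NVIDIA L40S"
    "sm": f"{props.major}.{props.minor}",  # compute capability, e.g. "8.9"
    "py": platform.python_version(),
    "plat": platform.platform(),
}
print(env)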
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 6.99s
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
PROFILE TRACE: torch_eager | cuda_T128_D768
------------------------------------------------------------------------------------------------------------------------
 Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------------------------------------------------------------------------
+torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  183.359us  1436.08%  183.359us  183.359us  1
+torch_eager  11.24%  212.694us  99.53%  1.883ms  1.883ms  0.000us  0.00%  15.072us  15.072us  1
+aten::silu  3.31%  62.660us  82.30%  1.557ms  519.134us  6.527us  51.12%  8.831us  2.944us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.527us  51.12%  6.527us  2.176us  3
+aten::mul  1.85%  35.100us  2.98%  56.340us  18.780us  6.241us  48.88%  6.241us  2.080us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.241us  48.88%  6.241us  2.080us  3
+Activity Buffer Request  76.74%  1.452ms  76.74%  1.452ms  1.452ms  2.304us  18.05%  2.304us  2.304us  1
+aten::slice  2.41%  45.561us  3.01%  56.902us  9.484us  0.000us  0.00%  0.000us  0.000us  6
+aten::as_strided  0.60%  11.341us  0.60%  11.341us  1.890us  0.000us  0.00%  0.000us  0.000us  6
+cudaLaunchKernel  3.37%  63.741us  3.37%  63.741us  10.623us  0.000us  0.00%  0.000us  0.000us  6
+cudaDeviceSynchronize  0.47%  8.969us  0.47%  8.969us  8.969us  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------------------------------------------------------------------------
+Self CPU time total: 1.892ms
+Self CUDA time total: 12.768us

PROFILE TRACE: torch_eager | cuda_T128_D1024
------------------------------------------------------------------------------------------------------------------------
 Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------------------------------------------------------------------------
+torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  158.431us  1279.63%  158.431us  158.431us  1
+torch_eager  6.85%  117.301us  99.69%  1.707ms  1.707ms  0.000us  0.00%  14.557us  14.557us  1
+aten::silu  2.45%  41.990us  88.25%  1.511ms  503.680us  6.398us  51.68%  8.574us  2.858us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.398us  51.68%  6.398us  2.133us  3
+aten::mul  1.63%  27.830us  2.78%  47.630us  15.877us  5.983us  48.32%  5.983us  1.994us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  5.983us  48.32%  5.983us  1.994us  3
+Activity Buffer Request  84.28%  1.443ms  84.28%  1.443ms  1.443ms  2.176us  17.58%  2.176us  2.176us  1
+aten::slice  1.45%  24.820us  1.81%  30.931us  5.155us  0.000us  0.00%  0.000us  0.000us  6
+aten::as_strided  0.36%  6.111us  0.36%  6.111us  1.019us  0.000us  0.00%  0.000us  0.000us  6
+cudaLaunchKernel  2.67%  45.711us  2.67%  45.711us  7.618us  0.000us  0.00%  0.000us  0.000us  6
+cudaDeviceSynchronize  0.31%  5.320us  0.31%  5.320us  5.320us  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------------------------------------------------------------------------
+Self CPU time total: 1.712ms
+Self CUDA time total: 12.381us

PROFILE TRACE: torch_eager | cuda_T128_D2048
------------------------------------------------------------------------------------------------------------------------
 Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------------------------------------------------------------------------
+torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  145.182us  1095.88%  145.182us  145.182us  1
+torch_eager  6.28%  105.841us  99.65%  1.680ms  1.680ms  0.000us  0.00%  15.552us  15.552us  1
+aten::silu  2.40%  40.400us  89.03%  1.501ms  500.258us  6.816us  51.45%  9.120us  3.040us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.816us  51.45%  6.816us  2.272us  3
+aten::mul  1.52%  25.690us  2.64%  44.480us  14.827us  6.432us  48.55%  6.432us  2.144us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.432us  48.55%  6.432us  2.144us  3
+Activity Buffer Request  85.10%  1.434ms  85.10%  1.434ms  1.434ms  2.304us  17.39%  2.304us  2.304us  1
+aten::slice  1.37%  23.030us  1.70%  28.690us  4.782us  0.000us  0.00%  0.000us  0.000us  6
+aten::as_strided  0.34%  5.660us  0.34%  5.660us  0.943us  0.000us  0.00%  0.000us  0.000us  6
+cudaLaunchKernel  2.66%  44.762us  2.66%  44.762us  7.460us  0.000us  0.00%  0.000us  0.000us  6
+cudaDeviceSynchronize  0.35%  5.820us  0.35%  5.820us  5.820us  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------------------------------------------------------------------------
+Self CPU time total: 1.686ms
+Self CUDA time total: 13.248us

PROFILE TRACE: torch_eager | cuda_T256_D768
------------------------------------------------------------------------------------------------------------------------
 Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------------------------------------------------------------------------
+torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  145.025us  1135.85%  145.025us  145.025us  1
+torch_eager  7.55%  116.292us  99.65%  1.535ms  1.535ms  0.000us  0.00%  14.976us  14.976us  1
+aten::silu  2.67%  41.061us  87.34%  1.345ms  448.460us  6.592us  51.63%  8.800us  2.933us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.592us  51.63%  6.592us  2.197us  3
+aten::mul  1.71%  26.359us  2.88%  44.330us  14.777us  6.176us  48.37%  6.176us  2.059us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.176us  48.37%  6.176us  2.059us  3
+Activity Buffer Request  69.61%  1.072ms  69.61%  1.072ms  1.072ms  2.208us  17.29%  2.208us  2.208us  1
+aten::slice  1.52%  23.350us  1.89%  29.050us  4.842us  0.000us  0.00%  0.000us  0.000us  6
+aten::as_strided  0.37%  5.700us  0.37%  5.700us  0.950us  0.000us  0.00%  0.000us  0.000us  6
+cudaLaunchKernel  16.23%  250.045us  16.23%  250.045us  41.674us  0.000us  0.00%  0.000us  0.000us  6
+cudaDeviceSynchronize  0.35%  5.360us  0.35%  5.360us  5.360us  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------------------------------------------------------------------------
+Self CPU time total: 1.540ms
+Self CUDA time total: 12.768us

PROFILE TRACE: torch_eager | cuda_T256_D1024
------------------------------------------------------------------------------------------------------------------------
 Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------------------------------------------------------------------------
+torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  144.030us  1089.82%  144.030us  144.030us  1
+torch_eager  5.82%  104.551us  99.68%  1.792ms  1.792ms  0.000us  0.00%  15.488us  15.488us  1
+aten::silu  2.32%  41.682us  89.81%  1.614ms  538.151us  6.752us  51.09%  9.024us  3.008us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.752us  51.09%  6.752us  2.251us  3
+aten::mul  1.41%  25.409us  2.48%  44.550us  14.850us  6.464us  48.91%  6.464us  2.155us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  6.464us  48.91%  6.464us  2.155us  3
+Activity Buffer Request  78.50%  1.411ms  78.50%  1.411ms  1.411ms  2.272us  17.19%  2.272us  2.272us  1
+aten::slice  1.27%  22.830us  1.58%  28.320us  4.720us  0.000us  0.00%  0.000us  0.000us  6
+aten::as_strided  0.31%  5.490us  0.31%  5.490us  0.915us  0.000us  0.00%  0.000us  0.000us  6
+cudaLaunchKernel  10.06%  180.853us  10.06%  180.853us  30.142us  0.000us  0.00%  0.000us  0.000us  6
+cudaDeviceSynchronize  0.32%  5.710us  0.32%  5.710us  5.710us  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------------------------------------------------------------------------
+Self CPU time total: 1.798ms
+Self CUDA time total: 13.216us

PROFILE TRACE: torch_eager | cuda_T256_D2048
------------------------------------------------------------------------------------------------------------------------
 Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------------------------------------------------------------------------
+torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  140.382us  902.66%  140.382us  140.382us  1
+torch_eager  21.39%  103.633us  98.99%  479.697us  479.697us  0.000us  0.00%  18.240us  18.240us  1
+aten::silu  8.56%  41.460us  63.18%  306.154us  102.051us  7.936us  51.03%  10.624us  3.541us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  7.936us  51.03%  7.936us  2.645us  3
+aten::mul  4.90%  23.759us  8.63%  41.840us  13.947us  7.616us  48.97%  7.616us  2.539us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  7.616us  48.97%  7.616us  2.539us  3
+Activity Buffer Request  23.12%  112.032us  23.12%  112.032us  112.032us  2.688us  17.28%  2.688us  2.688us  1
+aten::slice  4.68%  22.671us  5.79%  28.070us  4.678us  0.000us  0.00%  0.000us  0.000us  6
+aten::as_strided  1.11%  5.399us  1.11%  5.399us  0.900us  0.000us  0.00%  0.000us  0.000us  6
+cudaLaunchKernel  35.23%  170.743us  35.23%  170.743us  28.457us  0.000us  0.00%  0.000us  0.000us  6
+cudaDeviceSynchronize  1.01%  4.900us  1.01%  4.900us  4.900us  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------------------------------------------------------------------------
+Self CPU time total: 484.597us
+Self CUDA time total: 15.552us

PROFILE TRACE: torch_eager | cuda_T512_D768
------------------------------------------------------------------------------------------------------------------------
 Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------------------------------------------------------------------------
+torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  145.662us  1011.54%  145.662us  145.662us  1
+torch_eager  5.99%  108.381us  99.73%  1.804ms  1.804ms  0.000us  0.00%  16.896us  16.896us  1
+aten::silu  2.28%  41.342us  89.69%  1.623ms  540.945us  7.392us  51.33%  9.888us  3.296us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  7.392us  51.33%  7.392us  2.464us  3
+aten::mul  1.44%  26.049us  2.45%  44.420us  14.807us  7.008us  48.67%  7.008us  2.336us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  7.008us  48.67%  7.008us  2.336us  3
+Activity Buffer Request  78.99%  1.429ms  78.99%  1.429ms  1.429ms  2.496us  17.33%  2.496us  2.496us  1
+aten::slice  1.28%  23.160us  1.59%  28.810us  4.802us  0.000us  0.00%  0.000us  0.000us  6
+aten::as_strided  0.31%  5.650us  0.31%  5.650us  0.942us  0.000us  0.00%  0.000us  0.000us  6
+cudaLaunchKernel  9.43%  170.603us  9.43%  170.603us  28.434us  0.000us  0.00%  0.000us  0.000us  6
+cudaDeviceSynchronize  0.27%  4.930us  0.27%  4.930us  4.930us  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------------------------------------------------------------------------
+Self CPU time total: 1.809ms
+Self CUDA time total: 14.400us

PROFILE TRACE: torch_eager | cuda_T512_D1024
------------------------------------------------------------------------------------------------------------------------
 Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------------------------------------------------------------------------
+torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  142.206us  914.45%  142.206us  142.206us  1
+torch_eager  21.70%  105.494us  98.87%  480.727us  480.727us  0.000us  0.00%  18.239us  18.239us  1
+aten::silu  8.21%  39.900us  62.39%  303.354us  101.118us  7.966us  51.23%  10.654us  3.551us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  7.966us  51.23%  7.966us  2.655us  3
+aten::mul  5.16%  25.070us  8.84%  42.990us  14.330us  7.585us  48.77%  7.585us  2.528us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  7.585us  48.77%  7.585us  2.528us  3
+Activity Buffer Request  23.29%  113.242us  23.29%  113.242us  113.242us  2.688us  17.29%  2.688us  2.688us  1
+aten::slice  4.75%  23.080us  5.94%  28.889us  4.815us  0.000us  0.00%  0.000us  0.000us  6
+aten::as_strided  1.19%  5.809us  1.19%  5.809us  0.968us  0.000us  0.00%  0.000us  0.000us  6
+cudaLaunchKernel  34.58%  168.132us  34.58%  168.132us  28.022us  0.000us  0.00%  0.000us  0.000us  6
+cudaDeviceSynchronize  1.13%  5.500us  1.13%  5.500us  5.500us  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------------------------------------------------------------------------
+Self CPU time total: 486.227us
+Self CUDA time total: 15.551us

PROFILE TRACE: torch_eager | cuda_T512_D2048
------------------------------------------------------------------------------------------------------------------------
 Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------------------------------------------------------------------------
+torch_eager  0.00%  0.000us  0.00%  0.000us  0.000us  149.022us  661.50%  149.022us  149.022us  1
+torch_eager  5.72%  105.900us  99.72%  1.847ms  1.847ms  0.000us  0.00%  26.431us  26.431us  1
+aten::silu  2.24%  41.461us  90.05%  1.668ms  555.875us  11.552us  51.28%  15.455us  5.152us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  11.552us  51.28%  11.552us  3.851us  3
+aten::mul  1.41%  26.021us  2.40%  44.421us  14.807us  10.976us  48.72%  10.976us  3.659us  3
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  10.976us  48.72%  10.976us  3.659us  3
+Activity Buffer Request  79.50%  1.472ms  79.50%  1.472ms  1.472ms  3.903us  17.33%  3.903us  3.903us  1
+aten::slice  1.25%  23.131us  1.56%  28.831us  4.805us  0.000us  0.00%  0.000us  0.000us  6
+aten::as_strided  0.31%  5.700us  0.31%  5.700us  0.950us  0.000us  0.00%  0.000us  0.000us  6
+cudaLaunchKernel  9.31%  172.382us  9.31%  172.382us  28.730us  0.000us  0.00%  0.000us  0.000us  6
+cudaDeviceSynchronize  0.28%  5.130us  0.28%  5.130us  5.130us  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------------------------------------------------------------------------
+Self CPU time total: 1.852ms
+Self CUDA time total: 22.528us
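Each regenerated trace above has the same shape: a torch_eager region wrapping three iterations, each launching one aten::silu and one aten::mul elementwise kernel over views produced by aten::slice / aten::as_strided, followed by a single cudaDeviceSynchronize. The "Activity Buffer Request" row that dominates Self CPU time (roughly 23% to 85% here) is profiler bookkeeping (CUPTI activity-buffer allocation), not kernel work, which is why Self CUDA totals stay in the 12-23us range while Self CPU totals read as milliseconds. Below is a minimal sketch of how a trace with this shape can be produced; it is an assumed reconstruction, not the repo's benchmark.py, and the gate/up slicing is a guess at the input layout:

import torch
from torch.profiler import ProfilerActivity, profile, record_function

def swiglu_eager(x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    gate, up = x[..., :d], x[..., d:]           # the aten::slice / aten::as_strided rows
    return torch.nn.functional.silu(gate) * up  # the aten::silu and aten::mul rows

x = torch.randn(128, 2 * 768, device="cuda", dtype=torch.bfloat16)
for _ in range(2):                              # warmup stays outside the profiled region
    swiglu_eager(x)
torch.cuda.synchronize()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("torch_eager"):        # the named region in the tables above
        for _ in range(3):                      # matches the "# of Calls = 3" rows
            swiglu_eager(x)
    torch.cuda.synchronize()                    # the cudaDeviceSynchronize row
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=12))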
 impl         wl               p50(ms)  ok
 torch_eager  cuda_T128_D1024  0.05     True
 torch_eager  cuda_T128_D2048  0.05     True
+torch_eager  cuda_T128_D768   0.04     True
 torch_eager  cuda_T256_D1024  0.05     True
 torch_eager  cuda_T256_D2048  0.05     True
 torch_eager  cuda_T256_D768   0.05     True
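cuda_T128_D768 is the only row marked changed in this hunk; the rest of the p50 column is stable at 0.05 ms. A hedged sketch of how a p50(ms) column like this can be measured with CUDA-event timing (warmup plus a median over repetitions; not necessarily the repo's exact harness):

import statistics
import torch

def p50_ms(fn, reps: int = 5, warmup: int = 2) -> float:
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()
    times = []
    for _ in range(reps):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn()
        end.record()
        torch.cuda.synchronize()
        times.append(start.elapsed_time(end))  # elapsed_time returns milliseconds
    return statistics.median(times)            # the reported p50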
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
+Installed 37 packages in 246ms
 </div>
 </div>
 <div class="cell-artifacts">
activation/results/artifacts/combine/latency.svg
CHANGED (Git LFS object; old and new revisions both stored via LFS, pointer details not shown)

activation/results/combined_results.html
CHANGED

@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
-Linux x86_64 | Linux-5. (old kernel/platform string truncated in this diff view)
 </div>
 </div>
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10- (old timestamp truncated in this diff view)
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
@@ -4021,83 +4021,83 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

-(old embedded latency plot, y-axis section: grid paths, tick marks, and tick labels
- for ytick_1 through ytick_6; coordinate values truncated in this diff view and
- superseded by the regenerated axis in the added lines below)
@@ -4105,37 +4105,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

-(old plot series: data paths and circular markers for series--hf-kernels-swiglu and
- series--torch-eager; coordinate values truncated in this diff view)
@@ -4155,25 +4155,25 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

-(old plot legend: frame, marker, and label geometry for hf-kernels-swiglu and
- torch-eager; coordinate values truncated in this diff view)
@@ -4193,7 +4193,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: combine | (old runtime truncated in this diff view)
 | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4267,13 +4267,13 @@ Cell: combine | 38.46s

 <div class="cell-stdout"><pre class="stdout-text">======================================================================
 LOADING BENCHMARK DATA
 ======================================================================
-✓ HF Kernels SwiGLU : / (path truncated in this diff view)
-✓ PyTorch SwiGLU    : / (path truncated in this diff view)

 ✓ Found HF Kernels SwiGLU
-  Path: / (truncated)
 ✓ Found PyTorch SwiGLU
-  Path: / (truncated)

 ======================================================================
 Summary: 2 found, 0 skipped, 0 missing

@@ -4293,7 +4293,7 @@ hf_kernels_swiglu cuda_T512_D2048 0.03 True

 hf_kernels_swiglu  cuda_T512_D768   0.03  True
 torch_eager        cuda_T128_D1024  0.05  True
 torch_eager        cuda_T128_D2048  0.05  True
-torch_eager        cuda_T128_D768   0.    (old value truncated in this diff view)
 torch_eager        cuda_T256_D1024  0.05  True
 torch_eager        cuda_T256_D2048  0.05  True
 torch_eager        cuda_T256_D768   0.05  True
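The combine cell loads each implementation's per-workload records and prints the merged impl/wl/p50/ok table shown above. A sketch of that step, assuming one JSON object per line with implementation name, workload name, p50 latency, and an ok flag; the field names and directory layout here are illustrative assumptions, not a documented schema:

import json
from pathlib import Path

rows = []
for path in Path(".").glob("*/impls/artifacts/benchmark/*.jsonl"):  # hypothetical layout
    for line in path.read_text().splitlines():
        rec = json.loads(line)
        rows.append((rec["impl"], rec["wl"]["name"], rec["lat_ms"]["p50"], rec["ok"]))

print(f"{'impl':<20} {'wl':<18} {'p50(ms)':>8}  ok")
for impl, wl, p50, ok in sorted(rows):
    print(f"{impl:<20} {wl:<18} {p50:>8.2f}  {ok}")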
@@ -4319,53 +4319,7 @@ Implementations included:

 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-(verbose download log removed: numpy, sympy, networkx, setuptools, pillow, fonttools,
- kiwisolver, matplotlib, triton, torch, and the nvidia-*-cu12 wheels, plus the local
- build of kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools)
-Installed 37 packages in 212ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4378,7 +4332,7 @@ Installed 37 packages in 212ms

 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10- (old timestamp truncated in this diff view)
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
@@ -4527,83 +4481,83 @@ Installed 37 packages in 212ms

-(old embedded latency plot in combined_results.html: y-axis grid paths, tick marks
- and labels, the series--hf-kernels-swiglu and series--torch-eager data paths and
- markers, and the legend geometry; coordinate values truncated in this diff view and
- superseded by the regenerated plot in the added lines below)
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
+Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
 </div>
 </div>
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+<dc:date>2025-10-27T14:46:43.482898</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
+(regenerated latency plot, embedded as inline SVG: y-axis tick labels at 0.025,
+ 0.030, 0.035, 0.040, 0.045, and 0.050 with matching grid lines and tick marks;
+ redrawn data paths and circular markers for series--hf-kernels-swiglu (#1f77b4)
+ and series--torch-eager (#ff7f0e) across the nine workload x-positions)
|
| 4111 |
</defs>
|
| 4112 |
<g clip-path="url(#p620c7d392f)">
|
| 4113 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4114 |
+
<use ns4:href="#md7efaf3aec" x="185.444754" y="379.591266" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4115 |
+
<use ns4:href="#md7efaf3aec" x="274.883864" y="367.802376" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4116 |
+
<use ns4:href="#md7efaf3aec" x="364.322974" y="382.120864" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4117 |
+
<use ns4:href="#md7efaf3aec" x="453.762084" y="356.82487" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4118 |
+
<use ns4:href="#md7efaf3aec" x="543.201194" y="396.121166" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4119 |
+
<use ns4:href="#md7efaf3aec" x="632.640304" y="374.96162" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4120 |
+
<use ns4:href="#md7efaf3aec" x="722.079415" y="389.136924" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4121 |
+
<use ns4:href="#md7efaf3aec" x="811.518525" y="358.734003" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4122 |
</g>
|
| 4123 |
</g>
|
| 4124 |
<g id="series--torch-eager" class="series">
|
| 4125 |
+
<path d="M 96.005644 189.63267 L 185.444754 53.272948 L 274.883864 47.08418 L 364.322974 66.175497 L 453.762084 61.545851 L 543.201194 66.795966 L 632.640304 59.954911 L 722.079415 85.26681 L 811.518525 95.751126 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4126 |
<defs>
|
| 4127 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4128 |
</defs>
|
| 4129 |
<g clip-path="url(#p620c7d392f)">
|
| 4130 |
+
<use ns4:href="#m9b8c54d372" x="96.005644" y="189.63267" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4131 |
+
<use ns4:href="#m9b8c54d372" x="185.444754" y="53.272948" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4132 |
+
<use ns4:href="#m9b8c54d372" x="274.883864" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4133 |
+
<use ns4:href="#m9b8c54d372" x="364.322974" y="66.175497" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4134 |
+
<use ns4:href="#m9b8c54d372" x="453.762084" y="61.545851" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4135 |
+
<use ns4:href="#m9b8c54d372" x="543.201194" y="66.795966" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4136 |
+
<use ns4:href="#m9b8c54d372" x="632.640304" y="59.954911" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4137 |
+
<use ns4:href="#m9b8c54d372" x="722.079415" y="85.26681" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4138 |
+
<use ns4:href="#m9b8c54d372" x="811.518525" y="95.751126" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4139 |
</g>
|
| 4140 |
</g>
|
| 4141 |
<g id="patch_3">
|
|
|
|
| 4155 |
</g>
|
| 4156 |
<g id="legend" class="legend">
|
| 4157 |
<g id="patch_7">
|
| 4158 |
+
<path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4159 |
</g>
|
| 4160 |
<g id="line2d_16">
|
| 4161 |
+
<path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4162 |
<g>
|
| 4163 |
+
<use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4164 |
</g>
|
| 4165 |
</g>
|
| 4166 |
<g id="legend-label--hf-kernels-swiglu" class="legend">
|
| 4167 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
|
| 4168 |
</g>
|
| 4169 |
<g id="line2d_17">
|
| 4170 |
+
<path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4171 |
<g>
|
| 4172 |
+
<use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4173 |
</g>
|
| 4174 |
</g>
|
| 4175 |
<g id="legend-label--torch-eager" class="legend">
|
| 4176 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="58.434687" transform="rotate(-0 750.811356 58.434687)">torch_eager</text>
|
| 4177 |
</g>
|
| 4178 |
</g>
|
| 4179 |
</g>
|
|
|
|
| 4193 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4194 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4195 |
</span> |
|
| 4196 |
+
Cell: combine | 4.45s
|
| 4197 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4198 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4199 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4267 |
<div class="cell-stdout"><pre class="stdout-text">======================================================================
|
| 4268 |
LOADING BENCHMARK DATA
|
| 4269 |
======================================================================
|
| 4270 |
+
✓ HF Kernels SwiGLU : /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/2775e6386f1caf1fda935a997130c06dcaf7641efb0db21560c35301fdabfd9b
|
| 4271 |
+
✓ PyTorch SwiGLU : /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/661ca38adec8893d7c284140e922da661f0afcea4aaff6a3bf48a6494ce7c6eb
|
| 4272 |
|
| 4273 |
✓ Found HF Kernels SwiGLU
|
| 4274 |
+
Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/2775e6386f1caf1fda935a997130c06dcaf7641efb0db21560c35301fdabfd9b/activation.jsonl
|
| 4275 |
✓ Found PyTorch SwiGLU
|
| 4276 |
+
Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/activation/impls/.uvnote/cache/661ca38adec8893d7c284140e922da661f0afcea4aaff6a3bf48a6494ce7c6eb/activation.jsonl
|
| 4277 |
|
| 4278 |
======================================================================
|
| 4279 |
Summary: 2 found, 0 skipped, 0 missing
|
|
|
|
| 4293 |
hf_kernels_swiglu cuda_T512_D768 0.03 True
|
| 4294 |
torch_eager cuda_T128_D1024 0.05 True
|
| 4295 |
torch_eager cuda_T128_D2048 0.05 True
|
| 4296 |
+
torch_eager cuda_T128_D768 0.04 True
|
| 4297 |
torch_eager cuda_T256_D1024 0.05 True
|
| 4298 |
torch_eager cuda_T256_D2048 0.05 True
|
| 4299 |
torch_eager cuda_T256_D768 0.05 True
|
|
|
|
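The p50 summary above is a straight read of the per-impl JSONL records that the combine cell loads. A minimal sketch of reproducing that table outside the cell (the file path, sort order, and column widths are assumptions, not repo code):

import json

with open("activation.jsonl") as f:  # path is an assumption
    records = [json.loads(line) for line in f if line.strip()]

for rec in sorted(records, key=lambda r: (r["impl"], r["wl"]["name"])):
    lat = rec.get("lat_ms") or {}  # lat_ms is null when a run failed
    p50 = lat.get("p50")
    p50_str = f"{p50:.2f}" if p50 is not None else "n/a"
    print(f"{rec['impl']:<20} {rec['wl']['name']:<24} {p50_str:>8} {rec['ok']}")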
| 4319 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4320 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4321 |
<div class="uv-logs-content" style="display: none;">
|
| 4322 |
+
Installed 37 packages in 250ms
|
| 4323 |
</div>
|
| 4324 |
</div>
|
| 4325 |
<div class="cell-artifacts">
|
|
|
|
| 4332 |
<rdf:RDF>
|
| 4333 |
<ns2:Work>
|
| 4334 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4335 |
+
<dc:date>2025-10-27T14:46:43.482898</dc:date>
|
| 4336 |
<dc:format>image/svg+xml</dc:format>
|
| 4337 |
<dc:creator>
|
| 4338 |
<ns2:Agent>
|
|
|
|
| 4481 |
<g id="matplotlib.axis_2">
|
| 4482 |
<g id="ytick_1">
|
| 4483 |
<g id="grid-y--2" class="grid grid-y">
|
| 4484 |
+
<path d="M 60.23 452.615548 L 847.294169 452.615548 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4485 |
</g>
|
| 4486 |
<g id="line2d_10">
|
| 4487 |
<defs>
|
| 4488 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4489 |
</defs>
|
| 4490 |
<g>
|
| 4491 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="452.615548" style="stroke: #000000; stroke-width: 0.8" />
|
| 4492 |
</g>
|
| 4493 |
</g>
|
| 4494 |
<g id="text_10">
|
| 4495 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="456.414767" transform="rotate(-0 53.23 456.414767)">0.025</text>
|
| 4496 |
</g>
|
| 4497 |
</g>
|
| 4498 |
<g id="ytick_2">
|
| 4499 |
<g id="grid-y--3" class="grid grid-y">
|
| 4500 |
+
<path d="M 60.23 373.068398 L 847.294169 373.068398 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4501 |
</g>
|
| 4502 |
<g id="line2d_11">
|
| 4503 |
<g>
|
| 4504 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="373.068398" style="stroke: #000000; stroke-width: 0.8" />
|
| 4505 |
</g>
|
| 4506 |
</g>
|
| 4507 |
<g id="text_11">
|
| 4508 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="376.867617" transform="rotate(-0 53.23 376.867617)">0.030</text>
|
| 4509 |
</g>
|
| 4510 |
</g>
|
| 4511 |
<g id="ytick_3">
|
| 4512 |
<g id="grid-y--4" class="grid grid-y">
|
| 4513 |
+
<path d="M 60.23 293.521249 L 847.294169 293.521249 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4514 |
</g>
|
| 4515 |
<g id="line2d_12">
|
| 4516 |
<g>
|
| 4517 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="293.521249" style="stroke: #000000; stroke-width: 0.8" />
|
| 4518 |
</g>
|
| 4519 |
</g>
|
| 4520 |
<g id="text_12">
|
| 4521 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="297.320468" transform="rotate(-0 53.23 297.320468)">0.035</text>
|
| 4522 |
</g>
|
| 4523 |
</g>
|
| 4524 |
<g id="ytick_4">
|
| 4525 |
<g id="grid-y--5" class="grid grid-y">
|
| 4526 |
+
<path d="M 60.23 213.974099 L 847.294169 213.974099 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4527 |
</g>
|
| 4528 |
<g id="line2d_13">
|
| 4529 |
<g>
|
| 4530 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="213.974099" style="stroke: #000000; stroke-width: 0.8" />
|
| 4531 |
</g>
|
| 4532 |
</g>
|
| 4533 |
<g id="text_13">
|
| 4534 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="217.773318" transform="rotate(-0 53.23 217.773318)">0.040</text>
|
| 4535 |
</g>
|
| 4536 |
</g>
|
| 4537 |
<g id="ytick_5">
|
| 4538 |
<g id="grid-y--6" class="grid grid-y">
|
| 4539 |
+
<path d="M 60.23 134.42695 L 847.294169 134.42695 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4540 |
</g>
|
| 4541 |
<g id="line2d_14">
|
| 4542 |
<g>
|
| 4543 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="134.42695" style="stroke: #000000; stroke-width: 0.8" />
|
| 4544 |
</g>
|
| 4545 |
</g>
|
| 4546 |
<g id="text_14">
|
| 4547 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="138.226168" transform="rotate(-0 53.23 138.226168)">0.045</text>
|
| 4548 |
</g>
|
| 4549 |
</g>
|
| 4550 |
<g id="ytick_6">
|
| 4551 |
<g id="grid-y--7" class="grid grid-y">
|
| 4552 |
+
<path d="M 60.23 54.8798 L 847.294169 54.8798 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4553 |
</g>
|
| 4554 |
<g id="line2d_15">
|
| 4555 |
<g>
|
| 4556 |
+
<use ns4:href="#m0fca2865ba" x="60.23" y="54.8798" style="stroke: #000000; stroke-width: 0.8" />
|
| 4557 |
</g>
|
| 4558 |
</g>
|
| 4559 |
<g id="text_15">
|
| 4560 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="58.679019" transform="rotate(-0 53.23 58.679019)">0.050</text>
|
| 4561 |
</g>
|
| 4562 |
</g>
|
| 4563 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4565 |
</g>
|
| 4566 |
</g>
|
| 4567 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4568 |
+
<path d="M 96.005644 451.16779 L 185.444754 379.591266 L 274.883864 367.802376 L 364.322974 382.120864 L 453.762084 356.82487 L 543.201194 396.121166 L 632.640304 374.96162 L 722.079415 389.136924 L 811.518525 358.734003 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4569 |
<defs>
|
| 4570 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4571 |
</defs>
|
| 4572 |
<g clip-path="url(#p620c7d392f)">
|
| 4573 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4574 |
+
<use ns4:href="#md7efaf3aec" x="185.444754" y="379.591266" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4575 |
+
<use ns4:href="#md7efaf3aec" x="274.883864" y="367.802376" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4576 |
+
<use ns4:href="#md7efaf3aec" x="364.322974" y="382.120864" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4577 |
+
<use ns4:href="#md7efaf3aec" x="453.762084" y="356.82487" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4578 |
+
<use ns4:href="#md7efaf3aec" x="543.201194" y="396.121166" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4579 |
+
<use ns4:href="#md7efaf3aec" x="632.640304" y="374.96162" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4580 |
+
<use ns4:href="#md7efaf3aec" x="722.079415" y="389.136924" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4581 |
+
<use ns4:href="#md7efaf3aec" x="811.518525" y="358.734003" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4582 |
</g>
|
| 4583 |
</g>
|
| 4584 |
<g id="series--torch-eager" class="series">
|
| 4585 |
+
<path d="M 96.005644 189.63267 L 185.444754 53.272948 L 274.883864 47.08418 L 364.322974 66.175497 L 453.762084 61.545851 L 543.201194 66.795966 L 632.640304 59.954911 L 722.079415 85.26681 L 811.518525 95.751126 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4586 |
<defs>
|
| 4587 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4588 |
</defs>
|
| 4589 |
<g clip-path="url(#p620c7d392f)">
|
| 4590 |
+
<use ns4:href="#m9b8c54d372" x="96.005644" y="189.63267" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4591 |
+
<use ns4:href="#m9b8c54d372" x="185.444754" y="53.272948" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4592 |
+
<use ns4:href="#m9b8c54d372" x="274.883864" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4593 |
+
<use ns4:href="#m9b8c54d372" x="364.322974" y="66.175497" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4594 |
+
<use ns4:href="#m9b8c54d372" x="453.762084" y="61.545851" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4595 |
+
<use ns4:href="#m9b8c54d372" x="543.201194" y="66.795966" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4596 |
+
<use ns4:href="#m9b8c54d372" x="632.640304" y="59.954911" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4597 |
+
<use ns4:href="#m9b8c54d372" x="722.079415" y="85.26681" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4598 |
+
<use ns4:href="#m9b8c54d372" x="811.518525" y="95.751126" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4599 |
</g>
|
| 4600 |
</g>
|
| 4601 |
<g id="patch_3">
|
|
|
|
| 4615 |
</g>
|
| 4616 |
<g id="legend" class="legend">
|
| 4617 |
<g id="patch_7">
|
| 4618 |
+
<path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4619 |
</g>
|
| 4620 |
<g id="line2d_16">
|
| 4621 |
+
<path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4622 |
<g>
|
| 4623 |
+
<use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4624 |
</g>
|
| 4625 |
</g>
|
| 4626 |
<g id="legend-label--hf-kernels-swiglu" class="legend">
|
| 4627 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
|
| 4628 |
</g>
|
| 4629 |
<g id="line2d_17">
|
| 4630 |
+
<path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4631 |
<g>
|
| 4632 |
+
<use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4633 |
</g>
|
| 4634 |
</g>
|
| 4635 |
<g id="legend-label--torch-eager" class="legend">
|
| 4636 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="58.434687" transform="rotate(-0 750.811356 58.434687)">torch_eager</text>
|
| 4637 |
</g>
|
| 4638 |
</g>
|
| 4639 |
</g>
|
flash_attn/impls/artifacts/benchmark/attention.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
| 5 |
-
{"ts": "2025-10-
|
| 6 |
-
{"ts": "2025-10-
|
|
|
|
| 1 |
+
{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
|
| 2 |
+
{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
|
| 3 |
+
{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
|
| 4 |
+
{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
|
| 5 |
+
{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
|
| 6 |
+
{"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
|
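Every record in this hunk carries "ok": false plus a structured "err" payload in place of latency numbers. A minimal sketch (an assumed helper, not repo code) for surfacing such failures when scanning an attention.jsonl artifact:

import json

with open("attention.jsonl") as f:  # path is an assumption
    for line in f:
        rec = json.loads(line)
        if not rec["ok"]:
            err = rec.get("err") or {}
            print(f"{rec['impl']} / {rec['wl']['name']}: "
                  f"{err.get('type')}: {err.get('msg')}")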
flash_attn/impls/cells/benchmark.py
CHANGED
|
@@ -3,8 +3,9 @@
|
|
| 3 |
# dependencies = [
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
-
# "kernels-benchmark-tools",
|
| 7 |
# "kernels",
|
|
|
|
|
|
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
@@ -15,18 +16,18 @@ import sys
|
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
from kernels import get_kernel
|
| 17 |
|
| 18 |
-
# Load the
|
| 19 |
-
|
| 20 |
|
| 21 |
|
| 22 |
-
def
|
| 23 |
-
"""
|
| 24 |
-
return
|
| 25 |
|
| 26 |
|
| 27 |
run_benchmark(
|
| 28 |
kernel_type=KernelTypeEnum.ATTENTION,
|
| 29 |
-
impl_name="
|
| 30 |
-
impl_tags={"family": "
|
| 31 |
-
impl_func=
|
| 32 |
)
|
|
|
|
| 3 |
# dependencies = [
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
|
|
|
| 6 |
# "kernels",
|
| 7 |
+
# "kernels-benchmark-tools",
|
| 8 |
+
# "sageattention",
|
| 9 |
# ]
|
| 10 |
#
|
| 11 |
# [tool.uv.sources]
|
|
|
|
| 16 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 17 |
from kernels import get_kernel
|
| 18 |
|
| 19 |
+
# Load the sage attention kernel
|
| 20 |
+
hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention")
|
| 21 |
|
| 22 |
|
| 23 |
+
def sage_attention(query, key, value):
|
| 24 |
+
"""SageAttention with INT8 Q/K quantization and FP16 P/V"""
|
| 25 |
+
return hf_kernels_sage_attn.fwd(query, key, value, is_causal=False)[0]
|
| 26 |
|
| 27 |
|
| 28 |
run_benchmark(
|
| 29 |
kernel_type=KernelTypeEnum.ATTENTION,
|
| 30 |
+
impl_name="sage_int8_fp16",
|
| 31 |
+
impl_tags={"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"},
|
| 32 |
+
impl_func=sage_attention,
|
| 33 |
)
|
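The attention.jsonl hunk above records an AttributeError from exactly this call: the loaded module exposes no `fwd`. A hedged guard, assuming only the `get_kernel` usage shown in the cell (the fail-fast inspection is illustrative, not a fix shipped in this commit):

from kernels import get_kernel

hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention")

if not hasattr(hf_kernels_sage_attn, "fwd"):
    # Fail fast with the module's actual surface instead of erroring
    # inside run_benchmark, as the JSONL records above show happening.
    exported = [n for n in dir(hf_kernels_sage_attn) if not n.startswith("_")]
    raise AttributeError(
        f"sage_attention kernel exposes no 'fwd'; available: {exported}"
    )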
flash_attn/impls/flash_attention.html
CHANGED
|
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3857 |
<div class="system-info">
|
| 3858 |
<div class="system-info-header">Generated on:</div>
|
| 3859 |
<div class="system-info-content">
|
| 3860 |
-
Linux x86_64 | Linux-5.
|
| 3861 |
</div>
|
| 3862 |
</div>
|
| 3863 |
|
|
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: nv |
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3888,34 +3888,22 @@ Cell: nv | 4.05s
|
|
| 3888 |
</div>
|
| 3889 |
</div>
|
| 3890 |
<div id="output-nv" class="cell-output">
|
| 3891 |
-
<div class="cell-stdout"><pre class="stdout-text">
|
| 3892 |
+-----------------------------------------------------------------------------------------+
|
| 3893 |
-
| NVIDIA-SMI
|
| 3894 |
|-----------------------------------------+------------------------+----------------------+
|
| 3895 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3896 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3897 |
| | | MIG M. |
|
| 3898 |
|=========================================+========================+======================|
|
| 3899 |
-
| 0 NVIDIA
|
| 3900 |
-
| N/A
|
| 3901 |
-
| | | N/A |
|
| 3902 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3903 |
-
| 1 NVIDIA L4 Off | 00000000:3A:00.0 Off | 0 |
|
| 3904 |
-
| N/A 33C P0 28W / 72W | 1MiB / 23034MiB | 2% Default |
|
| 3905 |
-
| | | N/A |
|
| 3906 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3907 |
-
| 2 NVIDIA L4 Off | 00000000:3C:00.0 Off | 0 |
|
| 3908 |
-
| N/A 34C P0 27W / 72W | 1MiB / 23034MiB | 1% Default |
|
| 3909 |
-
| | | N/A |
|
| 3910 |
-
+-----------------------------------------+------------------------+----------------------+
|
| 3911 |
-
| 3 NVIDIA L4 Off | 00000000:3E:00.0 Off | 0 |
|
| 3912 |
-
| N/A 33C P0 27W / 72W | 1MiB / 23034MiB | 2% Default |
|
| 3913 |
| | | N/A |
|
| 3914 |
+-----------------------------------------+------------------------+----------------------+
|
| 3915 |
|
| 3916 |
+-----------------------------------------------------------------------------------------+
|
| 3917 |
| Processes: |
|
| 3918 |
-
| GPU GI CI
|
| 3919 |
| ID ID Usage |
|
| 3920 |
|=========================================================================================|
|
| 3921 |
| No running processes found |
|
|
@@ -3931,9 +3919,9 @@ Cell: nv | 4.05s
|
|
| 3931 |
<span class="collapse-indicators">
|
| 3932 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3933 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3934 |
-
<span id="uv-indicator-benchmark"
|
| 3935 |
</span> |
|
| 3936 |
-
Cell: benchmark |
|
| 3937 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3938 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3939 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3984,29 +3972,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
|
|
| 3984 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3985 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
-
torch_flash_ma
|
| 3988 |
-
torch_flash_ma
|
| 3989 |
-
aten::scaled_dot_product_attention 0.
|
| 3990 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 3991 |
-
aten::_flash_attention_forward 0.
|
| 3992 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3993 |
-
aten::contiguous 0.
|
| 3994 |
-
aten::clone 0.
|
| 3995 |
-
aten::copy_
|
| 3996 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3997 |
-
Activity Buffer Request
|
| 3998 |
-
aten::transpose
|
| 3999 |
-
aten::as_strided 0.
|
| 4000 |
-
aten::empty_like 0.
|
| 4001 |
-
aten::empty
|
| 4002 |
-
cudaLaunchKernel
|
| 4003 |
-
aten::empty_strided 0.
|
| 4004 |
-
cudaDeviceGetAttribute 0.
|
| 4005 |
-
cudaFuncSetAttribute 0.
|
| 4006 |
-
cudaDeviceSynchronize
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
-
Self CPU time total:
|
| 4009 |
-
Self CUDA time total:
|
| 4010 |
|
| 4011 |
|
| 4012 |
|
|
@@ -4016,29 +4004,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
|
|
| 4016 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4017 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4018 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4019 |
-
torch_flash_ma
|
| 4020 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4021 |
-
aten::scaled_dot_product_attention 0.
|
| 4022 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4023 |
-
aten::_flash_attention_forward 0.
|
| 4024 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4025 |
-
aten::contiguous 0.
|
| 4026 |
-
aten::clone 0.
|
| 4027 |
-
aten::copy_
|
| 4028 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4029 |
-
Activity Buffer Request
|
| 4030 |
-
aten::transpose
|
| 4031 |
-
aten::as_strided 0.
|
| 4032 |
-
aten::empty_like 0.
|
| 4033 |
-
aten::empty
|
| 4034 |
-
cudaLaunchKernel
|
| 4035 |
-
aten::empty_strided 0.
|
| 4036 |
-
cudaDeviceGetAttribute 0.
|
| 4037 |
-
cudaFuncSetAttribute 0.
|
| 4038 |
-
cudaDeviceSynchronize
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
-
Self CPU time total:
|
| 4041 |
-
Self CUDA time total:
|
| 4042 |
|
| 4043 |
|
| 4044 |
|
|
@@ -4048,29 +4036,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
|
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4050 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4051 |
-
torch_flash_ma
|
| 4052 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4053 |
-
aten::scaled_dot_product_attention 0.
|
| 4054 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4055 |
-
aten::_flash_attention_forward 0.
|
| 4056 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4057 |
-
aten::contiguous 0.
|
| 4058 |
-
aten::clone 0.
|
| 4059 |
-
aten::copy_
|
| 4060 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4061 |
-
Activity Buffer Request
|
| 4062 |
-
aten::transpose
|
| 4063 |
-
aten::as_strided 0.
|
| 4064 |
-
aten::empty_like 0.
|
| 4065 |
-
aten::empty
|
| 4066 |
-
cudaLaunchKernel
|
| 4067 |
-
aten::empty_strided 0.
|
| 4068 |
-
cudaDeviceGetAttribute 0.
|
| 4069 |
-
cudaFuncSetAttribute 0.
|
| 4070 |
-
cudaDeviceSynchronize
|
| 4071 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4072 |
-
Self CPU time total:
|
| 4073 |
-
Self CUDA time total:
|
| 4074 |
|
| 4075 |
|
| 4076 |
|
|
@@ -4080,29 +4068,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
|
|
| 4080 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4081 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4082 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4083 |
-
torch_flash_ma
|
| 4084 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4085 |
-
aten::scaled_dot_product_attention 0.
|
| 4086 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4087 |
-
aten::_flash_attention_forward 0.
|
| 4088 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4089 |
-
aten::contiguous 0.
|
| 4090 |
-
aten::clone 0.
|
| 4091 |
-
aten::copy_
|
| 4092 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4093 |
-
Activity Buffer Request
|
| 4094 |
-
aten::transpose 0.
|
| 4095 |
-
aten::as_strided 0.
|
| 4096 |
-
aten::empty_like 0.
|
| 4097 |
-
aten::empty
|
| 4098 |
-
cudaLaunchKernel
|
| 4099 |
-
aten::empty_strided 0.
|
| 4100 |
-
cudaDeviceGetAttribute 0.
|
| 4101 |
-
cudaFuncSetAttribute 0.
|
| 4102 |
-
cudaDeviceSynchronize
|
| 4103 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4104 |
-
Self CPU time total:
|
| 4105 |
-
Self CUDA time total:
|
| 4106 |
|
| 4107 |
|
| 4108 |
|
|
@@ -4112,29 +4100,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
|
|
| 4112 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4113 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4114 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4115 |
-
torch_flash_ma
|
| 4116 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4117 |
-
aten::scaled_dot_product_attention 0.
|
| 4118 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4119 |
-
aten::_flash_attention_forward 0.
|
| 4120 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4121 |
-
aten::contiguous 0.
|
| 4122 |
-
aten::clone 0.
|
| 4123 |
-
aten::copy_
|
| 4124 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4125 |
-
Activity Buffer Request
|
| 4126 |
-
aten::transpose 0.
|
| 4127 |
-
aten::as_strided 0.
|
| 4128 |
-
aten::empty_like 0.
|
| 4129 |
-
aten::empty
|
| 4130 |
-
cudaLaunchKernel
|
| 4131 |
-
aten::empty_strided 0.
|
| 4132 |
-
cudaDeviceGetAttribute 0.
|
| 4133 |
-
cudaFuncSetAttribute 0.
|
| 4134 |
-
cudaDeviceSynchronize
|
| 4135 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4136 |
-
Self CPU time total:
|
| 4137 |
-
Self CUDA time total:
|
| 4138 |
|
| 4139 |
|
| 4140 |
|
|
@@ -4144,91 +4132,39 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
|
|
| 4144 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4145 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4146 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4147 |
-
torch_flash_ma
|
| 4148 |
-
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4149 |
-
aten::scaled_dot_product_attention 0.
|
| 4150 |
-
aten::_scaled_dot_product_flash_attention 0.
|
| 4151 |
-
aten::_flash_attention_forward 0.
|
| 4152 |
-
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4153 |
-
aten::contiguous 0.
|
| 4154 |
-
aten::clone 0.
|
| 4155 |
-
aten::copy_
|
| 4156 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4157 |
-
Activity Buffer Request
|
| 4158 |
-
aten::transpose 0.
|
| 4159 |
-
aten::as_strided 0.
|
| 4160 |
-
aten::empty_like 0.
|
| 4161 |
-
aten::empty
|
| 4162 |
-
cudaLaunchKernel
|
| 4163 |
-
aten::empty_strided 0.
|
| 4164 |
-
cudaDeviceGetAttribute 0.
|
| 4165 |
-
cudaFuncSetAttribute 0.
|
| 4166 |
-
cudaDeviceSynchronize
|
| 4167 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4168 |
-
Self CPU time total:
|
| 4169 |
-
Self CUDA time total:
|
| 4170 |
|
| 4171 |
|
| 4172 |
impl wl p50(ms) ok
|
| 4173 |
-
torch_flash_ma cuda_attn_L128_bfloat16
|
| 4174 |
-
torch_flash_ma cuda_attn_L256_bfloat16
|
| 4175 |
-
torch_flash_ma cuda_attn_L320_bfloat16
|
| 4176 |
-
torch_flash_ma cuda_attn_L384_bfloat16
|
| 4177 |
-
torch_flash_ma cuda_attn_L448_bfloat16
|
| 4178 |
-
torch_flash_ma cuda_attn_L512_bfloat16
|
| 4179 |
</pre></div>
|
| 4180 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4181 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4182 |
-
<div class="uv-logs-content" style="display: none;">
|
| 4183 |
-
Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
|
| 4184 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4185 |
-
Downloading numpy (15.9MiB)
|
| 4186 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4187 |
-
Downloading pillow (6.7MiB)
|
| 4188 |
-
Downloading fonttools (4.7MiB)
|
| 4189 |
-
Downloading networkx (1.9MiB)
|
| 4190 |
-
Downloading setuptools (1.1MiB)
|
| 4191 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4192 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4193 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4194 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4195 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4196 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4197 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4198 |
-
Downloading torch (846.8MiB)
|
| 4199 |
-
Downloading kiwisolver (1.4MiB)
|
| 4200 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4201 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4202 |
-
Downloading matplotlib (8.3MiB)
|
| 4203 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4204 |
-
Downloading sympy (6.0MiB)
|
| 4205 |
-
Downloading triton (148.4MiB)
|
| 4206 |
-
Downloading nvidia-cufile-cu12
|
| 4207 |
-
Downloading kiwisolver
|
| 4208 |
-
Downloading setuptools
|
| 4209 |
-
Downloading networkx
|
| 4210 |
-
Downloading fonttools
|
| 4211 |
-
Downloading pillow
|
| 4212 |
-
Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
|
| 4213 |
-
Downloading matplotlib
|
| 4214 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 4215 |
-
Downloading numpy
|
| 4216 |
-
Downloading sympy
|
| 4217 |
-
Downloading nvidia-nvjitlink-cu12
|
| 4218 |
-
Downloading nvidia-curand-cu12
|
| 4219 |
-
Downloading nvidia-cuda-nvrtc-cu12
|
| 4220 |
-
Downloading triton
|
| 4221 |
-
Downloading nvidia-cufft-cu12
|
| 4222 |
-
Downloading nvidia-cusolver-cu12
|
| 4223 |
-
Downloading nvidia-cusparse-cu12
|
| 4224 |
-
Downloading nvidia-cusparselt-cu12
|
| 4225 |
-
Downloading nvidia-nccl-cu12
|
| 4226 |
-
Downloading nvidia-cublas-cu12
|
| 4227 |
-
Downloading nvidia-cudnn-cu12
|
| 4228 |
-
Downloading torch
|
| 4229 |
-
Installed 37 packages in 231ms
|
| 4230 |
-
</div>
|
| 4231 |
-
</div>
|
| 4232 |
<div class="cell-artifacts">
|
| 4233 |
<h4>Artifacts:</h4>
|
| 4234 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
|
|
|
| 3857 |
<div class="system-info">
|
| 3858 |
<div class="system-info-header">Generated on:</div>
|
| 3859 |
<div class="system-info-content">
|
| 3860 |
+
Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3861 |
</div>
|
| 3862 |
</div>
|
| 3863 |
|
|
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: nv | 0.26s
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3888 |
</div>
|
| 3889 |
</div>
|
| 3890 |
<div id="output-nv" class="cell-output">
|
| 3891 |
+
<div class="cell-stdout"><pre class="stdout-text">Mon Oct 27 14:45:45 2025
|
| 3892 |
+-----------------------------------------------------------------------------------------+
|
| 3893 |
+
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3894 |
|-----------------------------------------+------------------------+----------------------+
|
| 3895 |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3896 |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3897 |
| | | MIG M. |
|
| 3898 |
|=========================================+========================+======================|
|
| 3899 |
+
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3900 |
+
| N/A 31C P0 135W / 350W | 0MiB / 46068MiB | 100% Default |
|
|
| 3901 |
| | | N/A |
|
| 3902 |
+-----------------------------------------+------------------------+----------------------+
|
| 3903 |
|
| 3904 |
+-----------------------------------------------------------------------------------------+
|
| 3905 |
| Processes: |
|
| 3906 |
+
| GPU GI CI PID Type Process name GPU Memory |
|
| 3907 |
| ID ID Usage |
|
| 3908 |
|=========================================================================================|
|
| 3909 |
| No running processes found |
|
|
|
|
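The driver, CUDA, and GPU details printed by nvidia-smi are the same facts recorded in each JSONL record's "env" block. A minimal sketch of capturing them from Python (the dict layout mirrors the records above; the helper itself is an assumption, not repo code):

import platform
import torch

props = torch.cuda.get_device_properties(0)
env = {
    "torch": torch.__version__,
    "cuda": torch.version.cuda,
    "gpu": props.name,                     # e.g. "NVIDIA L40S"
    "sm": f"{props.major}.{props.minor}",  # e.g. "8.9"
    "py": platform.python_version(),
    "plat": platform.platform(),
}
print(env)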
| 3919 |
<span class="collapse-indicators">
|
| 3920 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3921 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3922 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3923 |
</span> |
|
| 3924 |
+
Cell: benchmark | 3.87s
|
| 3925 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3926 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3927 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3972 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3973 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.610ms 101.76% 3.610ms 3.610ms 1
|
| 3976 |
+
torch_flash_ma 6.54% 340.396us 46.01% 2.394ms 2.394ms 0.000us 0.00% 3.588ms 3.588ms 1
|
| 3977 |
+
aten::scaled_dot_product_attention 0.84% 43.810us 4.24% 220.593us 73.531us 0.000us 0.00% 2.829ms 943.091us 3
|
| 3978 |
+
aten::_scaled_dot_product_flash_attention 0.51% 26.609us 3.40% 176.783us 58.928us 0.000us 0.00% 2.829ms 943.091us 3
|
| 3979 |
+
aten::_flash_attention_forward 0.74% 38.381us 2.45% 127.692us 42.564us 2.829ms 79.74% 2.829ms 943.091us 3
|
| 3980 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.829ms 79.74% 2.829ms 943.091us 3
|
| 3981 |
+
aten::contiguous 0.29% 15.001us 33.86% 1.762ms 146.802us 0.000us 0.00% 759.072us 63.256us 12
|
| 3982 |
+
aten::clone 0.76% 39.432us 33.57% 1.747ms 145.552us 0.000us 0.00% 759.072us 63.256us 12
|
| 3983 |
+
aten::copy_ 1.71% 88.801us 31.26% 1.626ms 135.534us 718.688us 20.26% 759.072us 63.256us 12
|
| 3984 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 718.688us 20.26% 718.688us 59.891us 12
|
| 3985 |
+
Activity Buffer Request 27.68% 1.440ms 27.68% 1.440ms 1.440ms 40.384us 1.14% 40.384us 40.384us 1
|
| 3986 |
+
aten::transpose 1.34% 69.973us 1.80% 93.503us 3.896us 0.000us 0.00% 0.000us 0.000us 24
|
| 3987 |
+
aten::as_strided 0.45% 23.530us 0.45% 23.530us 0.980us 0.000us 0.00% 0.000us 0.000us 24
|
| 3988 |
+
aten::empty_like 0.50% 25.908us 1.97% 102.319us 6.821us 0.000us 0.00% 0.000us 0.000us 15
|
| 3989 |
+
aten::empty 1.75% 91.041us 1.75% 91.041us 3.793us 0.000us 0.00% 0.000us 0.000us 24
|
| 3990 |
+
cudaLaunchKernel 2.36% 123.031us 2.36% 123.031us 8.202us 0.000us 0.00% 0.000us 0.000us 15
|
| 3991 |
+
aten::empty_strided 0.31% 16.010us 0.31% 16.010us 5.337us 0.000us 0.00% 0.000us 0.000us 3
|
| 3992 |
+
cudaDeviceGetAttribute 0.05% 2.700us 0.05% 2.700us 0.450us 0.000us 0.00% 0.000us 0.000us 6
|
| 3993 |
+
cudaFuncSetAttribute 0.17% 8.980us 0.17% 8.980us 2.993us 0.000us 0.00% 0.000us 0.000us 3
|
| 3994 |
+
cudaDeviceSynchronize 53.99% 2.809ms 53.99% 2.809ms 2.809ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
+
Self CPU time total: 5.203ms
|
| 3997 |
+
Self CUDA time total: 3.548ms
|
| 3998 |
|
| 3999 |
|
| 4000 |
|
|
|
|
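Trace tables like the one above come from torch.profiler's key_averages view. A minimal sketch of producing one (assumptions: CUDA available, shapes echoing the cuda_attn_L128 workload of batch 1, 24 heads, seq_len 4224, head_dim 128):

import torch
from torch.profiler import ProfilerActivity, profile, record_function

q = torch.randn(1, 24, 4224, 128, device="cuda", dtype=torch.bfloat16)
k, v = q.clone(), q.clone()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("torch_flash_ma"):
        for _ in range(3):  # the tables above show 3 calls per trace
            torch.nn.functional.scaled_dot_product_attention(q, k, v)
    torch.cuda.synchronize()

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))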
| 4004 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4005 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
+
torch_flash_ma 5.17% 272.917us 42.06% 2.218ms 2.218ms 0.000us 0.00% 3.821ms 3.821ms 1
|
| 4008 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.777ms 100.28% 3.777ms 3.777ms 1
|
| 4009 |
+
aten::scaled_dot_product_attention 0.53% 27.761us 3.55% 187.333us 62.444us 0.000us 0.00% 3.004ms 1.001ms 3
|
| 4010 |
+
aten::_scaled_dot_product_flash_attention 0.37% 19.492us 3.03% 159.572us 53.191us 0.000us 0.00% 3.004ms 1.001ms 3
|
| 4011 |
+
aten::_flash_attention_forward 0.75% 39.549us 2.23% 117.371us 39.124us 3.004ms 79.75% 3.004ms 1.001ms 3
|
| 4012 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.004ms 79.75% 3.004ms 1.001ms 3
|
| 4013 |
+
aten::contiguous 0.20% 10.320us 32.06% 1.691ms 140.876us 0.000us 0.00% 817.314us 68.110us 12
|
| 4014 |
+
aten::clone 0.55% 29.048us 31.86% 1.680ms 140.016us 0.000us 0.00% 817.314us 68.110us 12
|
| 4015 |
+
aten::copy_ 1.64% 86.662us 30.11% 1.588ms 132.347us 762.658us 20.25% 817.314us 68.110us 12
|
| 4016 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 762.658us 20.25% 762.658us 63.555us 12
|
| 4017 |
+
Activity Buffer Request 26.84% 1.415ms 26.84% 1.415ms 1.415ms 54.656us 1.45% 54.656us 54.656us 1
|
| 4018 |
+
aten::transpose 1.36% 71.528us 1.71% 90.179us 3.757us 0.000us 0.00% 0.000us 0.000us 24
|
| 4019 |
+
aten::as_strided 0.35% 18.651us 0.35% 18.651us 0.777us 0.000us 0.00% 0.000us 0.000us 24
|
| 4020 |
+
aten::empty_like 0.38% 19.801us 1.55% 81.840us 5.456us 0.000us 0.00% 0.000us 0.000us 15
|
| 4021 |
+
aten::empty 1.46% 77.040us 1.46% 77.040us 3.210us 0.000us 0.00% 0.000us 0.000us 24
|
| 4022 |
+
cudaLaunchKernel 2.07% 108.973us 2.07% 108.973us 7.265us 0.000us 0.00% 0.000us 0.000us 15
|
| 4023 |
+
aten::empty_strided 0.26% 13.940us 0.26% 13.940us 4.647us 0.000us 0.00% 0.000us 0.000us 3
|
| 4024 |
+
cudaDeviceGetAttribute 0.06% 2.910us 0.06% 2.910us 0.485us 0.000us 0.00% 0.000us 0.000us 6
|
| 4025 |
+
cudaFuncSetAttribute 0.08% 4.240us 0.08% 4.240us 1.413us 0.000us 0.00% 0.000us 0.000us 3
|
| 4026 |
+
cudaDeviceSynchronize 57.94% 3.056ms 57.94% 3.056ms 3.056ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
+
Self CPU time total: 5.274ms
|
| 4029 |
+
Self CUDA time total: 3.767ms
|
| 4030 |
|
| 4031 |
|
| 4032 |
|
|
|
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_flash_ma  4.99%  269.576us  41.89%  2.262ms  2.262ms  0.000us  0.00%  3.875ms  3.875ms  1
+torch_flash_ma  0.00%  0.000us  0.00%  0.000us  0.000us  3.827ms  100.29%  3.827ms  3.827ms  1
+aten::scaled_dot_product_attention  0.50%  27.011us  3.47%  187.262us  62.421us  0.000us  0.00%  3.037ms  1.012ms  3
+aten::_scaled_dot_product_flash_attention  0.35%  18.851us  2.97%  160.251us  53.417us  0.000us  0.00%  3.037ms  1.012ms  3
+aten::_flash_attention_forward  0.72%  39.000us  2.20%  118.550us  39.517us  3.037ms  79.57%  3.037ms  1.012ms  3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...  0.00%  0.000us  0.00%  0.000us  0.000us  3.037ms  79.57%  3.037ms  1.012ms  3
+aten::contiguous  0.18%  9.780us  32.51%  1.755ms  146.253us  0.000us  0.00%  838.461us  69.872us  12
+aten::clone  0.54%  29.119us  32.32%  1.745ms  145.438us  0.000us  0.00%  838.461us  69.872us  12
+aten::copy_  1.56%  84.200us  30.52%  1.648ms  137.328us  779.741us  20.43%  838.461us  69.872us  12
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  779.741us  20.43%  779.741us  64.978us  12
+Activity Buffer Request  27.41%  1.480ms  27.41%  1.480ms  1.480ms  58.720us  1.54%  58.720us  58.720us  1
+aten::transpose  1.00%  54.180us  1.34%  72.500us  3.021us  0.000us  0.00%  0.000us  0.000us  24
+aten::as_strided  0.34%  18.320us  0.34%  18.320us  0.763us  0.000us  0.00%  0.000us  0.000us  24
+aten::empty_like  0.36%  19.560us  1.66%  89.381us  5.959us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty  1.53%  82.821us  1.53%  82.821us  3.451us  0.000us  0.00%  0.000us  0.000us  24
+cudaLaunchKernel  1.99%  107.272us  1.99%  107.272us  7.151us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_strided  0.30%  16.380us  0.30%  16.380us  5.460us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceGetAttribute  0.03%  1.850us  0.03%  1.850us  0.308us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.07%  3.830us  0.07%  3.830us  1.277us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  58.11%  3.138ms  58.11%  3.138ms  3.138ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.399ms
+Self CUDA time total: 3.817ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_flash_ma  4.76%  268.853us  43.13%  2.435ms  2.435ms  0.000us  0.00%  3.964ms  3.964ms  1
+torch_flash_ma  0.00%  0.000us  0.00%  0.000us  0.000us  3.917ms  100.30%  3.917ms  3.917ms  1
+aten::scaled_dot_product_attention  0.49%  27.720us  3.46%  195.333us  65.111us  0.000us  0.00%  3.118ms  1.039ms  3
+aten::_scaled_dot_product_flash_attention  0.34%  19.471us  2.97%  167.613us  55.871us  0.000us  0.00%  3.118ms  1.039ms  3
+aten::_flash_attention_forward  0.70%  39.530us  2.23%  125.742us  41.914us  3.118ms  79.84%  3.118ms  1.039ms  3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...  0.00%  0.000us  0.00%  0.000us  0.000us  3.118ms  79.84%  3.118ms  1.039ms  3
+aten::contiguous  0.17%  9.719us  34.03%  1.921ms  160.116us  0.000us  0.00%  845.599us  70.467us  12
+aten::clone  0.52%  29.239us  33.85%  1.912ms  159.306us  0.000us  0.00%  845.599us  70.467us  12
+aten::copy_  1.54%  86.910us  32.19%  1.818ms  151.460us  787.167us  20.16%  845.599us  70.467us  12
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  787.167us  20.16%  787.167us  65.597us  12
+Activity Buffer Request  25.41%  1.435ms  25.41%  1.435ms  1.435ms  58.432us  1.50%  58.432us  58.432us  1
+aten::transpose  0.96%  54.080us  1.28%  72.141us  3.006us  0.000us  0.00%  0.000us  0.000us  24
+aten::as_strided  0.32%  18.061us  0.32%  18.061us  0.753us  0.000us  0.00%  0.000us  0.000us  24
+aten::empty_like  0.35%  19.512us  1.49%  84.134us  5.609us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty  1.53%  86.581us  1.53%  86.581us  3.608us  0.000us  0.00%  0.000us  0.000us  24
+cudaLaunchKernel  5.66%  319.547us  5.66%  319.547us  21.303us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_strided  0.26%  14.430us  0.26%  14.430us  4.810us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceGetAttribute  0.05%  2.740us  0.05%  2.740us  0.457us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.07%  4.201us  0.07%  4.201us  1.400us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  56.87%  3.211ms  56.87%  3.211ms  3.211ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.647ms
+Self CUDA time total: 3.906ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_flash_ma  5.25%  320.614us  40.80%  2.490ms  2.490ms  0.000us  0.00%  4.428ms  4.428ms  1
+torch_flash_ma  0.00%  0.000us  0.00%  0.000us  0.000us  4.377ms  100.25%  4.377ms  4.377ms  1
+aten::scaled_dot_product_attention  0.44%  26.800us  3.27%  199.713us  66.571us  0.000us  0.00%  3.558ms  1.186ms  3
+aten::_scaled_dot_product_flash_attention  0.32%  19.239us  2.83%  172.913us  57.638us  0.000us  0.00%  3.558ms  1.186ms  3
+aten::_flash_attention_forward  0.64%  38.816us  2.13%  129.963us  43.321us  3.558ms  81.48%  3.558ms  1.186ms  3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...  0.00%  0.000us  0.00%  0.000us  0.000us  3.558ms  81.48%  3.558ms  1.186ms  3
+aten::contiguous  0.17%  10.568us  31.48%  1.922ms  160.138us  0.000us  0.00%  870.015us  72.501us  12
+aten::clone  0.48%  29.552us  31.31%  1.911ms  159.257us  0.000us  0.00%  870.015us  72.501us  12
+aten::copy_  1.37%  83.622us  29.71%  1.813ms  151.123us  808.479us  18.52%  870.015us  72.501us  12
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  808.479us  18.52%  808.479us  67.373us  12
+Activity Buffer Request  24.07%  1.469ms  24.07%  1.469ms  1.469ms  61.536us  1.41%  61.536us  61.536us  1
+aten::transpose  0.88%  53.494us  1.18%  71.893us  2.996us  0.000us  0.00%  0.000us  0.000us  24
+aten::as_strided  0.30%  18.399us  0.30%  18.399us  0.767us  0.000us  0.00%  0.000us  0.000us  24
+aten::empty_like  0.45%  27.388us  1.61%  98.450us  6.563us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty  1.35%  82.243us  1.35%  82.243us  3.427us  0.000us  0.00%  0.000us  0.000us  24
+cudaLaunchKernel  4.68%  285.943us  4.68%  285.943us  19.063us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_strided  0.29%  17.820us  0.29%  17.820us  5.940us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceGetAttribute  0.04%  2.328us  0.04%  2.328us  0.388us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.07%  4.078us  0.07%  4.078us  1.359us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  59.20%  3.614ms  59.20%  3.614ms  3.614ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 6.104ms
+Self CUDA time total: 4.366ms
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_flash_ma  4.45%  272.752us  38.96%  2.390ms  2.390ms  0.000us  0.00%  4.517ms  4.517ms  1
+torch_flash_ma  0.00%  0.000us  0.00%  0.000us  0.000us  4.467ms  100.24%  4.467ms  4.467ms  1
+aten::scaled_dot_product_attention  0.45%  27.641us  3.22%  197.213us  65.738us  0.000us  0.00%  3.636ms  1.212ms  3
+aten::_scaled_dot_product_flash_attention  0.32%  19.841us  2.76%  169.572us  56.524us  0.000us  0.00%  3.636ms  1.212ms  3
+aten::_flash_attention_forward  0.71%  43.282us  2.06%  126.092us  42.031us  3.636ms  81.58%  3.636ms  1.212ms  3
+void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne...  0.00%  0.000us  0.00%  0.000us  0.000us  3.636ms  81.58%  3.636ms  1.212ms  3
+aten::contiguous  0.18%  11.069us  30.46%  1.869ms  155.711us  0.000us  0.00%  881.085us  73.424us  12
+aten::clone  0.50%  30.953us  30.28%  1.857ms  154.789us  0.000us  0.00%  881.085us  73.424us  12
+aten::copy_  1.39%  85.529us  28.66%  1.758ms  146.482us  820.670us  18.42%  881.085us  73.424us  12
+void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  820.670us  18.42%  820.670us  68.389us  12
+Activity Buffer Request  23.40%  1.435ms  23.40%  1.435ms  1.435ms  60.415us  1.36%  60.415us  60.415us  1
+aten::transpose  0.92%  56.138us  1.22%  75.130us  3.130us  0.000us  0.00%  0.000us  0.000us  24
+aten::as_strided  0.31%  18.992us  0.31%  18.992us  0.791us  0.000us  0.00%  0.000us  0.000us  24
+aten::empty_like  0.33%  20.287us  1.48%  90.810us  6.054us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty  1.36%  83.613us  1.36%  83.613us  3.484us  0.000us  0.00%  0.000us  0.000us  24
+cudaLaunchKernel  4.26%  261.175us  4.26%  261.175us  17.412us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_strided  0.28%  17.260us  0.28%  17.260us  5.753us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceGetAttribute  0.03%  1.850us  0.03%  1.850us  0.308us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.07%  4.250us  0.07%  4.250us  1.417us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  61.04%  3.744ms  61.04%  3.744ms  3.744ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 6.134ms
+Self CUDA time total: 4.456ms
impl  wl  p50(ms)  ok
+torch_flash_ma  cuda_attn_L128_bfloat16  1.22  True
+torch_flash_ma  cuda_attn_L256_bfloat16  1.27  True
+torch_flash_ma  cuda_attn_L320_bfloat16  1.31  True
+torch_flash_ma  cuda_attn_L384_bfloat16  1.34  True
+torch_flash_ma  cuda_attn_L448_bfloat16  1.48  True
+torch_flash_ma  cuda_attn_L512_bfloat16  1.52  True
</pre></div>
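The aten::transpose / aten::contiguous / aten::scaled_dot_product_attention rows above suggest a plausible shape for the torch_flash_ma implementation: layout permutes plus .contiguous() copies around a flash-backend SDPA call. A minimal sketch under that assumption (the exact benchmark cell may differ):

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    def torch_flash_ma(q, k, v):
        # (batch, seq, heads, dim) -> (batch, heads, seq, dim); the .contiguous()
        # calls are what the aten::contiguous / aten::clone / aten::copy_ rows measure.
        q, k, v = (t.transpose(1, 2).contiguous() for t in (q, k, v))
        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):  # pin the flash backend
            o = F.scaled_dot_product_attention(q, k, v)
        return o.transpose(1, 2).contiguous()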
| 4168 |
<div class="cell-artifacts">
|
| 4169 |
<h4>Artifacts:</h4>
|
| 4170 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<div class="system-info">
<div class="system-info-header">Generated on:</div>
<div class="system-info-content">
-Linux x86_64 | Linux-5.
</div>
</div>

@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
</span> |
-Cell: benchmark |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3926,21 +3926,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn
-_flash_attn_9e27194::fwd
-hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us
-Activity Buffer Request
-cudaDeviceGetAttribute  0.
-aten::empty_like  0.
-aten::empty_strided  0.
-aten::empty  0.
-cudaFuncSetAttribute  0.
-cudaLaunchKernel  0.
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:

@@ -3950,21 +3950,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn
-_flash_attn_9e27194::fwd
-hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us
-Activity Buffer Request
-cudaDeviceGetAttribute  0.
-aten::empty_like  0.
-aten::empty_strided  0.
-aten::empty  0.
-cudaFuncSetAttribute  0.
-cudaLaunchKernel  0.
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:

@@ -3974,21 +3974,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn
-_flash_attn_9e27194::fwd
-hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us
-Activity Buffer Request
-cudaDeviceGetAttribute  0.
-aten::empty_like  0.
-aten::empty_strided  0.
-aten::empty  0.
-cudaFuncSetAttribute  0.
-cudaLaunchKernel  0.
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:

@@ -3998,21 +3998,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn
-_flash_attn_9e27194::fwd
-hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us
-Activity Buffer Request
-cudaDeviceGetAttribute  0.
-aten::empty_like  0.
-aten::empty_strided  0.
-aten::empty  0.
-cudaFuncSetAttribute  0.
-cudaLaunchKernel
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:

@@ -4022,21 +4022,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn
-_flash_attn_9e27194::fwd
-hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us
-Activity Buffer Request
-cudaDeviceGetAttribute  0.
-aten::empty_like  0.
-aten::empty_strided  0.
-aten::empty  0.
-cudaFuncSetAttribute  0.
-cudaLaunchKernel
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:

@@ -4046,89 +4046,88 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn
-_flash_attn_9e27194::fwd
-hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us
-void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us
-Activity Buffer Request
-cudaDeviceGetAttribute  0.
-aten::empty_like  0.
-aten::empty_strided  0.
-aten::empty  0.
-cudaFuncSetAttribute  0.
-cudaLaunchKernel  4.
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:

impl  wl  p50(ms)  ok
-hf_kernels_flash_attn  cuda_attn_L128_bfloat16
-hf_kernels_flash_attn  cuda_attn_L256_bfloat16
-hf_kernels_flash_attn  cuda_attn_L320_bfloat16
-hf_kernels_flash_attn  cuda_attn_L384_bfloat16
-hf_kernels_flash_attn  cuda_attn_L448_bfloat16
-hf_kernels_flash_attn  cuda_attn_L512_bfloat16
</pre></div>
<div class="uv-install-logs" id="uv-logs-benchmark">
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
<div class="uv-logs-content" style="display: none;">
-Building kernels-benchmark-tools @ file:///
-Downloading nvidia-cufft-cu12 (184.2MiB)
Downloading hf-xet (3.2MiB)
-Downloading
-Downloading sympy (6.0MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading fonttools (4.7MiB)
-Downloading kiwisolver (1.4MiB)
Downloading networkx (1.9MiB)
-Downloading nvidia-
-Downloading pillow (6.7MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading
-Downloading
Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-
-Downloading
Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading
-Downloading
Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading
Downloading nvidia-cufile-cu12
Downloading kiwisolver
Downloading hf-xet
Downloading setuptools
-Downloading fonttools
Downloading networkx
Downloading pillow
-Built kernels-benchmark-tools @ file:///
Downloading nvidia-cuda-cupti-cu12
Downloading matplotlib
Downloading numpy
-Downloading nvidia-nvjitlink-cu12
Downloading sympy
Downloading nvidia-curand-cu12
Downloading nvidia-cuda-nvrtc-cu12
Downloading triton
Downloading nvidia-cufft-cu12
Downloading nvidia-cusolver-cu12
-Downloading nvidia-cusparse-cu12
Downloading nvidia-cusparselt-cu12
Downloading nvidia-nccl-cu12
Downloading nvidia-cublas-cu12
Downloading nvidia-cudnn-cu12
Downloading torch
-Installed
</div>
</div>
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
-Fetching 20 files:
-Fetching 20 files:
-Fetching 20 files: 100%|██████████| 20/20 [00:03<00:00, 5.86it/s]</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
<div class="system-info">
<div class="system-info-header">Generated on:</div>
<div class="system-info-content">
+Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
</div>
</div>

<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
</span> |
+Cell: benchmark | 35.44s
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn  3.89%  173.532us  41.54%  1.852ms  1.852ms  0.000us  0.00%  3.821ms  3.821ms  1
+_flash_attn_9e27194::fwd  1.71%  76.382us  37.65%  1.679ms  559.513us  2.851ms  100.00%  3.821ms  1.274ms  3
+hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us  2.852ms  100.05%  2.852ms  2.852ms  1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us  2.851ms  100.00%  2.851ms  950.289us  3
+Activity Buffer Request  32.53%  1.450ms  32.53%  1.450ms  1.450ms  970.364us  34.04%  970.364us  970.364us  1
+cudaDeviceGetAttribute  0.10%  4.520us  0.10%  4.520us  0.301us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_like  0.46%  20.440us  1.29%  57.461us  19.154us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty_strided  0.83%  37.021us  0.83%  37.021us  12.340us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty  0.76%  33.730us  0.76%  33.730us  3.748us  0.000us  0.00%  0.000us  0.000us  9
+cudaFuncSetAttribute  0.29%  12.870us  0.29%  12.870us  4.290us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  0.97%  43.280us  0.97%  43.280us  14.427us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  58.46%  2.606ms  58.46%  2.606ms  2.606ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.458ms
+Self CUDA time total: 2.851ms

------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn  2.32%  104.162us  37.24%  1.676ms  1.676ms  0.000us  0.00%  4.000ms  4.000ms  1
+_flash_attn_9e27194::fwd  1.05%  47.052us  34.93%  1.571ms  523.812us  2.988ms  100.00%  4.000ms  1.333ms  3
+hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us  2.989ms  100.04%  2.989ms  2.989ms  1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us  2.988ms  100.00%  2.988ms  995.942us  3
+Activity Buffer Request  32.02%  1.441ms  32.02%  1.441ms  1.441ms  1.012ms  33.87%  1.012ms  1.012ms  1
+cudaDeviceGetAttribute  0.10%  4.331us  0.10%  4.331us  0.289us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_like  0.16%  7.210us  0.52%  23.350us  7.783us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty_strided  0.36%  16.140us  0.36%  16.140us  5.380us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty  0.47%  21.320us  0.47%  21.320us  2.369us  0.000us  0.00%  0.000us  0.000us  9
+cudaFuncSetAttribute  0.10%  4.349us  0.10%  4.349us  1.450us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  0.67%  30.329us  0.67%  30.329us  10.110us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  62.76%  2.824ms  62.76%  2.824ms  2.824ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.499ms
+Self CUDA time total: 2.988ms

------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn  2.58%  116.241us  37.17%  1.677ms  1.677ms  0.000us  0.00%  4.040ms  4.040ms  1
+_flash_attn_9e27194::fwd  1.11%  49.909us  34.60%  1.561ms  520.326us  3.012ms  100.00%  4.040ms  1.347ms  3
+hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us  3.013ms  100.04%  3.013ms  3.013ms  1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us  3.012ms  100.00%  3.012ms  1.004ms  3
+Activity Buffer Request  31.60%  1.426ms  31.60%  1.426ms  1.426ms  1.029ms  34.16%  1.029ms  1.029ms  1
+cudaDeviceGetAttribute  0.08%  3.801us  0.08%  3.801us  0.253us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_like  0.18%  8.151us  0.55%  24.960us  8.320us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty_strided  0.37%  16.809us  0.37%  16.809us  5.603us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty  0.47%  21.201us  0.47%  21.201us  2.356us  0.000us  0.00%  0.000us  0.000us  9
+cudaFuncSetAttribute  0.09%  3.950us  0.09%  3.950us  1.317us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  0.69%  31.260us  0.69%  31.260us  10.420us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  62.83%  2.835ms  62.83%  2.835ms  2.835ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.512ms
+Self CUDA time total: 3.012ms

------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn  2.01%  99.212us  38.53%  1.898ms  1.898ms  0.000us  0.00%  4.264ms  4.264ms  1
+_flash_attn_9e27194::fwd  1.06%  52.152us  36.51%  1.799ms  599.723us  3.190ms  100.00%  4.264ms  1.421ms  3
+hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us  3.191ms  100.05%  3.191ms  3.191ms  1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us  3.190ms  100.00%  3.190ms  1.063ms  3
+Activity Buffer Request  28.82%  1.420ms  28.82%  1.420ms  1.420ms  1.074ms  33.68%  1.074ms  1.074ms  1
+cudaDeviceGetAttribute  0.09%  4.479us  0.09%  4.479us  0.299us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_like  0.16%  7.900us  0.54%  26.470us  8.823us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty_strided  0.38%  18.570us  0.38%  18.570us  6.190us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty  0.46%  22.430us  0.46%  22.430us  2.492us  0.000us  0.00%  0.000us  0.000us  9
+cudaFuncSetAttribute  0.08%  3.830us  0.08%  3.830us  1.277us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  5.47%  269.763us  5.47%  269.763us  89.921us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  61.47%  3.029ms  61.47%  3.029ms  3.029ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.928ms
+Self CUDA time total: 3.190ms

------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn  2.16%  88.971us  14.91%  614.057us  614.057us  0.000us  0.00%  4.875ms  4.875ms  1
+_flash_attn_9e27194::fwd  1.23%  50.539us  12.75%  525.086us  175.029us  3.652ms  100.00%  4.875ms  1.625ms  3
+hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us  3.653ms  100.04%  3.653ms  3.653ms  1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us  3.652ms  100.00%  3.652ms  1.217ms  3
+Activity Buffer Request  5.08%  209.112us  5.08%  209.112us  209.112us  1.223ms  33.50%  1.223ms  1.223ms  1
+cudaDeviceGetAttribute  0.10%  3.960us  0.10%  3.960us  0.264us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_like  0.19%  7.749us  0.60%  24.700us  8.233us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty_strided  0.41%  16.951us  0.41%  16.951us  5.650us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty  0.54%  22.121us  0.54%  22.121us  2.458us  0.000us  0.00%  0.000us  0.000us  9
+cudaFuncSetAttribute  0.10%  4.190us  0.10%  4.190us  1.397us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  5.11%  210.464us  5.11%  210.464us  70.155us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  85.09%  3.504ms  85.09%  3.504ms  3.504ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.118ms
+Self CUDA time total: 3.652ms

------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn  2.23%  91.402us  14.65%  600.857us  600.857us  0.000us  0.00%  4.881ms  4.881ms  1
+_flash_attn_9e27194::fwd  1.15%  47.191us  12.42%  509.455us  169.818us  3.654ms  100.00%  4.881ms  1.627ms  3
+hf_kernels_flash_attn  0.00%  0.000us  0.00%  0.000us  0.000us  3.655ms  100.04%  3.655ms  3.655ms  1
+void flash::flash_fwd_kernel<Flash_fwd_kernel_traits...  0.00%  0.000us  0.00%  0.000us  0.000us  3.654ms  100.00%  3.654ms  1.218ms  3
+Activity Buffer Request  5.38%  220.623us  5.38%  220.623us  220.623us  1.227ms  33.59%  1.227ms  1.227ms  1
+cudaDeviceGetAttribute  0.09%  3.601us  0.09%  3.601us  0.240us  0.000us  0.00%  0.000us  0.000us  15
+aten::empty_like  0.18%  7.230us  0.58%  23.840us  7.947us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty_strided  0.40%  16.610us  0.40%  16.610us  5.537us  0.000us  0.00%  0.000us  0.000us  3
+aten::empty  0.51%  20.851us  0.51%  20.851us  2.317us  0.000us  0.00%  0.000us  0.000us  9
+cudaFuncSetAttribute  0.09%  3.688us  0.09%  3.688us  1.229us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  4.62%  189.661us  4.62%  189.661us  63.220us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  85.35%  3.502ms  85.35%  3.502ms  3.502ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.103ms
+Self CUDA time total: 3.654ms

impl  wl  p50(ms)  ok
+hf_kernels_flash_attn  cuda_attn_L128_bfloat16  0.98  True
+hf_kernels_flash_attn  cuda_attn_L256_bfloat16  1.02  True
+hf_kernels_flash_attn  cuda_attn_L320_bfloat16  1.05  True
+hf_kernels_flash_attn  cuda_attn_L384_bfloat16  1.07  True
+hf_kernels_flash_attn  cuda_attn_L448_bfloat16  1.23  True
+hf_kernels_flash_attn  cuda_attn_L512_bfloat16  1.23  True
</pre></div>
<div class="uv-install-logs" id="uv-logs-benchmark">
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
<div class="uv-logs-content" style="display: none;">
+Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
Downloading hf-xet (3.2MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
Downloading networkx (1.9MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading pillow (6.7MiB)
Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading sympy (6.0MiB)
+Downloading setuptools (1.1MiB)
+Downloading matplotlib (8.3MiB)
+Downloading numpy (16.2MiB)
+Downloading triton (148.3MiB)
Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading torch (846.9MiB)
Downloading nvidia-cufile-cu12
Downloading kiwisolver
Downloading hf-xet
Downloading setuptools
Downloading networkx
+Downloading fonttools
Downloading pillow
+Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
Downloading nvidia-cuda-cupti-cu12
Downloading matplotlib
Downloading numpy
Downloading sympy
+Downloading nvidia-nvjitlink-cu12
Downloading nvidia-curand-cu12
Downloading nvidia-cuda-nvrtc-cu12
Downloading triton
Downloading nvidia-cufft-cu12
Downloading nvidia-cusolver-cu12
Downloading nvidia-cusparselt-cu12
+Downloading nvidia-cusparse-cu12
Downloading nvidia-nccl-cu12
Downloading nvidia-cublas-cu12
Downloading nvidia-cudnn-cu12
Downloading torch
+Installed 52 packages in 223ms
</div>
</div>
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
+Fetching 20 files: 10%|█ | 2/20 [00:01<00:12, 1.43it/s]
+Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 14.34it/s]</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
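The _flash_attn_9e27194::fwd rows above come from a pre-built extension fetched off the Hub via the kernels package. A hedged sketch of how such an implementation is typically wired up; the repo id and the fwd call signature are assumptions for illustration, not taken from this diff:

    import torch
    from kernels import get_kernel

    # Assumed repo id; the trace only shows that the loaded extension
    # registers a custom op named _flash_attn_<hash>::fwd.
    flash_attn = get_kernel("kernels-community/flash-attn")

    q = torch.randn(1, 512, 16, 64, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)
    # Exact signature assumed; consult the kernel repo for the authoritative one.
    out = flash_attn.fwd(q, k, v)[0]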
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<div class="system-info">
<div class="system-info-header">Generated on:</div>
<div class="system-info-content">
-Linux x86_64 | Linux-5.
</div>
</div>

@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<span class="collapse-indicators">
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
-<span id="uv-indicator-benchmark"
</span> |
-Cell: benchmark |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3925,19 +3925,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn3
-FlashAttnFunc
-_flash_attn3_48fe103_dirty::fwd
-hf_kernels_flash_attn3  0.00%  0.000us  0.00%  0.000us  0.000us
-void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us
-Activity Buffer Request
-aten::empty
-cudaFuncSetAttribute  0.
-cudaLaunchKernel
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:

@@ -3947,19 +3947,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-
-
-
-hf_kernels_flash_attn3  0.00%  0.000us  0.00%  0.000us  0.000us
-void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us
-Activity Buffer Request
-aten::empty  0.
-cudaFuncSetAttribute  0.
-cudaLaunchKernel  0.
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:

@@ -3969,19 +3969,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn3
-FlashAttnFunc
-_flash_attn3_48fe103_dirty::fwd
-hf_kernels_flash_attn3  0.00%  0.000us  0.00%  0.000us  0.000us
-void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us
-Activity Buffer Request
-aten::empty  0.
-cudaFuncSetAttribute  0.
-cudaLaunchKernel  0.
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:

@@ -3991,19 +3991,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn3
-FlashAttnFunc
-_flash_attn3_48fe103_dirty::fwd
-hf_kernels_flash_attn3  0.00%  0.000us  0.00%  0.000us  0.000us
-void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us
-Activity Buffer Request
-aten::empty  0.
-cudaFuncSetAttribute  0.
-cudaLaunchKernel
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:

@@ -4013,19 +4013,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn3
-FlashAttnFunc
-_flash_attn3_48fe103_dirty::fwd  0.
-hf_kernels_flash_attn3  0.00%  0.000us  0.00%  0.000us  0.000us
-void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us
-Activity Buffer Request
-aten::empty  0.
-cudaFuncSetAttribute  0.
-cudaLaunchKernel
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:

@@ -4035,87 +4035,34 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-hf_kernels_flash_attn3
-FlashAttnFunc
-_flash_attn3_48fe103_dirty::fwd
-hf_kernels_flash_attn3  0.00%  0.000us  0.00%  0.000us  0.000us
-void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us
-Activity Buffer Request
-aten::empty  0.
-cudaFuncSetAttribute  0.
-cudaLaunchKernel
-cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
-Self CPU time total:
-Self CUDA time total:

impl  wl  p50(ms)  ok
-hf_kernels_flash_attn3  cuda_attn_L128_bfloat16
-hf_kernels_flash_attn3  cuda_attn_L256_bfloat16
-hf_kernels_flash_attn3  cuda_attn_L320_bfloat16
-hf_kernels_flash_attn3  cuda_attn_L384_bfloat16
-hf_kernels_flash_attn3  cuda_attn_L448_bfloat16
-hf_kernels_flash_attn3  cuda_attn_L512_bfloat16
</pre></div>
-<div class="
-
-
-
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading sympy (6.0MiB)
-Downloading setuptools (1.1MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading numpy (15.9MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading matplotlib (8.3MiB)
-Downloading hf-xet (3.2MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading pillow (6.7MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading networkx (1.9MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading triton (148.4MiB)
-Downloading fonttools (4.7MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading torch (846.8MiB)
-Downloading nvidia-cufile-cu12
-Downloading kiwisolver
-Downloading hf-xet
-Downloading setuptools
-Downloading networkx
-Downloading fonttools
-Downloading pillow
-Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading nvidia-cuda-cupti-cu12
-Downloading matplotlib
-Downloading numpy
-Downloading sympy
-Downloading nvidia-nvjitlink-cu12
-Downloading nvidia-curand-cu12
-Downloading nvidia-cuda-nvrtc-cu12
-Downloading triton
-Downloading nvidia-cufft-cu12
-Downloading nvidia-cusolver-cu12
-Downloading nvidia-cusparse-cu12
-Downloading nvidia-cusparselt-cu12
-Downloading nvidia-nccl-cu12
-Downloading nvidia-cublas-cu12
-Downloading nvidia-cudnn-cu12
-Downloading torch
-Installed 47 packages in 222ms
</div>
-</div>
-<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
-Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 7.95it/s]
-Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.15it/s]
-Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.64it/s]</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
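The FlashAttnFunc rows in the flash-attn3 traces below are an autograd.Function wrapper around the compiled fwd op. A self-contained sketch of that wrapper pattern, with plain reference attention math standing in for the _flash_attn3 kernel (an assumption, not the kernel's actual code):

    import math
    import torch

    class FlashAttnFunc(torch.autograd.Function):
        @staticmethod
        def forward(ctx, q, k, v):
            # Reference softmax attention in place of the compiled fwd op.
            scale = 1.0 / math.sqrt(q.shape[-1])
            attn = torch.softmax((q @ k.transpose(-2, -1)) * scale, dim=-1)
            return attn @ v

        @staticmethod
        def backward(ctx, grad_out):
            raise NotImplementedError("forward-only sketch")

    q = k = v = torch.randn(1, 8, 128, 64)
    out = FlashAttnFunc.apply(q, k, v)  # shows up as one "FlashAttnFunc" profiler row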
<div class="system-info">
<div class="system-info-header">Generated on:</div>
<div class="system-info-content">
+Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
</div>
</div>

<span class="collapse-indicators">
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
</span> |
+Cell: benchmark | 5.62s
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn3  3.90%  171.143us  44.22%  1.941ms  1.941ms  0.000us  0.00%  3.653ms  3.653ms  1
+FlashAttnFunc  2.92%  128.011us  40.32%  1.769ms  589.788us  0.000us  0.00%  3.653ms  1.218ms  3
+_flash_attn3_48fe103_dirty::fwd  1.90%  83.422us  37.41%  1.641ms  547.118us  2.755ms  100.00%  3.653ms  1.218ms  3
+hf_kernels_flash_attn3  0.00%  0.000us  0.00%  0.000us  0.000us  2.756ms  100.05%  2.756ms  2.756ms  1
+void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us  2.755ms  100.00%  2.755ms  918.306us  3
+Activity Buffer Request  33.13%  1.454ms  33.13%  1.454ms  1.454ms  898.082us  32.60%  898.082us  898.082us  1
+aten::empty  1.02%  44.762us  1.02%  44.762us  7.460us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.33%  14.660us  0.33%  14.660us  4.887us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  1.02%  44.660us  1.02%  44.660us  14.887us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  55.78%  2.447ms  55.78%  2.447ms  2.447ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.388ms
+Self CUDA time total: 2.755ms

------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn3  2.42%  105.470us  40.03%  1.743ms  1.743ms  0.000us  0.00%  3.784ms  3.784ms  1
+FlashAttnFunc  2.12%  92.121us  37.61%  1.638ms  546.005us  0.000us  0.00%  3.784ms  1.261ms  3
+_flash_attn3_48fe103_dirty::fwd  1.23%  53.460us  35.49%  1.546ms  515.298us  2.836ms  100.00%  3.784ms  1.261ms  3
+hf_kernels_flash_attn3  0.00%  0.000us  0.00%  0.000us  0.000us  2.838ms  100.05%  2.838ms  2.838ms  1
+void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us  2.836ms  100.00%  2.836ms  945.359us  3
+Activity Buffer Request  32.85%  1.431ms  32.85%  1.431ms  1.431ms  947.652us  33.41%  947.652us  947.652us  1
+aten::empty  0.62%  27.052us  0.62%  27.052us  4.509us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.11%  4.721us  0.11%  4.721us  1.574us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  0.68%  29.730us  0.68%  29.730us  9.910us  0.000us  0.00%  0.000us  0.000us  3
+cudaDeviceSynchronize  59.97%  2.612ms  59.97%  2.612ms  2.612ms  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.355ms
+Self CUDA time total: 2.836ms

------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+hf_kernels_flash_attn3  2.34%  104.112us  39.68%  1.767ms  1.767ms  0.000us  0.00%  3.931ms  3.931ms  1
+FlashAttnFunc  2.59%  115.143us  37.35%  1.662ms  554.155us  0.000us  0.00%  3.931ms  1.310ms  3
+_flash_attn3_48fe103_dirty::fwd  1.23%  54.772us  34.76%  1.547ms  515.774us  2.932ms  100.00%  3.931ms  1.310ms  3
+hf_kernels_flash_attn3  0.00%  0.000us  0.00%  0.000us  0.000us  2.934ms  100.05%  2.934ms  2.934ms  1
+void cutlass::device_kernel<flash::enable_sm80_to_sm...  0.00%  0.000us  0.00%  0.000us  0.000us  2.932ms  100.00%  2.932ms  977.432us  3
+Activity Buffer Request  32.05%  1.427ms  32.05%  1.427ms  1.427ms  998.487us  34.05%  998.487us  998.487us  1
+aten::empty  0.66%  29.309us  0.66%  29.309us  4.885us  0.000us  0.00%  0.000us  0.000us  6
+cudaFuncSetAttribute  0.11%  4.840us  0.11%  4.840us  1.613us  0.000us  0.00%  0.000us  0.000us  3
+cudaLaunchKernel  0.71%  31.520us  0.71%  31.520us  10.507us  0.000us  0.00%  0.000us  0.000us  3
|
| 3981 |
+
cudaDeviceSynchronize 60.32% 2.685ms 60.32% 2.685ms 2.685ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3982 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3983 |
+
Self CPU time total: 4.452ms
|
| 3984 |
+
Self CUDA time total: 2.932ms
|
| 3985 |
|
| 3986 |
|
| 3987 |
|
|
|
|
| 3991 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3992 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
+
hf_kernels_flash_attn3 2.48% 118.391us 41.58% 1.983ms 1.983ms 0.000us 0.00% 4.029ms 4.029ms 1
|
| 3995 |
+
FlashAttnFunc 2.00% 95.232us 39.09% 1.865ms 621.579us 0.000us 0.00% 4.029ms 1.343ms 3
|
| 3996 |
+
_flash_attn3_48fe103_dirty::fwd 1.18% 56.301us 37.10% 1.770ms 589.835us 3.014ms 100.00% 4.029ms 1.343ms 3
|
| 3997 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.016ms 100.06% 3.016ms 3.016ms 1
|
| 3998 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.014ms 100.00% 3.014ms 1.005ms 3
|
| 3999 |
+
Activity Buffer Request 30.19% 1.440ms 30.19% 1.440ms 1.440ms 1.015ms 33.67% 1.015ms 1.015ms 1
|
| 4000 |
+
aten::empty 0.58% 27.710us 0.58% 27.710us 4.618us 0.000us 0.00% 0.000us 0.000us 6
|
| 4001 |
+
cudaFuncSetAttribute 0.10% 4.771us 0.10% 4.771us 1.590us 0.000us 0.00% 0.000us 0.000us 3
|
| 4002 |
+
cudaLaunchKernel 5.05% 240.873us 5.05% 240.873us 80.291us 0.000us 0.00% 0.000us 0.000us 3
|
| 4003 |
+
cudaDeviceSynchronize 58.42% 2.787ms 58.42% 2.787ms 2.787ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4004 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4005 |
+
Self CPU time total: 4.770ms
|
| 4006 |
+
Self CUDA time total: 3.014ms
|
| 4007 |
|
| 4008 |
|
| 4009 |
|
|
|
|
| 4013 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4014 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4015 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4016 |
+
hf_kernels_flash_attn3 2.45% 127.821us 37.14% 1.937ms 1.937ms 0.000us 0.00% 4.669ms 4.669ms 1
|
| 4017 |
+
FlashAttnFunc 1.78% 92.961us 34.69% 1.809ms 603.079us 0.000us 0.00% 4.669ms 1.556ms 3
|
| 4018 |
+
_flash_attn3_48fe103_dirty::fwd 0.98% 50.990us 32.91% 1.716ms 572.092us 3.496ms 100.00% 4.669ms 1.556ms 3
|
| 4019 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.498ms 100.05% 3.498ms 3.498ms 1
|
| 4020 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.496ms 100.00% 3.496ms 1.165ms 3
|
| 4021 |
+
Activity Buffer Request 27.66% 1.443ms 27.66% 1.443ms 1.443ms 1.173ms 33.56% 1.173ms 1.173ms 1
|
| 4022 |
+
aten::empty 0.56% 28.951us 0.56% 28.951us 4.825us 0.000us 0.00% 0.000us 0.000us 6
|
| 4023 |
+
cudaFuncSetAttribute 0.09% 4.870us 0.09% 4.870us 1.623us 0.000us 0.00% 0.000us 0.000us 3
|
| 4024 |
+
cudaLaunchKernel 3.62% 188.673us 3.62% 188.673us 62.891us 0.000us 0.00% 0.000us 0.000us 3
|
| 4025 |
+
cudaDeviceSynchronize 62.86% 3.279ms 62.86% 3.279ms 3.279ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4026 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4027 |
+
Self CPU time total: 5.216ms
|
| 4028 |
+
Self CUDA time total: 3.496ms
|
| 4029 |
|
| 4030 |
|
| 4031 |
|
|
|
|
| 4035 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4036 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4037 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4038 |
+
hf_kernels_flash_attn3 2.26% 115.651us 36.11% 1.844ms 1.844ms 0.000us 0.00% 4.648ms 4.648ms 1
|
| 4039 |
+
FlashAttnFunc 1.78% 91.130us 33.84% 1.728ms 576.085us 0.000us 0.00% 4.648ms 1.549ms 3
|
| 4040 |
+
_flash_attn3_48fe103_dirty::fwd 1.06% 54.250us 32.06% 1.637ms 545.708us 3.480ms 100.00% 4.648ms 1.549ms 3
|
| 4041 |
+
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.481ms 100.04% 3.481ms 3.481ms 1
|
| 4042 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.480ms 100.00% 3.480ms 1.160ms 3
|
| 4043 |
+
Activity Buffer Request 27.00% 1.379ms 27.00% 1.379ms 1.379ms 1.168ms 33.58% 1.168ms 1.168ms 1
|
| 4044 |
+
aten::empty 0.55% 28.142us 0.55% 28.142us 4.690us 0.000us 0.00% 0.000us 0.000us 6
|
| 4045 |
+
cudaFuncSetAttribute 0.10% 5.261us 0.10% 5.261us 1.754us 0.000us 0.00% 0.000us 0.000us 3
|
| 4046 |
+
cudaLaunchKernel 3.35% 170.883us 3.35% 170.883us 56.961us 0.000us 0.00% 0.000us 0.000us 3
|
| 4047 |
+
cudaDeviceSynchronize 63.89% 3.263ms 63.89% 3.263ms 3.263ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
+
Self CPU time total: 5.107ms
|
| 4050 |
+
Self CUDA time total: 3.480ms
|
| 4051 |
|
| 4052 |
|
| 4053 |
impl wl p50(ms) ok
|
| 4054 |
+
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.95 True
|
| 4055 |
+
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
|
| 4056 |
+
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
|
| 4057 |
+
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.04 True
|
| 4058 |
+
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.21 True
|
| 4059 |
+
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
|
| 4060 |
</pre></div>
|
| 4061 |
+
<div class="cell-stderr">
|
| 4062 |
+
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4063 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.33it/s]
|
| 4064 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.66it/s]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4065 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4066 |
<div class="cell-artifacts">
|
| 4067 |
<h4>Artifacts:</h4>
|
| 4068 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
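The trace tables above are standard `torch.profiler` key-average summaries captured by the benchmark cell. For readers who want to reproduce a table of this shape outside the harness, a minimal sketch (assumes only a CUDA-capable `torch`; the shapes and the `record_function` label are illustrative, not the harness's actual code):

```python
# Minimal sketch: emit a profiler table like the ones above.
# Shapes and the label are illustrative assumptions.
import torch
from torch.profiler import profile, record_function, ProfilerActivity

q = k = v = torch.randn(1, 8, 512, 64, device="cuda", dtype=torch.bfloat16)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("hf_kernels_flash_attn3"):   # appears as the top row
        for _ in range(3):                            # feeds the "# of Calls" column
            out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
    torch.cuda.synchronize()                          # the cudaDeviceSynchronize row

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
```

Sorting by `cuda_time_total` puts the attention kernel first, which is why the cutlass kernel rows dominate the tables above.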
flash_attn/impls/mem_efficient_attention.html CHANGED

@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <div class="system-info">
  <div class="system-info-header">Generated on:</div>
  <div class="system-info-content">
- Linux x86_64 | Linux-5.
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
  </div>
  </div>
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <span class="collapse-indicators">
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark"
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
- Cell: benchmark |
+ Cell: benchmark | 4.02s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3924,28 +3924,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff
- torch_mem_eff  0.00%  0.000us  0.00%  0.000us  0.000us
- aten::scaled_dot_product_attention  0.
- aten::_scaled_dot_product_efficient_attention  0.
- aten::_efficient_attention_forward  0.
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...  0.00%  0.000us  0.00%  0.000us  0.000us
- aten::contiguous  0.
- aten::clone  0.
- aten::copy_
- void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us
- Activity Buffer Request
- aten::transpose  0.
- aten::as_strided  0.
- aten::empty_like  0.
- aten::empty
- cudaLaunchKernel
- cudaStreamIsCapturing  0.
- cudaFuncSetAttribute  0.
- cudaDeviceSynchronize
+ torch_mem_eff  4.61%  329.029us  32.49%  2.320ms  2.320ms  0.000us  0.00%  5.545ms  5.545ms  1
+ torch_mem_eff  0.00%  0.000us  0.00%  0.000us  0.000us  5.524ms  100.54%  5.524ms  5.524ms  1
+ aten::scaled_dot_product_attention  0.42%  29.860us  2.75%  196.242us  65.414us  0.000us  0.00%  4.878ms  1.626ms  3
+ aten::_scaled_dot_product_efficient_attention  0.35%  25.230us  2.33%  166.382us  55.461us  0.000us  0.00%  4.878ms  1.626ms  3
+ aten::_efficient_attention_forward  0.73%  52.049us  1.68%  119.861us  39.954us  4.878ms  88.79%  4.878ms  1.626ms  3
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...  0.00%  0.000us  0.00%  0.000us  0.000us  4.878ms  88.79%  4.878ms  1.626ms  3
+ aten::contiguous  0.18%  13.143us  24.28%  1.734ms  192.643us  0.000us  0.00%  666.300us  74.033us  9
+ aten::clone  0.50%  35.608us  24.09%  1.721ms  191.183us  0.000us  0.00%  666.300us  74.033us  9
+ aten::copy_  1.01%  71.952us  22.59%  1.613ms  179.214us  615.708us  11.21%  666.300us  74.033us  9
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  615.708us  11.21%  615.708us  68.412us  9
+ Activity Buffer Request  20.33%  1.452ms  20.33%  1.452ms  1.452ms  50.592us  0.92%  50.592us  50.592us  1
+ aten::transpose  0.87%  61.994us  1.16%  82.494us  3.437us  0.000us  0.00%  0.000us  0.000us  24
+ aten::as_strided  0.29%  20.500us  0.29%  20.500us  0.854us  0.000us  0.00%  0.000us  0.000us  24
+ aten::empty_like  0.25%  17.742us  1.01%  72.112us  8.012us  0.000us  0.00%  0.000us  0.000us  9
+ aten::empty  1.17%  83.610us  1.17%  83.610us  3.981us  0.000us  0.00%  0.000us  0.000us  21
+ cudaLaunchKernel  1.60%  114.582us  1.60%  114.582us  9.548us  0.000us  0.00%  0.000us  0.000us  12
+ cudaStreamIsCapturing  0.04%  3.180us  0.04%  3.180us  1.060us  0.000us  0.00%  0.000us  0.000us  3
+ cudaFuncSetAttribute  0.14%  10.280us  0.14%  10.280us  3.427us  0.000us  0.00%  0.000us  0.000us  3
+ cudaDeviceSynchronize  67.51%  4.821ms  67.51%  4.821ms  4.821ms  0.000us  0.00%  0.000us  0.000us  1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total:
- Self CUDA time total:
+ Self CPU time total: 7.141ms
+ Self CUDA time total: 5.494ms
@@ -3955,28 +3955,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff
- torch_mem_eff  0.00%  0.000us  0.00%  0.000us  0.000us
- aten::scaled_dot_product_attention  0.
- aten::_scaled_dot_product_efficient_attention  0.
- aten::_efficient_attention_forward  0.
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...  0.00%  0.000us  0.00%  0.000us  0.000us
- aten::contiguous  0.
- aten::clone  0.
- aten::copy_  0.
- void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us
- Activity Buffer Request
- aten::transpose  0.
- aten::as_strided  0.
- aten::empty_like  0.
- aten::empty  0.
- cudaLaunchKernel
- cudaStreamIsCapturing  0.
- cudaFuncSetAttribute  0.
- cudaDeviceSynchronize
+ torch_mem_eff  3.39%  253.102us  28.13%  2.097ms  2.097ms  0.000us  0.00%  5.972ms  5.972ms  1
+ torch_mem_eff  0.00%  0.000us  0.00%  0.000us  0.000us  5.926ms  100.15%  5.926ms  5.926ms  1
+ aten::scaled_dot_product_attention  0.26%  19.190us  1.92%  143.113us  47.704us  0.000us  0.00%  5.278ms  1.759ms  3
+ aten::_scaled_dot_product_efficient_attention  0.26%  19.540us  1.66%  123.923us  41.308us  0.000us  0.00%  5.278ms  1.759ms  3
+ aten::_efficient_attention_forward  0.37%  27.385us  1.10%  81.652us  27.217us  5.278ms  89.20%  5.278ms  1.759ms  3
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...  0.00%  0.000us  0.00%  0.000us  0.000us  5.278ms  89.20%  5.278ms  1.759ms  3
+ aten::contiguous  0.09%  6.999us  22.26%  1.660ms  184.423us  0.000us  0.00%  693.503us  77.056us  9
+ aten::clone  0.31%  23.031us  22.17%  1.653ms  183.645us  0.000us  0.00%  693.503us  77.056us  9
+ aten::copy_  0.83%  61.989us  21.18%  1.579ms  175.477us  638.911us  10.80%  693.503us  77.056us  9
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  638.911us  10.80%  638.911us  70.990us  9
+ Activity Buffer Request  19.45%  1.450ms  19.45%  1.450ms  1.450ms  54.592us  0.92%  54.592us  54.592us  1
+ aten::transpose  0.64%  47.641us  0.86%  64.101us  2.671us  0.000us  0.00%  0.000us  0.000us  24
+ aten::as_strided  0.22%  16.460us  0.22%  16.460us  0.686us  0.000us  0.00%  0.000us  0.000us  24
+ aten::empty_like  0.16%  11.730us  0.68%  50.483us  5.609us  0.000us  0.00%  0.000us  0.000us  9
+ aten::empty  0.86%  64.470us  0.86%  64.470us  3.070us  0.000us  0.00%  0.000us  0.000us  21
+ cudaLaunchKernel  1.21%  90.240us  1.21%  90.240us  7.520us  0.000us  0.00%  0.000us  0.000us  12
+ cudaStreamIsCapturing  0.03%  2.290us  0.03%  2.290us  0.763us  0.000us  0.00%  0.000us  0.000us  3
+ cudaFuncSetAttribute  0.04%  3.130us  0.04%  3.130us  1.043us  0.000us  0.00%  0.000us  0.000us  3
+ cudaDeviceSynchronize  71.87%  5.359ms  71.87%  5.359ms  5.359ms  0.000us  0.00%  0.000us  0.000us  1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total:
- Self CUDA time total:
+ Self CPU time total: 7.456ms
+ Self CUDA time total: 5.917ms
@@ -3986,28 +3986,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff
- torch_mem_eff  0.00%  0.000us  0.00%  0.000us  0.000us
- aten::scaled_dot_product_attention  0.
- aten::_scaled_dot_product_efficient_attention  0.
- aten::_efficient_attention_forward  0.
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...  0.00%  0.000us  0.00%  0.000us  0.000us
- aten::contiguous  0.
- aten::clone  0.
- aten::copy_  0.
- void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us
- Activity Buffer Request
- aten::transpose  0.
- aten::as_strided  0.
- aten::empty_like  0.
- aten::empty  0.
- cudaLaunchKernel
- cudaStreamIsCapturing  0.
- cudaFuncSetAttribute  0.
- cudaDeviceSynchronize
+ torch_mem_eff  3.16%  240.823us  26.89%  2.051ms  2.051ms  0.000us  0.00%  6.167ms  6.167ms  1
+ torch_mem_eff  0.00%  0.000us  0.00%  0.000us  0.000us  6.117ms  100.14%  6.117ms  6.117ms  1
+ aten::scaled_dot_product_attention  0.24%  18.220us  1.81%  137.732us  45.911us  0.000us  0.00%  5.453ms  1.818ms  3
+ aten::_scaled_dot_product_efficient_attention  0.24%  18.402us  1.57%  119.512us  39.837us  0.000us  0.00%  5.453ms  1.818ms  3
+ aten::_efficient_attention_forward  0.35%  26.389us  1.04%  79.670us  26.557us  5.453ms  89.28%  5.453ms  1.818ms  3
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...  0.00%  0.000us  0.00%  0.000us  0.000us  5.453ms  89.28%  5.453ms  1.818ms  3
+ aten::contiguous  0.09%  6.950us  21.38%  1.630ms  181.132us  0.000us  0.00%  713.534us  79.282us  9
+ aten::clone  0.28%  21.189us  21.28%  1.623ms  180.360us  0.000us  0.00%  713.534us  79.282us  9
+ aten::copy_  0.81%  62.032us  20.34%  1.551ms  172.330us  655.038us  10.72%  713.534us  79.282us  9
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  655.038us  10.72%  655.038us  72.782us  9
+ Activity Buffer Request  18.63%  1.421ms  18.63%  1.421ms  1.421ms  58.496us  0.96%  58.496us  58.496us  1
+ aten::transpose  0.62%  47.348us  0.84%  63.699us  2.654us  0.000us  0.00%  0.000us  0.000us  24
+ aten::as_strided  0.21%  16.351us  0.21%  16.351us  0.681us  0.000us  0.00%  0.000us  0.000us  24
+ aten::empty_like  0.15%  11.091us  0.67%  51.081us  5.676us  0.000us  0.00%  0.000us  0.000us  9
+ aten::empty  0.86%  65.760us  0.86%  65.760us  3.131us  0.000us  0.00%  0.000us  0.000us  21
+ cudaLaunchKernel  1.18%  89.982us  1.18%  89.982us  7.498us  0.000us  0.00%  0.000us  0.000us  12
+ cudaStreamIsCapturing  0.03%  2.210us  0.03%  2.210us  0.737us  0.000us  0.00%  0.000us  0.000us  3
+ cudaFuncSetAttribute  0.04%  3.100us  0.04%  3.100us  1.033us  0.000us  0.00%  0.000us  0.000us  3
+ cudaDeviceSynchronize  73.11%  5.575ms  73.11%  5.575ms  5.575ms  0.000us  0.00%  0.000us  0.000us  1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total:
- Self CUDA time total:
+ Self CPU time total: 7.626ms
+ Self CUDA time total: 6.108ms
@@ -4017,28 +4017,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff
- torch_mem_eff  0.00%  0.000us  0.00%  0.000us  0.000us
- aten::scaled_dot_product_attention  0.
- aten::_scaled_dot_product_efficient_attention  0.
- aten::_efficient_attention_forward  0.
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...  0.00%  0.000us  0.00%  0.000us  0.000us
- aten::contiguous  0.
- aten::clone  0.
- aten::copy_
- void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us
- Activity Buffer Request
- aten::transpose  0.
- aten::as_strided  0.
- aten::empty_like  0.
- aten::empty
- cudaLaunchKernel
- cudaStreamIsCapturing  0.
- cudaFuncSetAttribute  0.
- cudaDeviceSynchronize
+ torch_mem_eff  4.44%  356.182us  33.00%  2.648ms  2.648ms  0.000us  0.00%  6.210ms  6.210ms  1
+ torch_mem_eff  0.00%  0.000us  0.00%  0.000us  0.000us  6.165ms  100.21%  6.165ms  6.165ms  1
+ aten::scaled_dot_product_attention  0.29%  23.400us  2.31%  185.263us  61.754us  0.000us  0.00%  5.497ms  1.832ms  3
+ aten::_scaled_dot_product_efficient_attention  0.29%  23.202us  2.02%  161.863us  53.954us  0.000us  0.00%  5.497ms  1.832ms  3
+ aten::_efficient_attention_forward  0.44%  35.239us  1.36%  108.811us  36.270us  5.497ms  89.36%  5.497ms  1.832ms  3
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...  0.00%  0.000us  0.00%  0.000us  0.000us  5.497ms  89.36%  5.497ms  1.832ms  3
+ aten::contiguous  0.11%  9.040us  25.54%  2.050ms  227.726us  0.000us  0.00%  712.735us  79.193us  9
+ aten::clone  0.35%  28.461us  25.43%  2.040ms  226.722us  0.000us  0.00%  712.735us  79.193us  9
+ aten::copy_  1.02%  82.020us  24.22%  1.944ms  215.993us  654.527us  10.64%  712.735us  79.193us  9
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  654.527us  10.64%  654.527us  72.725us  9
+ Activity Buffer Request  19.35%  1.553ms  19.35%  1.553ms  1.553ms  58.208us  0.95%  58.208us  58.208us  1
+ aten::transpose  0.81%  64.960us  1.09%  87.330us  3.639us  0.000us  0.00%  0.000us  0.000us  24
+ aten::as_strided  0.28%  22.370us  0.28%  22.370us  0.932us  0.000us  0.00%  0.000us  0.000us  24
+ aten::empty_like  0.19%  15.081us  0.85%  68.092us  7.566us  0.000us  0.00%  0.000us  0.000us  9
+ aten::empty  1.09%  87.522us  1.09%  87.522us  4.168us  0.000us  0.00%  0.000us  0.000us  21
+ cudaLaunchKernel  4.25%  341.154us  4.25%  341.154us  28.429us  0.000us  0.00%  0.000us  0.000us  12
+ cudaStreamIsCapturing  0.04%  2.841us  0.04%  2.841us  0.947us  0.000us  0.00%  0.000us  0.000us  3
+ cudaFuncSetAttribute  0.05%  4.120us  0.05%  4.120us  1.373us  0.000us  0.00%  0.000us  0.000us  3
+ cudaDeviceSynchronize  67.00%  5.376ms  67.00%  5.376ms  5.376ms  0.000us  0.00%  0.000us  0.000us  1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total:
- Self CUDA time total:
+ Self CPU time total: 8.025ms
+ Self CUDA time total: 6.152ms
@@ -4048,28 +4048,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff
- torch_mem_eff  0.00%  0.000us  0.00%  0.000us  0.000us
- aten::scaled_dot_product_attention  0.
- aten::_scaled_dot_product_efficient_attention  0.
- aten::_efficient_attention_forward  0.
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...  0.00%  0.000us  0.00%  0.000us  0.000us
- aten::contiguous  0.
- aten::clone  0.
- aten::copy_  0.
- void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us
- Activity Buffer Request
- aten::transpose  0.
- aten::as_strided  0.
- aten::empty_like  0.
- aten::empty  0.
- cudaLaunchKernel
- cudaStreamIsCapturing  0.
- cudaFuncSetAttribute  0.
- cudaDeviceSynchronize
+ torch_mem_eff  3.33%  272.217us  28.45%  2.323ms  2.323ms  0.000us  0.00%  6.452ms  6.452ms  1
+ torch_mem_eff  0.00%  0.000us  0.00%  0.000us  0.000us  6.401ms  100.14%  6.401ms  6.401ms  1
+ aten::scaled_dot_product_attention  0.25%  20.040us  1.74%  141.700us  47.233us  0.000us  0.00%  5.729ms  1.910ms  3
+ aten::_scaled_dot_product_efficient_attention  0.23%  18.560us  1.49%  121.660us  40.553us  0.000us  0.00%  5.729ms  1.910ms  3
+ aten::_efficient_attention_forward  0.34%  27.420us  1.00%  81.440us  27.147us  5.729ms  89.62%  5.729ms  1.910ms  3
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...  0.00%  0.000us  0.00%  0.000us  0.000us  5.729ms  89.62%  5.729ms  1.910ms  3
+ aten::contiguous  0.09%  7.310us  22.83%  1.865ms  207.177us  0.000us  0.00%  723.614us  80.402us  9
+ aten::clone  0.27%  22.438us  22.75%  1.857ms  206.364us  0.000us  0.00%  723.614us  80.402us  9
+ aten::copy_  0.75%  61.292us  21.84%  1.783ms  198.108us  663.806us  10.38%  723.614us  80.402us  9
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  663.806us  10.38%  663.806us  73.756us  9
+ Activity Buffer Request  18.13%  1.481ms  18.13%  1.481ms  1.481ms  59.808us  0.94%  59.808us  59.808us  1
+ aten::transpose  0.61%  49.591us  0.81%  66.019us  2.751us  0.000us  0.00%  0.000us  0.000us  24
+ aten::as_strided  0.20%  16.428us  0.20%  16.428us  0.684us  0.000us  0.00%  0.000us  0.000us  24
+ aten::empty_like  0.14%  11.501us  0.64%  51.871us  5.763us  0.000us  0.00%  0.000us  0.000us  9
+ aten::empty  0.80%  65.620us  0.80%  65.620us  3.125us  0.000us  0.00%  0.000us  0.000us  21
+ cudaLaunchKernel  3.24%  264.473us  3.24%  264.473us  22.039us  0.000us  0.00%  0.000us  0.000us  12
+ cudaStreamIsCapturing  0.03%  2.310us  0.03%  2.310us  0.770us  0.000us  0.00%  0.000us  0.000us  3
+ cudaFuncSetAttribute  0.04%  3.060us  0.04%  3.060us  1.020us  0.000us  0.00%  0.000us  0.000us  3
+ cudaDeviceSynchronize  71.55%  5.843ms  71.55%  5.843ms  5.843ms  0.000us  0.00%  0.000us  0.000us  1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total:
- Self CUDA time total:
+ Self CPU time total: 8.166ms
+ Self CUDA time total: 6.392ms
@@ -4079,90 +4079,38 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_mem_eff
- torch_mem_eff  0.00%  0.000us  0.00%  0.000us  0.000us
- aten::scaled_dot_product_attention  0.
- aten::_scaled_dot_product_efficient_attention  0.
- aten::_efficient_attention_forward  0.
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...  0.00%  0.000us  0.00%  0.000us  0.000us
- aten::contiguous  0.
- aten::clone  0.
- aten::copy_  0.
- void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us
- Activity Buffer Request
- aten::transpose  0.
- aten::as_strided  0.
- aten::empty_like  0.
- aten::empty  0.
- cudaLaunchKernel
- cudaStreamIsCapturing  0.
- cudaFuncSetAttribute  0.
- cudaDeviceSynchronize
+ torch_mem_eff  2.84%  238.921us  26.25%  2.206ms  2.206ms  0.000us  0.00%  6.803ms  6.803ms  1
+ torch_mem_eff  0.00%  0.000us  0.00%  0.000us  0.000us  6.751ms  100.13%  6.751ms  6.751ms  1
+ aten::scaled_dot_product_attention  0.23%  19.080us  1.67%  140.122us  46.707us  0.000us  0.00%  6.072ms  2.024ms  3
+ aten::_scaled_dot_product_efficient_attention  0.22%  18.680us  1.44%  121.042us  40.347us  0.000us  0.00%  6.072ms  2.024ms  3
+ aten::_efficient_attention_forward  0.32%  27.009us  0.95%  79.840us  26.613us  6.072ms  90.07%  6.072ms  2.024ms  3
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem...  0.00%  0.000us  0.00%  0.000us  0.000us  6.072ms  90.07%  6.072ms  2.024ms  3
+ aten::contiguous  0.09%  7.439us  21.24%  1.785ms  198.324us  0.000us  0.00%  731.099us  81.233us  9
+ aten::clone  0.26%  21.852us  21.15%  1.777ms  197.498us  0.000us  0.00%  731.099us  81.233us  9
+ aten::copy_  0.77%  64.769us  20.27%  1.703ms  189.239us  669.820us  9.93%  731.099us  81.233us  9
+ void at::native::elementwise_kernel<128, 4, at::nati...  0.00%  0.000us  0.00%  0.000us  0.000us  669.820us  9.93%  669.820us  74.424us  9
+ Activity Buffer Request  16.92%  1.422ms  16.92%  1.422ms  1.422ms  61.279us  0.91%  61.279us  61.279us  1
+ aten::transpose  0.57%  48.271us  0.77%  64.334us  2.681us  0.000us  0.00%  0.000us  0.000us  24
+ aten::as_strided  0.19%  16.063us  0.19%  16.063us  0.669us  0.000us  0.00%  0.000us  0.000us  24
+ aten::empty_like  0.14%  11.440us  0.62%  52.480us  5.831us  0.000us  0.00%  0.000us  0.000us  9
+ aten::empty  0.79%  66.661us  0.79%  66.661us  3.174us  0.000us  0.00%  0.000us  0.000us  21
+ cudaLaunchKernel  2.84%  238.383us  2.84%  238.383us  19.865us  0.000us  0.00%  0.000us  0.000us  12
+ cudaStreamIsCapturing  0.03%  2.270us  0.03%  2.270us  0.757us  0.000us  0.00%  0.000us  0.000us  3
+ cudaFuncSetAttribute  0.04%  3.090us  0.04%  3.090us  1.030us  0.000us  0.00%  0.000us  0.000us  3
+ cudaDeviceSynchronize  73.75%  6.196ms  73.75%  6.196ms  6.196ms  0.000us  0.00%  0.000us  0.000us  1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total:
- Self CUDA time total:
+ Self CPU time total: 8.402ms
+ Self CUDA time total: 6.742ms

  impl  wl  p50(ms)  ok
- torch_mem_eff  cuda_attn_L128_bfloat16
- torch_mem_eff  cuda_attn_L256_bfloat16
- torch_mem_eff  cuda_attn_L320_bfloat16
- torch_mem_eff  cuda_attn_L384_bfloat16
- torch_mem_eff  cuda_attn_L448_bfloat16
- torch_mem_eff  cuda_attn_L512_bfloat16
+ torch_mem_eff  cuda_attn_L128_bfloat16  1.89  True
+ torch_mem_eff  cuda_attn_L256_bfloat16  1.95  True
+ torch_mem_eff  cuda_attn_L320_bfloat16  2.05  True
+ torch_mem_eff  cuda_attn_L384_bfloat16  2.08  True
+ torch_mem_eff  cuda_attn_L448_bfloat16  2.13  True
+ torch_mem_eff  cuda_attn_L512_bfloat16  2.27  True
  </pre></div>
- <div class="uv-install-logs" id="uv-logs-benchmark">
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
- <div class="uv-logs-content" style="display: none;">
- Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading pillow (6.7MiB)
- Downloading sympy (6.0MiB)
- Downloading nvidia-nccl-cu12 (307.4MiB)
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
- Downloading nvidia-cusolver-cu12 (255.1MiB)
- Downloading nvidia-cufile-cu12 (1.1MiB)
- Downloading fonttools (4.7MiB)
- Downloading nvidia-cudnn-cu12 (674.0MiB)
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
- Downloading numpy (15.9MiB)
- Downloading setuptools (1.1MiB)
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
- Downloading matplotlib (8.3MiB)
- Downloading networkx (1.9MiB)
- Downloading nvidia-curand-cu12 (60.7MiB)
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
- Downloading kiwisolver (1.4MiB)
- Downloading nvidia-cusparse-cu12 (274.9MiB)
- Downloading nvidia-cufft-cu12 (184.2MiB)
- Downloading nvidia-cublas-cu12 (566.8MiB)
- Downloading torch (846.8MiB)
- Downloading triton (148.4MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
- Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading matplotlib
- Downloading nvidia-cuda-cupti-cu12
- Downloading numpy
- Downloading nvidia-nvjitlink-cu12
- Downloading sympy
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
- Installed 37 packages in 228ms
- </div>
- </div>
  <div class="cell-artifacts">
  <h4>Artifacts:</h4>
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
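Both summary tables above report a median latency per workload in the `p50(ms)` column, with raw timings preserved in the linked `attention.jsonl`. A minimal sketch of reducing raw per-repetition latencies to such percentiles (the record field names here are illustrative assumptions, not the harness's documented schema):

```python
# Minimal sketch: compute p10/p50/p90 from raw per-rep latencies.
# Field names ("lat_ms", "raw_times") are illustrative assumptions.
import json

def percentile(sorted_xs, q):
    # Nearest-rank percentile on an already-sorted, non-empty list.
    i = min(int(round(q * (len(sorted_xs) - 1))), len(sorted_xs) - 1)
    return sorted_xs[i]

def summarize(raw_times_ms):
    xs = sorted(raw_times_ms)
    return {
        "p10": percentile(xs, 0.10),
        "p50": percentile(xs, 0.50),   # the value shown in the p50(ms) column
        "p90": percentile(xs, 0.90),
        "mean": sum(xs) / len(xs),
    }

with open("artifacts/benchmark/attention.jsonl") as f:  # artifact linked above
    for line in f:
        rec = json.loads(line)
        print(rec.get("impl"), summarize(rec["lat_ms"]["raw_times"]))
```

The median is the usual headline number for kernel benchmarks because it is robust to the occasional slow repetition caused by clock ramp-up or background activity.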
flash_attn/impls/sage_attention.html CHANGED

@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <div class="system-info">
  <div class="system-info-header">Generated on:</div>
  <div class="system-info-content">
- Linux x86_64 | Linux-5.
+ Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
  </div>
  </div>
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
- Cell: benchmark |
+ Cell: benchmark | 4.37s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3921,76 +3921,28 @@ Cell: benchmark | 44.02s
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
  impl  wl  p50(ms)  ok
  sage_int8_fp16  cuda_attn_L128_bfloat16  FAIL  False
- Error: module '
+ Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
  sage_int8_fp16  cuda_attn_L256_bfloat16  FAIL  False
- Error: module '
+ Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
  sage_int8_fp16  cuda_attn_L320_bfloat16  FAIL  False
- Error: module '
+ Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
  sage_int8_fp16  cuda_attn_L384_bfloat16  FAIL  False
- Error: module '
+ Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
  sage_int8_fp16  cuda_attn_L448_bfloat16  FAIL  False
- Error: module '
+ Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
  sage_int8_fp16  cuda_attn_L512_bfloat16  FAIL  False
- Error: module '
+ Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
  </pre></div>
  <div class="uv-install-logs" id="uv-logs-benchmark">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
-
- Downloading pillow (6.7MiB)
- Downloading hf-xet (3.2MiB)
- Downloading networkx (1.9MiB)
- Downloading setuptools (1.1MiB)
- Downloading nvidia-cufile-cu12 (1.1MiB)
- Downloading numpy (15.9MiB)
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
- Downloading nvidia-cublas-cu12 (566.8MiB)
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
- Downloading nvidia-curand-cu12 (60.7MiB)
- Downloading sympy (6.0MiB)
- Downloading kiwisolver (1.4MiB)
- Downloading fonttools (4.7MiB)
- Downloading nvidia-nccl-cu12 (307.4MiB)
- Downloading nvidia-cusparse-cu12 (274.9MiB)
- Downloading nvidia-cufft-cu12 (184.2MiB)
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
- Downloading matplotlib (8.3MiB)
- Downloading nvidia-cusolver-cu12 (255.1MiB)
- Downloading nvidia-cudnn-cu12 (674.0MiB)
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
- Downloading triton (148.4MiB)
- Downloading torch (846.8MiB)
- Downloading nvidia-cufile-cu12
- Downloading kiwisolver
- Downloading hf-xet
- Downloading setuptools
- Downloading fonttools
- Downloading networkx
- Downloading pillow
- Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
- Downloading nvidia-cuda-cupti-cu12
- Downloading matplotlib
- Downloading numpy
- Downloading sympy
- Downloading nvidia-nvjitlink-cu12
- Downloading nvidia-curand-cu12
- Downloading nvidia-cuda-nvrtc-cu12
- Downloading triton
- Downloading nvidia-cufft-cu12
- Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparselt-cu12
- Downloading nvidia-cusparse-cu12
- Downloading nvidia-nccl-cu12
- Downloading nvidia-cublas-cu12
- Downloading nvidia-cudnn-cu12
- Downloading torch
- Installed 48 packages in 211ms
+ Installed 1 package in 11ms
  </div>
  </div>
  <div class="cell-stderr">Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]
- Fetching 11 files:
- Fetching 11 files:  73%|███████▎ | 8/11 [00:00<00:00,
- Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00,
+ Fetching 11 files:  27%|██▋       | 3/11 [00:00<00:00, 14.92it/s]
+ Fetching 11 files:  73%|███████▎  | 8/11 [00:00<00:00, 14.19it/s]
+ Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 19.60it/s]</div>
  <div class="cell-artifacts">
  <h4>Artifacts:</h4>
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
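The sage_int8_fp16 failures above are a missing-symbol problem rather than a numerical one: the dynamically loaded extension module exposes no `fwd` attribute, so every workload fails with the same error. A defensive pattern is to resolve the entry point once, before timing anything, so a renamed symbol fails fast with a useful message. A sketch (the candidate names are illustrative guesses, not the sage-attention module's documented API):

```python
# Minimal sketch: resolve a kernel entry point up front instead of failing
# once per workload. Candidate names are illustrative assumptions.
def resolve_entry_point(mod, candidates=("fwd", "forward", "sageattn")):
    for name in candidates:
        fn = getattr(mod, name, None)   # tolerate missing attributes
        if callable(fn):
            return fn
    public = [n for n in dir(mod) if not n.startswith("_")]
    raise AttributeError(
        f"module {mod.__name__!r} exposes none of {candidates}; "
        f"available symbols: {public}"
    )
```

Listing the module's actual public symbols in the error makes a rename like this diagnosable from the benchmark log alone.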
flash_attn/impls/xformers.html
CHANGED
|
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3857 |
<div class="system-info">
|
| 3858 |
<div class="system-info-header">Generated on:</div>
|
| 3859 |
<div class="system-info-content">
|
| 3860 |
-
Linux x86_64 | Linux-5.
|
| 3861 |
</div>
|
| 3862 |
</div>
|
| 3863 |
|
|
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: benchmark |
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3923,21 +3923,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
|
|
| 3923 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3924 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3925 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3926 |
-
xformers_meff
|
| 3927 |
-
xformers_flash3::flash_fwd
|
| 3928 |
-
flash_attn_3::fwd
|
| 3929 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3930 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3931 |
-
Activity Buffer Request
|
| 3932 |
-
aten::empty 0.
|
| 3933 |
-
cudaFuncSetAttribute 0.
|
| 3934 |
-
cudaLaunchKernel 0.
|
| 3935 |
-
aten::reshape 0.
|
| 3936 |
-
aten::view 0.
|
| 3937 |
-
cudaDeviceSynchronize
|
| 3938 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3939 |
-
Self CPU time total:
|
| 3940 |
-
Self CUDA time total:
|
| 3941 |
|
| 3942 |
|
| 3943 |
|
|
@@ -3947,21 +3947,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
|
|
| 3947 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3948 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3949 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3950 |
-
xformers_meff
|
| 3951 |
-
xformers_flash3::flash_fwd
|
| 3952 |
-
flash_attn_3::fwd
|
| 3953 |
-
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3954 |
-
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3955 |
-
Activity Buffer Request
|
| 3956 |
-
aten::empty 0.
|
| 3957 |
-
cudaFuncSetAttribute 0.
|
| 3958 |
-
cudaLaunchKernel 0.
|
| 3959 |
-
aten::reshape 0.
|
| 3960 |
-
aten::view 0.
|
| 3961 |
-
cudaDeviceSynchronize
|
| 3962 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3963 |
-
Self CPU time total:
|
| 3964 |
-
Self CUDA time total:
|
| 3965 |
|
| 3966 |
|
| 3967 |
|
|
@@ -3971,21 +3971,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
| 3971 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3972 |   Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
| 3973 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3974 | - xformers_meff
| 3975 | - xformers_flash3::flash_fwd
| 3976 | - flash_attn_3::fwd
| 3977 | - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us
| 3978 | - void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us
| 3979 | - Activity Buffer Request
| 3980 | - aten::empty 0.
| 3981 | - cudaFuncSetAttribute 0.
| 3982 | - cudaLaunchKernel 0.
| 3983 | - aten::reshape 0.
| 3984 | - aten::view 0.
| 3985 | - cudaDeviceSynchronize
| 3986 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3987 | - Self CPU time total:
| 3988 | - Self CUDA time total:
| 3989 |
| 3990 |
| 3991 |
@@ -3995,21 +3995,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
| 3995 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3996 |   Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
| 3997 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3998 | - xformers_meff
| 3999 | - xformers_flash3::flash_fwd
| 4000 | - flash_attn_3::fwd
| 4001 | - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us
| 4002 | - void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us
| 4003 | - Activity Buffer Request
| 4004 | - aten::empty 0.
| 4005 | - cudaFuncSetAttribute 0.
| 4006 | - cudaLaunchKernel
| 4007 | - aten::reshape 0.
| 4008 | - aten::view 0.
| 4009 | - cudaDeviceSynchronize
| 4010 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4011 | - Self CPU time total:
| 4012 | - Self CUDA time total:
| 4013 |
| 4014 |
| 4015 |
@@ -4019,21 +4019,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
| 4019 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4020 |   Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
| 4021 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4022 | - xformers_meff
| 4023 | - xformers_flash3::flash_fwd
| 4024 | - flash_attn_3::fwd 0.
| 4025 | - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us
| 4026 | - void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us
| 4027 | - Activity Buffer Request
| 4028 | - aten::empty 0.
| 4029 | - cudaFuncSetAttribute 0.
| 4030 | - cudaLaunchKernel
| 4031 | - aten::reshape 0.
| 4032 | - aten::view 0.
| 4033 | - cudaDeviceSynchronize
| 4034 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4035 | - Self CPU time total:
| 4036 | - Self CUDA time total:
| 4037 |
| 4038 |
| 4039 |
@@ -4043,83 +4043,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
| 4043 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4044 |   Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
| 4045 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4046 | - xformers_meff
| 4047 | - xformers_flash3::flash_fwd
| 4048 | - flash_attn_3::fwd
| 4049 | - xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us
| 4050 | - void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us
| 4051 | - Activity Buffer Request
| 4052 | - aten::empty 0.
| 4053 | - cudaFuncSetAttribute 0.
| 4054 | - cudaLaunchKernel
| 4055 | - aten::reshape 0.
| 4056 | - aten::view 0.
| 4057 | - cudaDeviceSynchronize
| 4058 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4059 | - Self CPU time total:
| 4060 | - Self CUDA time total:
| 4061 |
| 4062 |
| 4063 |   impl wl p50(ms) ok
| 4064 | - xformers_meff cuda_attn_L128_bfloat16
| 4065 | - xformers_meff cuda_attn_L256_bfloat16
| 4066 | - xformers_meff cuda_attn_L320_bfloat16
| 4067 | - xformers_meff cuda_attn_L384_bfloat16
| 4068 | - xformers_meff cuda_attn_L448_bfloat16
| 4069 | - xformers_meff cuda_attn_L512_bfloat16
</pre></div>
|
| 4071 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4072 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4073 |
<div class="uv-logs-content" style="display: none;">
|
| 4074 |
-
Building kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
|
| 4075 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4076 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4077 |
-
Downloading triton (148.4MiB)
|
| 4078 |
-
Downloading matplotlib (8.3MiB)
|
| 4079 |
-
Downloading pillow (6.7MiB)
|
| 4080 |
-
Downloading setuptools (1.1MiB)
|
| 4081 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4082 |
-
Downloading torch (846.8MiB)
|
| 4083 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4084 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4085 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4086 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4087 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4088 |
-
Downloading networkx (1.9MiB)
|
| 4089 |
-
Downloading kiwisolver (1.4MiB)
|
| 4090 |
-
Downloading fonttools (4.7MiB)
|
| 4091 |
-
Downloading numpy (15.9MiB)
|
| 4092 |
-
Downloading sympy (6.0MiB)
|
| 4093 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4094 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4095 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4096 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4097 |
Downloading xformers (111.8MiB)
|
| 4098 |
-
Downloading nvidia-cufile-cu12
|
| 4099 |
-
Downloading kiwisolver
|
| 4100 |
-
Downloading setuptools
|
| 4101 |
-
Downloading fonttools
|
| 4102 |
-
Downloading networkx
|
| 4103 |
-
Downloading pillow
|
| 4104 |
-
Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
|
| 4105 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 4106 |
-
Downloading matplotlib
|
| 4107 |
-
Downloading numpy
|
| 4108 |
-
Downloading sympy
|
| 4109 |
-
Downloading nvidia-nvjitlink-cu12
|
| 4110 |
-
Downloading nvidia-curand-cu12
|
| 4111 |
-
Downloading nvidia-cuda-nvrtc-cu12
|
| 4112 |
Downloading xformers
|
| 4113 |
-
|
| 4114 |
-
Downloading nvidia-cufft-cu12
|
| 4115 |
-
Downloading nvidia-cusolver-cu12
|
| 4116 |
-
Downloading nvidia-cusparse-cu12
|
| 4117 |
-
Downloading nvidia-cusparselt-cu12
|
| 4118 |
-
Downloading nvidia-nccl-cu12
|
| 4119 |
-
Downloading nvidia-cublas-cu12
|
| 4120 |
-
Downloading nvidia-cudnn-cu12
|
| 4121 |
-
Downloading torch
|
| 4122 |
-
Installed 38 packages in 211ms
|
| 4123 |
</div>
|
| 4124 |
</div>
|
| 4125 |
<div class="cell-artifacts">
|
|
|
|
| 3857 |   <div class="system-info">
| 3858 |   <div class="system-info-header">Generated on:</div>
| 3859 |   <div class="system-info-content">
| 3860 | + Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
| 3861 |   </div>
| 3862 |   </div>
| 3863 |
| 3871 |   <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
| 3872 |   <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
| 3873 |   </span> |
| 3874 | + Cell: benchmark | 5.09s
| 3875 |   | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
| 3876 |   <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
| 3877 |   <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
| 3923 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3924 |   Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
| 3925 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3926 | + xformers_meff 10.73% 481.606us 51.24% 2.299ms 2.299ms 0.000us 0.00% 3.630ms 3.630ms 1
| 3927 | + xformers_flash3::flash_fwd 4.33% 194.084us 39.70% 1.781ms 593.782us 0.000us 0.00% 3.630ms 1.210ms 3
| 3928 | + flash_attn_3::fwd 1.76% 78.961us 35.37% 1.587ms 529.087us 2.729ms 100.00% 3.630ms 1.210ms 3
| 3929 | + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.730ms 100.05% 2.730ms 2.730ms 1
| 3930 | + void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.729ms 100.00% 2.729ms 909.588us 3
| 3931 | + Activity Buffer Request 31.70% 1.423ms 31.70% 1.423ms 1.423ms 901.535us 33.04% 901.535us 901.535us 1
| 3932 | + aten::empty 0.75% 33.761us 0.75% 33.761us 5.627us 0.000us 0.00% 0.000us 0.000us 6
| 3933 | + cudaFuncSetAttribute 0.28% 12.380us 0.28% 12.380us 4.127us 0.000us 0.00% 0.000us 0.000us 3
| 3934 | + cudaLaunchKernel 0.88% 39.570us 0.88% 39.570us 13.190us 0.000us 0.00% 0.000us 0.000us 3
| 3935 | + aten::reshape 0.30% 13.520us 0.80% 36.080us 6.013us 0.000us 0.00% 0.000us 0.000us 6
| 3936 | + aten::view 0.50% 22.560us 0.50% 22.560us 3.760us 0.000us 0.00% 0.000us 0.000us 6
| 3937 | + cudaDeviceSynchronize 48.76% 2.188ms 48.76% 2.188ms 2.188ms 0.000us 0.00% 0.000us 0.000us 1
| 3938 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3939 | + Self CPU time total: 4.487ms
| 3940 | + Self CUDA time total: 2.729ms
| 3941 |
| 3942 |
| 3943 |
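These tables are standard torch.profiler key-averages output. A minimal sketch of how a trace like the one above could be captured, assuming xformers is installed and a CUDA device is available (the q/k/v shapes are illustrative, not the benchmark's actual workload config):

import torch
from torch.profiler import profile, ProfilerActivity
import xformers.ops as xops

q = torch.randn(1, 512, 8, 64, device="cuda", dtype=torch.bfloat16)  # [B, M, H, K]
k, v = torch.randn_like(q), torch.randn_like(q)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with torch.profiler.record_function("xformers_meff"):  # appears as a named row
        out = xops.memory_efficient_attention(q, k, v)
    torch.cuda.synchronize()  # accounts for the cudaDeviceSynchronize row

print(prof.key_averages().table(sort_by="cuda_time_total"))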
| 3947 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3948 |   Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
| 3949 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3950 | + xformers_meff 7.10% 312.113us 46.81% 2.059ms 2.059ms 0.000us 0.00% 3.744ms 3.744ms 1
| 3951 | + xformers_flash3::flash_fwd 3.88% 170.673us 39.17% 1.723ms 574.405us 0.000us 0.00% 3.744ms 1.248ms 3
| 3952 | + flash_attn_3::fwd 1.28% 56.171us 35.29% 1.553ms 517.514us 2.795ms 100.00% 3.744ms 1.248ms 3
| 3953 | + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.796ms 100.05% 2.796ms 2.796ms 1
| 3954 | + void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.00% 2.795ms 931.630us 3
| 3955 | + Activity Buffer Request 32.47% 1.428ms 32.47% 1.428ms 1.428ms 948.729us 33.95% 948.729us 948.729us 1
| 3956 | + aten::empty 0.66% 29.091us 0.66% 29.091us 4.848us 0.000us 0.00% 0.000us 0.000us 6
| 3957 | + cudaFuncSetAttribute 0.13% 5.590us 0.13% 5.590us 1.863us 0.000us 0.00% 0.000us 0.000us 3
| 3958 | + cudaLaunchKernel 0.76% 33.440us 0.76% 33.440us 11.147us 0.000us 0.00% 0.000us 0.000us 3
| 3959 | + aten::reshape 0.20% 8.951us 0.54% 23.831us 3.972us 0.000us 0.00% 0.000us 0.000us 6
| 3960 | + aten::view 0.34% 14.880us 0.34% 14.880us 2.480us 0.000us 0.00% 0.000us 0.000us 6
| 3961 | + cudaDeviceSynchronize 53.19% 2.340ms 53.19% 2.340ms 2.340ms 0.000us 0.00% 0.000us 0.000us 1
| 3962 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3963 | + Self CPU time total: 4.399ms
| 3964 | + Self CUDA time total: 2.795ms
| 3965 |
| 3966 |
| 3967 |
| 3971 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3972 |   Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
| 3973 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3974 | + xformers_meff 6.52% 299.466us 45.41% 2.085ms 2.085ms 0.000us 0.00% 3.907ms 3.907ms 1
| 3975 | + xformers_flash3::flash_fwd 3.09% 142.061us 38.39% 1.763ms 587.558us 0.000us 0.00% 3.907ms 1.302ms 3
| 3976 | + flash_attn_3::fwd 1.15% 53.012us 35.30% 1.621ms 540.204us 2.913ms 100.00% 3.907ms 1.302ms 3
| 3977 | + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.915ms 100.06% 2.915ms 2.915ms 1
| 3978 | + void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.913ms 100.00% 2.913ms 971.158us 3
| 3979 | + Activity Buffer Request 32.68% 1.500ms 32.68% 1.500ms 1.500ms 993.281us 34.09% 993.281us 993.281us 1
| 3980 | + aten::empty 0.62% 28.380us 0.62% 28.380us 4.730us 0.000us 0.00% 0.000us 0.000us 6
| 3981 | + cudaFuncSetAttribute 0.11% 5.270us 0.11% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3
| 3982 | + cudaLaunchKernel 0.73% 33.640us 0.73% 33.640us 11.213us 0.000us 0.00% 0.000us 0.000us 3
| 3983 | + aten::reshape 0.18% 8.421us 0.49% 22.660us 3.777us 0.000us 0.00% 0.000us 0.000us 6
| 3984 | + aten::view 0.31% 14.239us 0.31% 14.239us 2.373us 0.000us 0.00% 0.000us 0.000us 6
| 3985 | + cudaDeviceSynchronize 54.59% 2.507ms 54.59% 2.507ms 2.507ms 0.000us 0.00% 0.000us 0.000us 1
| 3986 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3987 | + Self CPU time total: 4.591ms
| 3988 | + Self CUDA time total: 2.913ms
| 3989 |
| 3990 |
| 3991 |
| 3995 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3996 |   Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
| 3997 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 3998 | + xformers_meff 6.26% 300.335us 46.54% 2.234ms 2.234ms 0.000us 0.00% 3.980ms 3.980ms 1
| 3999 | + xformers_flash3::flash_fwd 3.08% 147.673us 39.81% 1.911ms 637.009us 0.000us 0.00% 3.980ms 1.327ms 3
| 4000 | + flash_attn_3::fwd 1.12% 53.571us 36.74% 1.763ms 587.785us 2.981ms 100.00% 3.980ms 1.327ms 3
| 4001 | + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.982ms 100.05% 2.982ms 2.982ms 1
| 4002 | + void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.981ms 100.00% 2.981ms 993.631us 3
| 4003 | + Activity Buffer Request 29.81% 1.431ms 29.81% 1.431ms 1.431ms 999.263us 33.52% 999.263us 999.263us 1
| 4004 | + aten::empty 0.60% 28.930us 0.60% 28.930us 4.822us 0.000us 0.00% 0.000us 0.000us 6
| 4005 | + cudaFuncSetAttribute 0.12% 5.610us 0.12% 5.610us 1.870us 0.000us 0.00% 0.000us 0.000us 3
| 4006 | + cudaLaunchKernel 5.09% 244.533us 5.09% 244.533us 81.511us 0.000us 0.00% 0.000us 0.000us 3
| 4007 | + aten::reshape 0.18% 8.489us 0.47% 22.530us 3.755us 0.000us 0.00% 0.000us 0.000us 6
| 4008 | + aten::view 0.29% 14.041us 0.29% 14.041us 2.340us 0.000us 0.00% 0.000us 0.000us 6
| 4009 | + cudaDeviceSynchronize 53.46% 2.566ms 53.46% 2.566ms 2.566ms 0.000us 0.00% 0.000us 0.000us 1
| 4010 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4011 | + Self CPU time total: 4.800ms
| 4012 | + Self CUDA time total: 2.981ms
| 4013 |
| 4014 |
| 4015 |
| 4019 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4020 |   Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
| 4021 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4022 | + xformers_meff 5.98% 313.865us 42.05% 2.207ms 2.207ms 0.000us 0.00% 4.635ms 4.635ms 1
| 4023 | + xformers_flash3::flash_fwd 2.80% 146.723us 35.63% 1.870ms 623.176us 0.000us 0.00% 4.635ms 1.545ms 3
| 4024 | + flash_attn_3::fwd 0.99% 51.861us 32.83% 1.723ms 574.268us 3.467ms 100.00% 4.635ms 1.545ms 3
| 4025 | + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.05% 3.469ms 3.469ms 1
| 4026 | + void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.467ms 100.00% 3.467ms 1.156ms 3
| 4027 | + Activity Buffer Request 27.82% 1.460ms 27.82% 1.460ms 1.460ms 1.168ms 33.68% 1.168ms 1.168ms 1
| 4028 | + aten::empty 0.56% 29.260us 0.56% 29.260us 4.877us 0.000us 0.00% 0.000us 0.000us 6
| 4029 | + cudaFuncSetAttribute 0.12% 6.040us 0.12% 6.040us 2.013us 0.000us 0.00% 0.000us 0.000us 3
| 4030 | + cudaLaunchKernel 3.35% 175.903us 3.35% 175.903us 58.634us 0.000us 0.00% 0.000us 0.000us 3
| 4031 | + aten::reshape 0.16% 8.638us 0.44% 23.169us 3.862us 0.000us 0.00% 0.000us 0.000us 6
| 4032 | + aten::view 0.28% 14.531us 0.28% 14.531us 2.422us 0.000us 0.00% 0.000us 0.000us 6
| 4033 | + cudaDeviceSynchronize 57.95% 3.041ms 57.95% 3.041ms 3.041ms 0.000us 0.00% 0.000us 0.000us 1
| 4034 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4035 | + Self CPU time total: 5.247ms
| 4036 | + Self CUDA time total: 3.467ms
| 4037 |
| 4038 |
| 4039 |
| 4043 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4044 |   Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
| 4045 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4046 | + xformers_meff 5.97% 309.094us 41.86% 2.166ms 2.166ms 0.000us 0.00% 4.567ms 4.567ms 1
| 4047 | + xformers_flash3::flash_fwd 2.75% 142.242us 35.45% 1.834ms 611.405us 0.000us 0.00% 4.567ms 1.522ms 3
| 4048 | + flash_attn_3::fwd 1.04% 53.951us 32.70% 1.692ms 563.991us 3.419ms 100.00% 4.567ms 1.522ms 3
| 4049 | + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.421ms 100.05% 3.421ms 3.421ms 1
| 4050 | + void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.419ms 100.00% 3.419ms 1.140ms 3
| 4051 | + Activity Buffer Request 27.74% 1.436ms 27.74% 1.436ms 1.436ms 1.148ms 33.59% 1.148ms 1.148ms 1
| 4052 | + aten::empty 0.58% 29.770us 0.58% 29.770us 4.962us 0.000us 0.00% 0.000us 0.000us 6
| 4053 | + cudaFuncSetAttribute 0.11% 5.591us 0.11% 5.591us 1.864us 0.000us 0.00% 0.000us 0.000us 3
| 4054 | + cudaLaunchKernel 3.23% 167.152us 3.23% 167.152us 55.717us 0.000us 0.00% 0.000us 0.000us 3
| 4055 | + aten::reshape 0.16% 8.371us 0.44% 22.751us 3.792us 0.000us 0.00% 0.000us 0.000us 6
| 4056 | + aten::view 0.28% 14.380us 0.28% 14.380us 2.397us 0.000us 0.00% 0.000us 0.000us 6
| 4057 | + cudaDeviceSynchronize 58.14% 3.008ms 58.14% 3.008ms 3.008ms 0.000us 0.00% 0.000us 0.000us 1
| 4058 |   ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
| 4059 | + Self CPU time total: 5.174ms
| 4060 | + Self CUDA time total: 3.419ms
| 4061 |
| 4062 |
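Note how each trace nests xformers_meff → xformers_flash3::flash_fwd → flash_attn_3::fwd → a cutlass device kernel: xformers' dispatcher is routing to a FlashAttention-3 forward on this GPU. A sketch of pinning that backend explicitly instead of relying on dispatch, assuming a recent xformers build that exposes it as xformers.ops.fmha.flash3 (the module path is an assumption inferred from the trace names):

import torch
from xformers.ops import fmha

q = torch.randn(1, 512, 8, 64, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)
# Pin the forward op; None leaves backward-op selection to the dispatcher.
out = fmha.memory_efficient_attention(q, k, v, op=(fmha.flash3.FwOp, None))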
| 4063 |   impl wl p50(ms) ok
| 4064 | + xformers_meff cuda_attn_L128_bfloat16 1.00 True
| 4065 | + xformers_meff cuda_attn_L256_bfloat16 1.04 True
| 4066 | + xformers_meff cuda_attn_L320_bfloat16 1.09 True
| 4067 | + xformers_meff cuda_attn_L384_bfloat16 1.11 True
| 4068 | + xformers_meff cuda_attn_L448_bfloat16 1.26 True
| 4069 | + xformers_meff cuda_attn_L512_bfloat16 1.25 True
| 4070 |   </pre></div>
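The p50(ms) column above is the median latency over repeated timed runs. A minimal sketch of how such a number could be measured with CUDA events (the warmup/rep counts are placeholders; the harness's actual procedure may differ):

import statistics
import torch

def p50_ms(fn, warmup=2, reps=5):
    for _ in range(warmup):  # discard cold-start iterations
        fn()
    torch.cuda.synchronize()
    times = []
    for _ in range(reps):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn()
        end.record()
        torch.cuda.synchronize()
        times.append(start.elapsed_time(end))  # elapsed_time returns milliseconds
    return statistics.median(times)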
| 4071 |   <div class="uv-install-logs" id="uv-logs-benchmark">
| 4072 |   <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
| 4073 |   <div class="uv-logs-content" style="display: none;">
| 4074 |   Downloading xformers (111.8MiB)
| 4075 |   Downloading xformers
| 4076 | + Installed 1 package in 14ms
| 4077 |   </div>
| 4078 |   </div>
| 4079 |   <div class="cell-artifacts">
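The UV Install Logs record uv resolving each cell's dependencies; in this run only xformers had to be fetched, since everything else was already cached. A sketch of the PEP 723 inline-metadata header that lets uv run a cell script standalone (an assumption about how these cells declare dependencies; the listed names are illustrative):

# /// script
# requires-python = ">=3.11"
# dependencies = ["torch", "xformers", "kernels-benchmark-tools"]
# ///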
flash_attn/results/artifacts/combine/latency.svg CHANGED
Git LFS Details
flash_attn/results/combined_results.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
| 3857 |   <div class="system-info">
| 3858 |   <div class="system-info-header">Generated on:</div>
| 3859 |   <div class="system-info-content">
| 3860 | - Linux x86_64 | Linux-5.
| 3861 |   </div>
| 3862 |   </div>
| 3863 |
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
| 3872 |   <rdf:RDF>
| 3873 |   <ns2:Work>
| 3874 |   <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
| 3875 | - <dc:date>2025-10-
| 3876 |   <dc:format>image/svg+xml</dc:format>
| 3877 |   <dc:creator>
| 3878 |   <ns2:Agent>
@@ -3891,320 +3891,333 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
[SVG plot diff elided: the matplotlib latency chart was regenerated. Axis tick positions, gridline paths, and marker coordinates changed for the five series (torch-flash-ma, torch-mem-eff, xformers-meff, hf-kernels-flash-attn, hf-kernels-flash-attn3), and the title, legend, and clip-path ids were renamed. The numeric coordinates were truncated in the source diff and are not recoverable.]
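The elided SVG is a matplotlib line chart of p50 latency versus sequence length, one series per implementation. A minimal sketch of regenerating such a figure from the xformers_meff numbers in the summary table above (the axis labels are assumptions, since the originals were truncated in this diff):

import matplotlib.pyplot as plt

seq_lens = [128, 256, 320, 384, 448, 512]
p50 = {"xformers_meff": [1.00, 1.04, 1.09, 1.11, 1.26, 1.25]}  # ms, from the summary table

fig, ax = plt.subplots(figsize=(9, 5))
for impl, ys in p50.items():
    ax.plot(seq_lens, ys, marker="o", label=impl)
ax.set_xlabel("sequence length")
ax.set_ylabel("p50 latency (ms)")
ax.legend()
fig.savefig("latency.svg")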
|
@@ -4217,7 +4230,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
| 4217 |   <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
| 4218 |   <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
| 4219 |   </span> |
| 4220 | - Cell: combine |
| 4221 |   | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
| 4222 |   <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
| 4223 |   <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4297,25 +4310,25 @@ Cell: combine | 39.40s
| 4297 |   <div class="cell-stdout"><pre class="stdout-text">======================================================================
| 4298 |   LOADING BENCHMARK DATA
| 4299 |   ======================================================================
| 4300 | - ✓ Flash (PyTorch SDPA) : /
| 4301 | - ✓ MemEff (PyTorch SDPA) : /
| 4302 | - ✓ xFormers : /
| 4303 | - ✓ HF Kernels Flash Attn : /
| 4304 | - ✓ HF Kernels Flash Attn3 : /
| 4305 | - ✓ SageAttention : /
| 4306 |
| 4307 |   ✓ Found Flash (PyTorch SDPA)
| 4308 | - Path: /
| 4309 |   ✓ Found MemEff (PyTorch SDPA)
| 4310 | - Path: /
| 4311 |   ✓ Found xFormers
| 4312 | - Path: /
| 4313 |   ✓ Found HF Kernels Flash Attn
| 4314 | - Path: /
| 4315 |   ✓ Found HF Kernels Flash Attn3
| 4316 | - Path: /
| 4317 |   ✓ Found SageAttention
| 4318 | - Path: /
| 4319 |
| 4320 |   ======================================================================
| 4321 |   Summary: 6 found, 0 skipped, 0 missing
@@ -4324,48 +4337,48 @@ Summary: 6 found, 0 skipped, 0 missing
| 4324 |   COMBINED BENCHMARK SUMMARY
| 4325 |
| 4326 |   impl wl p50(ms) ok
| 4327 | - hf_kernels_flash_attn cuda_attn_L128_bfloat16
| 4328 | - hf_kernels_flash_attn cuda_attn_L256_bfloat16
| 4329 | - hf_kernels_flash_attn cuda_attn_L320_bfloat16
| 4330 | - hf_kernels_flash_attn cuda_attn_L384_bfloat16
| 4331 | - hf_kernels_flash_attn cuda_attn_L448_bfloat16
| 4332 | - hf_kernels_flash_attn cuda_attn_L512_bfloat16
| 4333 | - hf_kernels_flash_attn3 cuda_attn_L128_bfloat16
| 4334 | - hf_kernels_flash_attn3 cuda_attn_L256_bfloat16
| 4335 | - hf_kernels_flash_attn3 cuda_attn_L320_bfloat16
| 4336 | - hf_kernels_flash_attn3 cuda_attn_L384_bfloat16
| 4337 | - hf_kernels_flash_attn3 cuda_attn_L448_bfloat16
| 4338 | - hf_kernels_flash_attn3 cuda_attn_L512_bfloat16
| 4339 |   sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
| 4340 | - Error: module '
| 4341 |   sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
| 4342 | - Error: module '
| 4343 |   sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
| 4344 | - Error: module '
| 4345 |   sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
| 4346 | - Error: module '
| 4347 |   sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
| 4348 | - Error: module '
| 4349 |   sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
| 4350 | - Error: module '
| 4351 | - torch_flash_ma cuda_attn_L128_bfloat16
| 4352 | - torch_flash_ma cuda_attn_L256_bfloat16
| 4353 | - torch_flash_ma cuda_attn_L320_bfloat16
| 4354 | - torch_flash_ma cuda_attn_L384_bfloat16
| 4355 | - torch_flash_ma cuda_attn_L448_bfloat16
| 4356 | - torch_flash_ma cuda_attn_L512_bfloat16
| 4357 | - torch_mem_eff cuda_attn_L128_bfloat16
| 4358 | - torch_mem_eff cuda_attn_L256_bfloat16
| 4359 | - torch_mem_eff cuda_attn_L320_bfloat16
| 4360 | - torch_mem_eff cuda_attn_L384_bfloat16
| 4361 | - torch_mem_eff cuda_attn_L448_bfloat16
| 4362 | - torch_mem_eff cuda_attn_L512_bfloat16
| 4363 | - xformers_meff cuda_attn_L128_bfloat16
| 4364 | - xformers_meff cuda_attn_L256_bfloat16
| 4365 | - xformers_meff cuda_attn_L320_bfloat16
| 4366 | - xformers_meff cuda_attn_L384_bfloat16
| 4367 | - xformers_meff cuda_attn_L448_bfloat16
| 4368 | - xformers_meff cuda_attn_L512_bfloat16
| 4369 |
| 4370 |   GENERATING COMBINED VISUALIZATION
| 4371 |
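The combine cell gathers each implementation's JSONL benchmark records and prints the merged table above. A minimal sketch of that load-and-merge step; the field names (impl, wl.name, lat_ms.p50, ok) and the artifacts path are assumptions about the repo's record layout:

import json
from pathlib import Path

rows = []
for path in Path("artifacts/benchmark").glob("*.jsonl"):  # assumed layout
    for line in path.read_text().splitlines():
        rec = json.loads(line)
        p50 = (rec.get("lat_ms") or {}).get("p50")
        rows.append((rec["impl"], rec["wl"]["name"], p50, rec["ok"]))

for impl, wl, p50, ok in sorted(rows):
    cell = f"{p50:.2f}" if ok and p50 is not None else "FAIL"
    print(f"{impl:>24} {wl:>28} {cell:>8} {ok}")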
@@ -4389,53 +4402,7 @@ Implementations included:
| 4389 |   <div class="uv-install-logs" id="uv-logs-combine">
| 4390 |   <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
| 4391 |   <div class="uv-logs-content" style="display: none;">
| 4392 | -
| 4393 | - Downloading nvidia-cufft-cu12 (184.2MiB)
| 4394 | - Downloading nvidia-cufile-cu12 (1.1MiB)
| 4395 | - Downloading networkx (1.9MiB)
| 4396 | - Downloading kiwisolver (1.4MiB)
| 4397 | - Downloading nvidia-cusolver-cu12 (255.1MiB)
| 4398 | - Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
| 4399 | - Downloading pillow (6.7MiB)
| 4400 | - Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
| 4401 | - Downloading nvidia-cusparse-cu12 (274.9MiB)
| 4402 | - Downloading nvidia-cublas-cu12 (566.8MiB)
| 4403 | - Downloading numpy (15.9MiB)
| 4404 | - Downloading fonttools (4.7MiB)
| 4405 | - Downloading setuptools (1.1MiB)
| 4406 | - Downloading sympy (6.0MiB)
| 4407 | - Downloading nvidia-nccl-cu12 (307.4MiB)
| 4408 | - Downloading nvidia-nvjitlink-cu12 (37.4MiB)
| 4409 | - Downloading nvidia-cusparselt-cu12 (273.9MiB)
| 4410 | - Downloading nvidia-cudnn-cu12 (674.0MiB)
| 4411 | - Downloading torch (846.8MiB)
| 4412 | - Downloading matplotlib (8.3MiB)
| 4413 | - Downloading triton (148.4MiB)
| 4414 | - Downloading nvidia-curand-cu12 (60.7MiB)
| 4415 | - Downloading nvidia-cufile-cu12
| 4416 | - Downloading kiwisolver
| 4417 | - Downloading setuptools
| 4418 | - Downloading fonttools
| 4419 | - Downloading networkx
| 4420 | - Downloading pillow
| 4421 | - Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
| 4422 | - Downloading matplotlib
| 4423 | - Downloading nvidia-cuda-cupti-cu12
| 4424 | - Downloading numpy
| 4425 | - Downloading nvidia-nvjitlink-cu12
| 4426 | - Downloading sympy
| 4427 | - Downloading nvidia-curand-cu12
| 4428 | - Downloading nvidia-cuda-nvrtc-cu12
| 4429 | - Downloading triton
| 4430 | - Downloading nvidia-cufft-cu12
| 4431 | - Downloading nvidia-cusolver-cu12
| 4432 | - Downloading nvidia-cusparselt-cu12
| 4433 | - Downloading nvidia-cusparse-cu12
| 4434 | - Downloading nvidia-nccl-cu12
| 4435 | - Downloading nvidia-cublas-cu12
| 4436 | - Downloading nvidia-cudnn-cu12
| 4437 | - Downloading torch
| 4438 | - Installed 37 packages in 230ms
| 4439 |   </div>
| 4440 |   </div>
| 4441 |   <div class="cell-artifacts">
@@ -4448,7 +4415,7 @@ Installed 37 packages in 230ms
| 4448 |   <rdf:RDF>
| 4449 |   <ns2:Work>
| 4450 |   <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
| 4451 | - <dc:date>2025-10-
| 4452 |   <dc:format>image/svg+xml</dc:format>
| 4453 |   <dc:creator>
| 4454 |   <ns2:Agent>
@@ -4467,320 +4434,333 @@ Installed 37 packages in 230ms
[SVG plot diff elided: the combined latency chart was regenerated. Axis tick positions, gridline paths, and marker coordinates changed for the five series (torch-flash-ma, torch-mem-eff, xformers-meff, hf-kernels-flash-attn, hf-kernels-flash-attn3), and the title, legend, and clip-path ids were renamed. The numeric coordinates were truncated in the source diff, which breaks off mid-hunk here.]
|
| 4768 |
</g>
|
| 4769 |
-
<g id="
|
| 4770 |
-
<path d="M
|
| 4771 |
<g>
|
| 4772 |
-
<use ns4:href="#m7cd35be9cc" x="
|
| 4773 |
</g>
|
| 4774 |
</g>
|
| 4775 |
<g id="legend-label--hf-kernels-flash-attn3" class="legend">
|
| 4776 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="
|
| 4777 |
</g>
|
| 4778 |
</g>
|
| 4779 |
</g>
|
| 4780 |
</g>
|
| 4781 |
<defs>
|
| 4782 |
-
<clipPath id="
|
| 4783 |
-
<rect x="
|
| 4784 |
</clipPath>
|
| 4785 |
</defs>
|
| 4786 |
</svg>
|
|
|
|
| 3857 |
<div class="system-info">
|
| 3858 |
<div class="system-info-header">Generated on:</div>
|
| 3859 |
<div class="system-info-content">
|
| 3860 |
+
Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
|
| 3861 |
</div>
|
| 3862 |
</div>
|
| 3863 |
|
|
|
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
+
<dc:date>2025-10-27T14:46:38.946915</dc:date>
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
|
|
| 3891 |
</g>
|
| 3892 |
<g id="axes--1" class="axes">
|
| 3893 |
<g id="patch_2">
|
| 3894 |
+
<path d="M 47.81 447.507117 L 835.361742 447.507117 L 835.361742 26.88 L 47.81 26.88 L 47.81 447.507117 z " style="fill: none" />
|
| 3895 |
</g>
|
| 3896 |
<g id="matplotlib.axis_1">
|
| 3897 |
<g id="xtick_1">
|
| 3898 |
<g id="grid-x--1" class="grid grid-x">
|
| 3899 |
+
<path d="M 83.607806 447.507117 L 83.607806 26.88 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3900 |
</g>
|
| 3901 |
<g id="line2d_1">
|
| 3902 |
<defs>
|
| 3903 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3904 |
</defs>
|
| 3905 |
<g>
|
| 3906 |
+
<use ns4:href="#mafb3703e5b" x="83.607806" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
|
| 3907 |
</g>
|
| 3908 |
</g>
|
| 3909 |
<g id="text_1">
|
| 3910 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(40.977554 548.84621) rotate(-45)">cuda_attn_L128_bfloat16</text>
|
| 3911 |
</g>
|
| 3912 |
</g>
|
| 3913 |
<g id="xtick_2">
|
| 3914 |
<g id="grid-x--2" class="grid grid-x">
|
| 3915 |
+
<path d="M 226.799032 447.507117 L 226.799032 26.88 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3916 |
</g>
|
| 3917 |
<g id="line2d_2">
|
| 3918 |
<g>
|
| 3919 |
+
<use ns4:href="#mafb3703e5b" x="226.799032" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
|
| 3920 |
</g>
|
| 3921 |
</g>
|
| 3922 |
<g id="text_2">
|
| 3923 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(184.16878 548.84621) rotate(-45)">cuda_attn_L256_bfloat16</text>
|
| 3924 |
</g>
|
| 3925 |
</g>
|
| 3926 |
<g id="xtick_3">
|
| 3927 |
<g id="grid-x--3" class="grid grid-x">
|
| 3928 |
+
<path d="M 369.990258 447.507117 L 369.990258 26.88 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3929 |
</g>
|
| 3930 |
<g id="line2d_3">
|
| 3931 |
<g>
|
| 3932 |
+
<use ns4:href="#mafb3703e5b" x="369.990258" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
|
| 3933 |
</g>
|
| 3934 |
</g>
|
| 3935 |
<g id="text_3">
|
| 3936 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(327.360005 548.84621) rotate(-45)">cuda_attn_L320_bfloat16</text>
|
| 3937 |
</g>
|
| 3938 |
</g>
|
| 3939 |
<g id="xtick_4">
|
| 3940 |
<g id="grid-x--4" class="grid grid-x">
|
| 3941 |
+
<path d="M 513.181484 447.507117 L 513.181484 26.88 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3942 |
</g>
|
| 3943 |
<g id="line2d_4">
|
| 3944 |
<g>
|
| 3945 |
+
<use ns4:href="#mafb3703e5b" x="513.181484" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
|
| 3946 |
</g>
|
| 3947 |
</g>
|
| 3948 |
<g id="text_4">
|
| 3949 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(470.551231 548.84621) rotate(-45)">cuda_attn_L384_bfloat16</text>
|
| 3950 |
</g>
|
| 3951 |
</g>
|
| 3952 |
<g id="xtick_5">
|
| 3953 |
<g id="grid-x--5" class="grid grid-x">
|
| 3954 |
+
<path d="M 656.37271 447.507117 L 656.37271 26.88 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3955 |
</g>
|
| 3956 |
<g id="line2d_5">
|
| 3957 |
<g>
|
| 3958 |
+
<use ns4:href="#mafb3703e5b" x="656.37271" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
|
| 3959 |
</g>
|
| 3960 |
</g>
|
| 3961 |
<g id="text_5">
|
| 3962 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(613.742457 548.84621) rotate(-45)">cuda_attn_L448_bfloat16</text>
|
| 3963 |
</g>
|
| 3964 |
</g>
|
| 3965 |
<g id="xtick_6">
|
| 3966 |
<g id="grid-x--6" class="grid grid-x">
|
| 3967 |
+
<path d="M 799.563935 447.507117 L 799.563935 26.88 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3968 |
</g>
|
| 3969 |
<g id="line2d_6">
|
| 3970 |
<g>
|
| 3971 |
+
<use ns4:href="#mafb3703e5b" x="799.563935" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
|
| 3972 |
</g>
|
| 3973 |
</g>
|
| 3974 |
<g id="text_6">
|
| 3975 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(756.933683 548.84621) rotate(-45)">cuda_attn_L512_bfloat16</text>
|
| 3976 |
</g>
|
| 3977 |
</g>
|
| 3978 |
<g id="label--x" class="xlabel">
|
| 3979 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="441.585871" y="562.111872" transform="rotate(-0 441.585871 562.111872)">Workload</text>
|
| 3980 |
</g>
|
| 3981 |
</g>
|
| 3982 |
<g id="matplotlib.axis_2">
|
| 3983 |
<g id="ytick_1">
|
| 3984 |
<g id="grid-y--2" class="grid grid-y">
|
| 3985 |
+
<path d="M 47.81 413.210177 L 835.361742 413.210177 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 3986 |
</g>
|
| 3987 |
<g id="line2d_7">
|
| 3988 |
<defs>
|
| 3989 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3990 |
</defs>
|
| 3991 |
<g>
|
| 3992 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="413.210177" style="stroke: #000000; stroke-width: 0.8" />
|
| 3993 |
</g>
|
| 3994 |
</g>
|
| 3995 |
<g id="text_7">
|
| 3996 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="417.009396" transform="rotate(-0 40.81 417.009396)">1.0</text>
|
| 3997 |
</g>
|
| 3998 |
</g>
|
| 3999 |
<g id="ytick_2">
|
| 4000 |
<g id="grid-y--3" class="grid grid-y">
|
| 4001 |
+
<path d="M 47.81 355.233116 L 835.361742 355.233116 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4002 |
</g>
|
| 4003 |
<g id="line2d_8">
|
| 4004 |
<g>
|
| 4005 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="355.233116" style="stroke: #000000; stroke-width: 0.8" />
|
| 4006 |
</g>
|
| 4007 |
</g>
|
| 4008 |
<g id="text_8">
|
| 4009 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="359.032335" transform="rotate(-0 40.81 359.032335)">1.2</text>
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="ytick_3">
|
| 4013 |
<g id="grid-y--4" class="grid grid-y">
|
| 4014 |
+
<path d="M 47.81 297.256055 L 835.361742 297.256055 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4015 |
</g>
|
| 4016 |
<g id="line2d_9">
|
| 4017 |
<g>
|
| 4018 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="297.256055" style="stroke: #000000; stroke-width: 0.8" />
|
| 4019 |
</g>
|
| 4020 |
</g>
|
| 4021 |
<g id="text_9">
|
| 4022 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="301.055273" transform="rotate(-0 40.81 301.055273)">1.4</text>
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="ytick_4">
|
| 4026 |
<g id="grid-y--5" class="grid grid-y">
|
| 4027 |
+
<path d="M 47.81 239.278993 L 835.361742 239.278993 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4028 |
</g>
|
| 4029 |
<g id="line2d_10">
|
| 4030 |
<g>
|
| 4031 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="239.278993" style="stroke: #000000; stroke-width: 0.8" />
|
| 4032 |
</g>
|
| 4033 |
</g>
|
| 4034 |
<g id="text_10">
|
| 4035 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="243.078212" transform="rotate(-0 40.81 243.078212)">1.6</text>
|
| 4036 |
</g>
|
| 4037 |
</g>
|
| 4038 |
<g id="ytick_5">
|
| 4039 |
<g id="grid-y--6" class="grid grid-y">
|
| 4040 |
+
<path d="M 47.81 181.301932 L 835.361742 181.301932 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4041 |
</g>
|
| 4042 |
<g id="line2d_11">
|
| 4043 |
<g>
|
| 4044 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="181.301932" style="stroke: #000000; stroke-width: 0.8" />
|
| 4045 |
</g>
|
| 4046 |
</g>
|
| 4047 |
<g id="text_11">
|
| 4048 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="185.101151" transform="rotate(-0 40.81 185.101151)">1.8</text>
|
| 4049 |
</g>
|
| 4050 |
</g>
|
| 4051 |
<g id="ytick_6">
|
| 4052 |
<g id="grid-y--7" class="grid grid-y">
|
| 4053 |
+
<path d="M 47.81 123.324871 L 835.361742 123.324871 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4054 |
</g>
|
| 4055 |
<g id="line2d_12">
|
| 4056 |
<g>
|
| 4057 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="123.324871" style="stroke: #000000; stroke-width: 0.8" />
|
| 4058 |
</g>
|
| 4059 |
</g>
|
| 4060 |
<g id="text_12">
|
| 4061 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="127.124089" transform="rotate(-0 40.81 127.124089)">2.0</text>
|
| 4062 |
+
</g>
|
| 4063 |
+
</g>
|
| 4064 |
+
<g id="ytick_7">
|
| 4065 |
+
<g id="grid-y--8" class="grid grid-y">
|
| 4066 |
+
<path d="M 47.81 65.347809 L 835.361742 65.347809 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4067 |
+
</g>
|
| 4068 |
+
<g id="line2d_13">
|
| 4069 |
+
<g>
|
| 4070 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="65.347809" style="stroke: #000000; stroke-width: 0.8" />
|
| 4071 |
+
</g>
|
| 4072 |
+
</g>
|
| 4073 |
+
<g id="text_13">
|
| 4074 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="69.147028" transform="rotate(-0 40.81 69.147028)">2.2</text>
|
| 4075 |
</g>
|
| 4076 |
</g>
|
| 4077 |
<g id="label--y" class="ylabel">
|
| 4078 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.827187" y="237.193558" transform="rotate(-90 18.827187 237.193558)">Latency P50 (ms)</text>
|
| 4079 |
</g>
|
| 4080 |
</g>
|
| 4081 |
<g id="series--torch-flash-ma" class="series">
|
| 4082 |
+
<path d="M 83.607806 349.439178 L 226.799032 333.602454 L 369.990258 324.473676 L 513.181484 316.069901 L 656.37271 272.899601 L 799.563935 261.559288 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4083 |
<defs>
|
| 4084 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4085 |
</defs>
|
| 4086 |
+
<g clip-path="url(#p09feef2583)">
|
| 4087 |
+
<use ns4:href="#md7efaf3aec" x="83.607806" y="349.439178" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4088 |
+
<use ns4:href="#md7efaf3aec" x="226.799032" y="333.602454" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4089 |
+
<use ns4:href="#md7efaf3aec" x="369.990258" y="324.473676" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4090 |
+
<use ns4:href="#md7efaf3aec" x="513.181484" y="316.069901" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4091 |
+
<use ns4:href="#md7efaf3aec" x="656.37271" y="272.899601" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4092 |
+
<use ns4:href="#md7efaf3aec" x="799.563935" y="261.559288" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4093 |
</g>
|
| 4094 |
</g>
|
| 4095 |
<g id="series--torch-mem-eff" class="series">
|
| 4096 |
+
<path d="M 83.607806 156.020744 L 226.799032 138.969401 L 369.990258 109.128607 L 513.181484 99.249026 L 656.37271 87.05645 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4097 |
<defs>
|
| 4098 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4099 |
</defs>
|
| 4100 |
+
<g clip-path="url(#p09feef2583)">
|
| 4101 |
+
<use ns4:href="#m9b8c54d372" x="83.607806" y="156.020744" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4102 |
+
<use ns4:href="#m9b8c54d372" x="226.799032" y="138.969401" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4103 |
+
<use ns4:href="#m9b8c54d372" x="369.990258" y="109.128607" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4104 |
+
<use ns4:href="#m9b8c54d372" x="513.181484" y="99.249026" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4105 |
+
<use ns4:href="#m9b8c54d372" x="656.37271" y="87.05645" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4106 |
+
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4107 |
</g>
|
| 4108 |
</g>
|
| 4109 |
<g id="series--xformers-meff" class="series">
|
| 4110 |
+
<path d="M 83.607806 414.345368 L 226.799032 400.181572 L 369.990258 385.808769 L 513.181484 380.581847 L 656.37271 338.122056 L 799.563935 339.866876 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
|
| 4111 |
<defs>
|
| 4112 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4113 |
</defs>
|
| 4114 |
+
<g clip-path="url(#p09feef2583)">
|
| 4115 |
+
<use ns4:href="#mc655281e0b" x="83.607806" y="414.345368" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4116 |
+
<use ns4:href="#mc655281e0b" x="226.799032" y="400.181572" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4117 |
+
<use ns4:href="#mc655281e0b" x="369.990258" y="385.808769" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4118 |
+
<use ns4:href="#mc655281e0b" x="513.181484" y="380.581847" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4119 |
+
<use ns4:href="#mc655281e0b" x="656.37271" y="338.122056" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4120 |
+
<use ns4:href="#mc655281e0b" x="799.563935" y="339.866876" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4121 |
</g>
|
| 4122 |
</g>
|
| 4123 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4124 |
+
<path d="M 83.607806 420.20395 L 226.799032 407.432473 L 369.990258 399.40236 L 513.181484 392.590345 L 656.37271 345.709514 L 799.563935 346.355668 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
|
| 4125 |
<defs>
|
| 4126 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4127 |
</defs>
|
| 4128 |
+
<g clip-path="url(#p09feef2583)">
|
| 4129 |
+
<use ns4:href="#m61c8040d7e" x="83.607806" y="420.20395" style="fill: #d62728; stroke: #d62728" />
|
| 4130 |
+
<use ns4:href="#m61c8040d7e" x="226.799032" y="407.432473" style="fill: #d62728; stroke: #d62728" />
|
| 4131 |
+
<use ns4:href="#m61c8040d7e" x="369.990258" y="399.40236" style="fill: #d62728; stroke: #d62728" />
|
| 4132 |
+
<use ns4:href="#m61c8040d7e" x="513.181484" y="392.590345" style="fill: #d62728; stroke: #d62728" />
|
| 4133 |
+
<use ns4:href="#m61c8040d7e" x="656.37271" y="345.709514" style="fill: #d62728; stroke: #d62728" />
|
| 4134 |
+
<use ns4:href="#m61c8040d7e" x="799.563935" y="346.355668" style="fill: #d62728; stroke: #d62728" />
|
| 4135 |
</g>
|
| 4136 |
</g>
|
| 4137 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4138 |
+
<path d="M 83.607806 428.387702 L 226.799032 420.061906 L 369.990258 405.625328 L 513.181484 401.010644 L 656.37271 352.807645 L 799.563935 359.622849 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
|
| 4139 |
<defs>
|
| 4140 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4141 |
</defs>
|
| 4142 |
+
<g clip-path="url(#p09feef2583)">
|
| 4143 |
+
<use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
|
| 4144 |
+
<use ns4:href="#m7cd35be9cc" x="226.799032" y="420.061906" style="fill: #9467bd; stroke: #9467bd" />
|
| 4145 |
+
<use ns4:href="#m7cd35be9cc" x="369.990258" y="405.625328" style="fill: #9467bd; stroke: #9467bd" />
|
| 4146 |
+
<use ns4:href="#m7cd35be9cc" x="513.181484" y="401.010644" style="fill: #9467bd; stroke: #9467bd" />
|
| 4147 |
+
<use ns4:href="#m7cd35be9cc" x="656.37271" y="352.807645" style="fill: #9467bd; stroke: #9467bd" />
|
| 4148 |
+
<use ns4:href="#m7cd35be9cc" x="799.563935" y="359.622849" style="fill: #9467bd; stroke: #9467bd" />
|
| 4149 |
</g>
|
| 4150 |
</g>
|
| 4151 |
<g id="patch_3">
|
| 4152 |
+
<path d="M 47.81 447.507117 L 47.81 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4153 |
</g>
|
| 4154 |
<g id="patch_4">
|
| 4155 |
<path d="M 835.361742 447.507117 L 835.361742 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4156 |
</g>
|
| 4157 |
<g id="patch_5">
|
| 4158 |
+
<path d="M 47.81 447.507117 L 835.361742 447.507117 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4159 |
</g>
|
| 4160 |
<g id="patch_6">
|
| 4161 |
+
<path d="M 47.81 26.88 L 835.361742 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
|
| 4162 |
</g>
|
| 4163 |
+
<g id="text_14">
|
| 4164 |
+
<text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="441.585871" y="20.88" transform="rotate(-0 441.585871 20.88)">Attention Implementation Latency</text>
|
| 4165 |
</g>
|
| 4166 |
<g id="legend" class="legend">
|
| 4167 |
<g id="patch_7">
|
| 4168 |
+
<path d="M 54.81 109.66125 L 198.305313 109.66125 Q 200.305313 109.66125 200.305313 107.66125 L 200.305313 33.88 Q 200.305313 31.88 198.305313 31.88 L 54.81 31.88 Q 52.81 31.88 52.81 33.88 L 52.81 107.66125 Q 52.81 109.66125 54.81 109.66125 L 54.81 109.66125 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
|
| 4169 |
</g>
|
| 4170 |
+
<g id="line2d_14">
|
| 4171 |
+
<path d="M 56.81 39.978438 L 66.81 39.978438 L 76.81 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4172 |
<g>
|
| 4173 |
+
<use ns4:href="#md7efaf3aec" x="66.81" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4174 |
</g>
|
| 4175 |
</g>
|
| 4176 |
<g id="legend-label--torch-flash-ma" class="legend">
|
| 4177 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="43.478438" transform="rotate(-0 84.81 43.478438)">torch_flash_ma</text>
|
| 4178 |
</g>
|
| 4179 |
+
<g id="line2d_15">
|
| 4180 |
+
<path d="M 56.81 54.934687 L 66.81 54.934687 L 76.81 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4181 |
<g>
|
| 4182 |
+
<use ns4:href="#m9b8c54d372" x="66.81" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4183 |
</g>
|
| 4184 |
</g>
|
| 4185 |
<g id="legend-label--torch-mem-eff" class="legend">
|
| 4186 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="58.434687" transform="rotate(-0 84.81 58.434687)">torch_mem_eff</text>
|
| 4187 |
</g>
|
| 4188 |
+
<g id="line2d_16">
|
| 4189 |
+
<path d="M 56.81 69.890938 L 66.81 69.890938 L 76.81 69.890938 " style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
|
| 4190 |
<g>
|
| 4191 |
+
<use ns4:href="#mc655281e0b" x="66.81" y="69.890938" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4192 |
</g>
|
| 4193 |
</g>
|
| 4194 |
<g id="legend-label--xformers-meff" class="legend">
|
| 4195 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="73.390938" transform="rotate(-0 84.81 73.390938)">xformers_meff</text>
|
| 4196 |
</g>
|
| 4197 |
+
<g id="line2d_17">
|
| 4198 |
+
<path d="M 56.81 84.847188 L 66.81 84.847188 L 76.81 84.847188 " style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
|
| 4199 |
<g>
|
| 4200 |
+
<use ns4:href="#m61c8040d7e" x="66.81" y="84.847188" style="fill: #d62728; stroke: #d62728" />
|
| 4201 |
</g>
|
| 4202 |
</g>
|
| 4203 |
<g id="legend-label--hf-kernels-flash-attn" class="legend">
|
| 4204 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="88.347188" transform="rotate(-0 84.81 88.347188)">hf_kernels_flash_attn</text>
|
| 4205 |
</g>
|
| 4206 |
+
<g id="line2d_18">
|
| 4207 |
+
<path d="M 56.81 99.803438 L 66.81 99.803438 L 76.81 99.803438 " style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
|
| 4208 |
<g>
|
| 4209 |
+
<use ns4:href="#m7cd35be9cc" x="66.81" y="99.803438" style="fill: #9467bd; stroke: #9467bd" />
|
| 4210 |
</g>
|
| 4211 |
</g>
|
| 4212 |
<g id="legend-label--hf-kernels-flash-attn3" class="legend">
|
| 4213 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.81" y="103.303438" transform="rotate(-0 84.81 103.303438)">hf_kernels_flash_attn3</text>
|
| 4214 |
</g>
|
| 4215 |
</g>
|
| 4216 |
</g>
|
| 4217 |
</g>
|
| 4218 |
<defs>
|
| 4219 |
+
<clipPath id="p09feef2583">
|
| 4220 |
+
<rect x="47.81" y="26.88" width="787.551742" height="420.627117" />
|
| 4221 |
</clipPath>
|
| 4222 |
</defs>
|
| 4223 |
</svg>
|
|
|
|
| 4230 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4231 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4232 |
</span> |
|
| 4233 |
+
Cell: combine | 4.50s
|
| 4234 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4235 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4236 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4310 |
<div class="cell-stdout"><pre class="stdout-text">======================================================================
|
| 4311 |
LOADING BENCHMARK DATA
|
| 4312 |
======================================================================
|
| 4313 |
+
✓ Flash (PyTorch SDPA) : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04
|
| 4314 |
+
✓ MemEff (PyTorch SDPA) : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/a23b7ad9cfb9e9968ec4a8f126174dc4a3ab5e6999c65a44570f93656598bd2f
|
| 4315 |
+
✓ xFormers : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58
|
| 4316 |
+
✓ HF Kernels Flash Attn : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849
|
| 4317 |
+
✓ HF Kernels Flash Attn3 : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20
|
| 4318 |
+
✓ SageAttention : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/1355120a3e88bcb74f4130be51dfe8b03e7dc2b7823f2a53b20da7899570a16f
|
| 4319 |
|
| 4320 |
✓ Found Flash (PyTorch SDPA)
|
| 4321 |
+
Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04/attention.jsonl
|
| 4322 |
✓ Found MemEff (PyTorch SDPA)
|
| 4323 |
+
Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/a23b7ad9cfb9e9968ec4a8f126174dc4a3ab5e6999c65a44570f93656598bd2f/attention.jsonl
|
| 4324 |
✓ Found xFormers
|
| 4325 |
+
Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58/attention.jsonl
|
| 4326 |
✓ Found HF Kernels Flash Attn
|
| 4327 |
+
Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849/attention.jsonl
|
| 4328 |
✓ Found HF Kernels Flash Attn3
|
| 4329 |
+
Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20/attention.jsonl
|
| 4330 |
✓ Found SageAttention
|
| 4331 |
+
Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/1355120a3e88bcb74f4130be51dfe8b03e7dc2b7823f2a53b20da7899570a16f/attention.jsonl
|
| 4332 |
|
| 4333 |
======================================================================
|
| 4334 |
Summary: 6 found, 0 skipped, 0 missing
|
|
|
|
| 4337 |
COMBINED BENCHMARK SUMMARY
|
| 4338 |
|
| 4339 |
impl wl p50(ms) ok
|
| 4340 |
+
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.98 True
|
| 4341 |
+
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.02 True
|
| 4342 |
+
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
|
| 4343 |
+
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.07 True
|
| 4344 |
+
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.23 True
|
| 4345 |
+
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
|
| 4346 |
+
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.95 True
|
| 4347 |
+
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
|
| 4348 |
+
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
|
| 4349 |
+
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.04 True
|
| 4350 |
+
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.21 True
|
| 4351 |
+
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
|
| 4352 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 4353 |
+
Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
|
| 4354 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 4355 |
+
Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
|
| 4356 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 4357 |
+
Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
|
| 4358 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 4359 |
+
Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
|
| 4360 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 4361 |
+
Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
|
| 4362 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 4363 |
+
Error: module 'sage_attention_12c766386675beb4' has no attribute 'fwd'
|
| 4364 |
+
torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
|
| 4365 |
+
torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
|
| 4366 |
+
torch_flash_ma cuda_attn_L320_bfloat16 1.31 True
|
| 4367 |
+
torch_flash_ma cuda_attn_L384_bfloat16 1.34 True
|
| 4368 |
+
torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
|
| 4369 |
+
torch_flash_ma cuda_attn_L512_bfloat16 1.52 True
|
| 4370 |
+
torch_mem_eff cuda_attn_L128_bfloat16 1.89 True
|
| 4371 |
+
torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
|
| 4372 |
+
torch_mem_eff cuda_attn_L320_bfloat16 2.05 True
|
| 4373 |
+
torch_mem_eff cuda_attn_L384_bfloat16 2.08 True
|
| 4374 |
+
torch_mem_eff cuda_attn_L448_bfloat16 2.13 True
|
| 4375 |
+
torch_mem_eff cuda_attn_L512_bfloat16 2.27 True
|
| 4376 |
+
xformers_meff cuda_attn_L128_bfloat16 1.00 True
|
| 4377 |
+
xformers_meff cuda_attn_L256_bfloat16 1.04 True
|
| 4378 |
+
xformers_meff cuda_attn_L320_bfloat16 1.09 True
|
| 4379 |
+
xformers_meff cuda_attn_L384_bfloat16 1.11 True
|
| 4380 |
+
xformers_meff cuda_attn_L448_bfloat16 1.26 True
|
| 4381 |
+
xformers_meff cuda_attn_L512_bfloat16 1.25 True
|
| 4382 |
|
| 4383 |
GENERATING COMBINED VISUALIZATION
|
| 4384 |
|
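Note: a minimal sketch of how the p50 summary above could be rebuilt from the cached attention.jsonl files. The record layout (impl, wl with a name key, lat_ms with a p50 key, ok, err) is assumed from the table columns and error lines above, not taken from the combine cell's actual source; treat it as illustrative only.

    import json
    from pathlib import Path

    def load_rows(jsonl_path):
        # Each non-empty line is assumed to be one JSON benchmark record.
        rows = []
        for line in Path(jsonl_path).read_text().splitlines():
            if not line.strip():
                continue
            rec = json.loads(line)
            # Workload may be a nested object with a "name" field (assumption).
            wl = rec["wl"]["name"] if isinstance(rec["wl"], dict) else rec["wl"]
            p50 = rec["lat_ms"]["p50"] if rec["ok"] else None
            rows.append((rec["impl"], wl, p50, rec["ok"], rec.get("err")))
        return rows

    # Print one row per (impl, workload), mirroring the summary table format.
    for impl, wl, p50, ok, err in sorted(load_rows("attention.jsonl"),
                                         key=lambda r: (r[0], r[1])):
        cell = f"{p50:.2f}" if ok else "FAIL"
        print(f"{impl:>24} {wl:>28} {cell:>8} {ok}")
        if err:
            print(f"{'':>24} Error: {err}")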
|
|
|
| 4402 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4403 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4404 |
<div class="uv-logs-content" style="display: none;">
|
| 4405 |
+
Installed 37 packages in 259ms
|
|
|
|
|
| 4406 |
</div>
|
| 4407 |
</div>
|
| 4408 |
<div class="cell-artifacts">
|
|
|
|
| 4415 |
<rdf:RDF>
|
| 4416 |
<ns2:Work>
|
| 4417 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4418 |
+
<dc:date>2025-10-27T14:46:38.946915</dc:date>
|
| 4419 |
<dc:format>image/svg+xml</dc:format>
|
| 4420 |
<dc:creator>
|
| 4421 |
<ns2:Agent>
|
|
|
|
| 4434 |
</g>
|
| 4435 |
<g id="axes--1" class="axes">
|
| 4436 |
<g id="patch_2">
|
| 4437 |
+
<path d="M 47.81 447.507117 L 835.361742 447.507117 L 835.361742 26.88 L 47.81 26.88 L 47.81 447.507117 z " style="fill: none" />
|
| 4438 |
</g>
|
| 4439 |
<g id="matplotlib.axis_1">
|
| 4440 |
<g id="xtick_1">
|
| 4441 |
<g id="grid-x--1" class="grid grid-x">
|
| 4442 |
+
<path d="M 83.607806 447.507117 L 83.607806 26.88 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4443 |
</g>
|
| 4444 |
<g id="line2d_1">
|
| 4445 |
<defs>
|
| 4446 |
<path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4447 |
</defs>
|
| 4448 |
<g>
|
| 4449 |
+
<use ns4:href="#mafb3703e5b" x="83.607806" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
|
| 4450 |
</g>
|
| 4451 |
</g>
|
| 4452 |
<g id="text_1">
|
| 4453 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(40.977554 548.84621) rotate(-45)">cuda_attn_L128_bfloat16</text>
|
| 4454 |
</g>
|
| 4455 |
</g>
|
| 4456 |
<g id="xtick_2">
|
| 4457 |
<g id="grid-x--2" class="grid grid-x">
|
| 4458 |
+
<path d="M 226.799032 447.507117 L 226.799032 26.88 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4459 |
</g>
|
| 4460 |
<g id="line2d_2">
|
| 4461 |
<g>
|
| 4462 |
+
<use ns4:href="#mafb3703e5b" x="226.799032" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
|
| 4463 |
</g>
|
| 4464 |
</g>
|
| 4465 |
<g id="text_2">
|
| 4466 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(184.16878 548.84621) rotate(-45)">cuda_attn_L256_bfloat16</text>
|
| 4467 |
</g>
|
| 4468 |
</g>
|
| 4469 |
<g id="xtick_3">
|
| 4470 |
<g id="grid-x--3" class="grid grid-x">
|
| 4471 |
+
<path d="M 369.990258 447.507117 L 369.990258 26.88 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4472 |
</g>
|
| 4473 |
<g id="line2d_3">
|
| 4474 |
<g>
|
| 4475 |
+
<use ns4:href="#mafb3703e5b" x="369.990258" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
|
| 4476 |
</g>
|
| 4477 |
</g>
|
| 4478 |
<g id="text_3">
|
| 4479 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(327.360005 548.84621) rotate(-45)">cuda_attn_L320_bfloat16</text>
|
| 4480 |
</g>
|
| 4481 |
</g>
|
| 4482 |
<g id="xtick_4">
|
| 4483 |
<g id="grid-x--4" class="grid grid-x">
|
| 4484 |
+
<path d="M 513.181484 447.507117 L 513.181484 26.88 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4485 |
</g>
|
| 4486 |
<g id="line2d_4">
|
| 4487 |
<g>
|
| 4488 |
+
<use ns4:href="#mafb3703e5b" x="513.181484" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
|
| 4489 |
</g>
|
| 4490 |
</g>
|
| 4491 |
<g id="text_4">
|
| 4492 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(470.551231 548.84621) rotate(-45)">cuda_attn_L384_bfloat16</text>
|
| 4493 |
</g>
|
| 4494 |
</g>
|
| 4495 |
<g id="xtick_5">
|
| 4496 |
<g id="grid-x--5" class="grid grid-x">
|
| 4497 |
+
<path d="M 656.37271 447.507117 L 656.37271 26.88 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4498 |
</g>
|
| 4499 |
<g id="line2d_5">
|
| 4500 |
<g>
|
| 4501 |
+
<use ns4:href="#mafb3703e5b" x="656.37271" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
|
| 4502 |
</g>
|
| 4503 |
</g>
|
| 4504 |
<g id="text_5">
|
| 4505 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(613.742457 548.84621) rotate(-45)">cuda_attn_L448_bfloat16</text>
|
| 4506 |
</g>
|
| 4507 |
</g>
|
| 4508 |
<g id="xtick_6">
|
| 4509 |
<g id="grid-x--6" class="grid grid-x">
|
| 4510 |
+
<path d="M 799.563935 447.507117 L 799.563935 26.88 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4511 |
</g>
|
| 4512 |
<g id="line2d_6">
|
| 4513 |
<g>
|
| 4514 |
+
<use ns4:href="#mafb3703e5b" x="799.563935" y="447.507117" style="stroke: #000000; stroke-width: 0.8" />
|
| 4515 |
</g>
|
| 4516 |
</g>
|
| 4517 |
<g id="text_6">
|
| 4518 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(756.933683 548.84621) rotate(-45)">cuda_attn_L512_bfloat16</text>
|
| 4519 |
</g>
|
| 4520 |
</g>
|
| 4521 |
<g id="label--x" class="xlabel">
|
| 4522 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="441.585871" y="562.111872" transform="rotate(-0 441.585871 562.111872)">Workload</text>
|
| 4523 |
</g>
|
| 4524 |
</g>
|
| 4525 |
<g id="matplotlib.axis_2">
|
| 4526 |
<g id="ytick_1">
|
| 4527 |
<g id="grid-y--2" class="grid grid-y">
|
| 4528 |
+
<path d="M 47.81 413.210177 L 835.361742 413.210177 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4529 |
</g>
|
| 4530 |
<g id="line2d_7">
|
| 4531 |
<defs>
|
| 4532 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4533 |
</defs>
|
| 4534 |
<g>
|
| 4535 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="413.210177" style="stroke: #000000; stroke-width: 0.8" />
|
| 4536 |
</g>
|
| 4537 |
</g>
|
| 4538 |
<g id="text_7">
|
| 4539 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="417.009396" transform="rotate(-0 40.81 417.009396)">1.0</text>
|
| 4540 |
</g>
|
| 4541 |
</g>
|
| 4542 |
<g id="ytick_2">
|
| 4543 |
<g id="grid-y--3" class="grid grid-y">
|
| 4544 |
+
<path d="M 47.81 355.233116 L 835.361742 355.233116 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4545 |
</g>
|
| 4546 |
<g id="line2d_8">
|
| 4547 |
<g>
|
| 4548 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="355.233116" style="stroke: #000000; stroke-width: 0.8" />
|
| 4549 |
</g>
|
| 4550 |
</g>
|
| 4551 |
<g id="text_8">
|
| 4552 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="359.032335" transform="rotate(-0 40.81 359.032335)">1.2</text>
|
| 4553 |
</g>
|
| 4554 |
</g>
|
| 4555 |
<g id="ytick_3">
|
| 4556 |
<g id="grid-y--4" class="grid grid-y">
|
| 4557 |
+
<path d="M 47.81 297.256055 L 835.361742 297.256055 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4558 |
</g>
|
| 4559 |
<g id="line2d_9">
|
| 4560 |
<g>
|
| 4561 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="297.256055" style="stroke: #000000; stroke-width: 0.8" />
|
| 4562 |
</g>
|
| 4563 |
</g>
|
| 4564 |
<g id="text_9">
|
| 4565 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="301.055273" transform="rotate(-0 40.81 301.055273)">1.4</text>
|
| 4566 |
</g>
|
| 4567 |
</g>
|
| 4568 |
<g id="ytick_4">
|
| 4569 |
<g id="grid-y--5" class="grid grid-y">
|
| 4570 |
+
<path d="M 47.81 239.278993 L 835.361742 239.278993 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4571 |
</g>
|
| 4572 |
<g id="line2d_10">
|
| 4573 |
<g>
|
| 4574 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="239.278993" style="stroke: #000000; stroke-width: 0.8" />
|
| 4575 |
</g>
|
| 4576 |
</g>
|
| 4577 |
<g id="text_10">
|
| 4578 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="243.078212" transform="rotate(-0 40.81 243.078212)">1.6</text>
|
| 4579 |
</g>
|
| 4580 |
</g>
|
| 4581 |
<g id="ytick_5">
|
| 4582 |
<g id="grid-y--6" class="grid grid-y">
|
| 4583 |
+
<path d="M 47.81 181.301932 L 835.361742 181.301932 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4584 |
</g>
|
| 4585 |
<g id="line2d_11">
|
| 4586 |
<g>
|
| 4587 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="181.301932" style="stroke: #000000; stroke-width: 0.8" />
|
| 4588 |
</g>
|
| 4589 |
</g>
|
| 4590 |
<g id="text_11">
|
| 4591 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="185.101151" transform="rotate(-0 40.81 185.101151)">1.8</text>
|
| 4592 |
</g>
|
| 4593 |
</g>
|
| 4594 |
<g id="ytick_6">
|
| 4595 |
<g id="grid-y--7" class="grid grid-y">
|
| 4596 |
+
<path d="M 47.81 123.324871 L 835.361742 123.324871 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4597 |
</g>
|
| 4598 |
<g id="line2d_12">
|
| 4599 |
<g>
|
| 4600 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="123.324871" style="stroke: #000000; stroke-width: 0.8" />
|
| 4601 |
</g>
|
| 4602 |
</g>
|
| 4603 |
<g id="text_12">
|
| 4604 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="127.124089" transform="rotate(-0 40.81 127.124089)">2.0</text>
|
| 4605 |
+
</g>
|
| 4606 |
+
</g>
|
| 4607 |
+
<g id="ytick_7">
|
| 4608 |
+
<g id="grid-y--8" class="grid grid-y">
|
| 4609 |
+
<path d="M 47.81 65.347809 L 835.361742 65.347809 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4610 |
+
</g>
|
| 4611 |
+
<g id="line2d_13">
|
| 4612 |
+
<g>
|
| 4613 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="65.347809" style="stroke: #000000; stroke-width: 0.8" />
|
| 4614 |
+
</g>
|
| 4615 |
+
</g>
|
| 4616 |
+
<g id="text_13">
|
| 4617 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="69.147028" transform="rotate(-0 40.81 69.147028)">2.2</text>
|
| 4618 |
</g>
|
| 4619 |
</g>
|
| 4620 |
<g id="label--y" class="ylabel">
|
| 4621 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.827187" y="237.193558" transform="rotate(-90 18.827187 237.193558)">Latency P50 (ms)</text>
|
| 4622 |
</g>
|
| 4623 |
</g>
|
| 4624 |
<g id="series--torch-flash-ma" class="series">
|
| 4625 |
+
<path d="M 83.607806 349.439178 L 226.799032 333.602454 L 369.990258 324.473676 L 513.181484 316.069901 L 656.37271 272.899601 L 799.563935 261.559288 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4626 |
<defs>
|
| 4627 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4628 |
</defs>
|
| 4629 |
+
<g clip-path="url(#p09feef2583)">
|
| 4630 |
+
<use ns4:href="#md7efaf3aec" x="83.607806" y="349.439178" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4631 |
+
<use ns4:href="#md7efaf3aec" x="226.799032" y="333.602454" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4632 |
+
<use ns4:href="#md7efaf3aec" x="369.990258" y="324.473676" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4633 |
+
<use ns4:href="#md7efaf3aec" x="513.181484" y="316.069901" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4634 |
+
<use ns4:href="#md7efaf3aec" x="656.37271" y="272.899601" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4635 |
+
<use ns4:href="#md7efaf3aec" x="799.563935" y="261.559288" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4636 |
</g>
|
| 4637 |
</g>
|
| 4638 |
<g id="series--torch-mem-eff" class="series">
|
| 4639 |
+
<path d="M 83.607806 156.020744 L 226.799032 138.969401 L 369.990258 109.128607 L 513.181484 99.249026 L 656.37271 87.05645 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4640 |
<defs>
|
| 4641 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4642 |
</defs>
|
| 4643 |
+
<g clip-path="url(#p09feef2583)">
|
| 4644 |
+
<use ns4:href="#m9b8c54d372" x="83.607806" y="156.020744" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4645 |
+
<use ns4:href="#m9b8c54d372" x="226.799032" y="138.969401" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4646 |
+
[Tail of an embedded SVG omitted: line chart titled "Attention Implementation Latency", one polyline-with-markers series per implementation, with a legend box listing torch_flash_ma, torch_mem_eff, xformers_meff, hf_kernels_flash_attn, and hf_kernels_flash_attn3.]
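For orientation, a chart of this shape could be regenerated with matplotlib roughly as follows. This is a hypothetical sketch: only the title and series names are taken from the SVG above; the latency values below are placeholders, not the measured data.

# Hypothetical sketch of the latency chart; values are placeholders.
import matplotlib.pyplot as plt

series = {
    "torch_flash_ma":         [5.6, 5.9, 6.1, 6.3, 7.4, 7.4],
    "torch_mem_eff":          [5.5, 5.7, 6.0, 6.2, 7.2, 7.3],
    "xformers_meff":          [5.4, 5.6, 5.9, 6.0, 7.0, 7.0],
    "hf_kernels_flash_attn":  [5.3, 5.5, 5.8, 5.9, 6.9, 6.9],
    "hf_kernels_flash_attn3": [5.2, 5.4, 5.6, 5.8, 6.7, 6.6],
}
xs = range(6)  # one point per workload, six workloads per series as in the plot
fig, ax = plt.subplots(figsize=(10, 5))
for name, p50 in series.items():
    ax.plot(xs, p50, marker="o", label=name)
ax.set_title("Attention Implementation Latency")
ax.set_ylabel("latency (ms)")
ax.legend()
fig.savefig("latency.svg")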
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl CHANGED
The diff for this file is too large to render. See raw diff
layer_norm/impls/cells/benchmark.py CHANGED
@@ -3,6 +3,7 @@
 # dependencies = [
 #   "numpy",
 #   "torch==2.8.0",
+#   "kernels",
 #   "kernels-benchmark-tools",
 # ]
 #
@@ -12,15 +13,37 @@
 import torch
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
 
+# Load the layer norm kernel
+layer_norm_kernel = get_kernel("kernels-community/layer-norm")
 
+
+def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
+    B, S, D = x.shape
+    # The kernel expects [N, D] input; support beta (bias) if provided.
+    out = layer_norm_kernel.dropout_add_ln_fwd(
+        input=x.view(-1, D),
+        gamma=weight,
+        beta=bias,
+        rowscale=None,
+        colscale=None,
+        x0_subset=None,
+        z_subset=None,
+        dropout_p=0.0,
+        epsilon=eps,
+        rowscale_const=1.0,
+        z_numrows=S,
+        gen=None,
+        residual_in_fp32=False,
+        is_rms_norm=False,
+    )[0].view(B, S, D)
+    return out
-
-
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.LAYER_NORM,
-    impl_name="
-    impl_tags={"family": "
-    impl_func=
+    impl_name="hf_kernels_layer_norm",
+    impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
+    impl_func=hf_kernels_layer_norm,
 )
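The new cell registers the wrapper through run_benchmark; as a standalone correctness check, the same wrapper can be compared against torch.nn.functional.layer_norm. A minimal sketch, assuming a CUDA device and that the kernels package can fetch kernels-community/layer-norm; the bf16 tolerances here are illustrative, not the suite's settings.

# Sanity-check sketch: compare the dropout_add_ln_fwd wrapper against
# PyTorch's reference layer_norm (assumes CUDA; tolerances are illustrative).
import torch
import torch.nn.functional as F
from kernels import get_kernel

layer_norm_kernel = get_kernel("kernels-community/layer-norm")

def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
    B, S, D = x.shape
    out = layer_norm_kernel.dropout_add_ln_fwd(
        input=x.view(-1, D), gamma=weight, beta=bias,
        rowscale=None, colscale=None, x0_subset=None, z_subset=None,
        dropout_p=0.0, epsilon=eps, rowscale_const=1.0, z_numrows=S,
        gen=None, residual_in_fp32=False, is_rms_norm=False,
    )[0].view(B, S, D)
    return out

x = torch.randn(4, 512, 1024, device="cuda", dtype=torch.bfloat16)
w = torch.randn(1024, device="cuda", dtype=torch.bfloat16)
b = torch.randn(1024, device="cuda", dtype=torch.bfloat16)
ref = F.layer_norm(x, (1024,), weight=w, bias=b, eps=1e-5)
print(torch.allclose(hf_kernels_layer_norm(x, w, b), ref, rtol=2e-2, atol=2e-2))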
layer_norm/impls/hf_kernels_layer_norm.html CHANGED
The diff for this file is too large to render. See raw diff

layer_norm/impls/torch_layer_norm.html CHANGED
The diff for this file is too large to render. See raw diff

layer_norm/results/artifacts/combine/latency.svg CHANGED
Git LFS Details
layer_norm/results/combined_results.html CHANGED
@@ -3857,7 +3857,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <div class="system-info">
 <div class="system-info-header">Generated on:</div>
 <div class="system-info-content">
-Linux x86_64 | Linux-5.
+Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35
 </div>
 </div>
 
@@ -3872,7 +3872,7 @@
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-
+<dc:date>2025-10-27T14:46:34.455868</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
@@ -3900,7 +3900,7 @@
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: combine | 38.84s
+Cell: combine | 4.28s
 | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -3972,13 +3972,13 @@ Cell: combine | 38.84s
 <div class="cell-stdout"><pre class="stdout-text">======================================================================
 LOADING BENCHMARK DATA
 ======================================================================
-✓ PyTorch LayerNorm : /
-✓ HF Kernels LayerNorm : /
+✓ PyTorch LayerNorm : /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/4403c31e9bef6e648597b4fcc9cfdc402678aaa4f90636b74325f12d334214a3
+✓ HF Kernels LayerNorm : /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/bd278151199f29b397d85857b87922edaa39a62623fb28e0465de47d6a3bac74
 
 ✓ Found PyTorch LayerNorm
-  Path: /
+  Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/4403c31e9bef6e648597b4fcc9cfdc402678aaa4f90636b74325f12d334214a3/layer_norm.jsonl
 ✓ Found HF Kernels LayerNorm
-  Path: /
+  Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/layer_norm/impls/.uvnote/cache/bd278151199f29b397d85857b87922edaa39a62623fb28e0465de47d6a3bac74/layer_norm.jsonl
 
 ======================================================================
 Summary: 2 found, 0 skipped, 0 missing
@@ -3987,102 +3987,102 @@ Summary: 2 found, 0 skipped, 0 missing
 COMBINED BENCHMARK SUMMARY
 
 impl wl p50(ms) ok
-hf_kernels_layer_norm LN_B16_S1024_D1024 0.
-hf_kernels_layer_norm LN_B16_S1024_D2048 0.
-hf_kernels_layer_norm LN_B16_S1024_D4096
-hf_kernels_layer_norm LN_B16_S1024_D8192
+hf_kernels_layer_norm LN_B16_S1024_D1024 0.05 False
+hf_kernels_layer_norm LN_B16_S1024_D2048 0.22 False
+hf_kernels_layer_norm LN_B16_S1024_D4096 0.44 False
+hf_kernels_layer_norm LN_B16_S1024_D8192 0.84 False
 hf_kernels_layer_norm LN_B16_S128_D1024 0.05 False
 hf_kernels_layer_norm LN_B16_S128_D2048 0.05 False
-hf_kernels_layer_norm LN_B16_S128_D4096 0.
-hf_kernels_layer_norm LN_B16_S128_D8192 0.
-hf_kernels_layer_norm LN_B16_S2048_D1024 0.
-hf_kernels_layer_norm LN_B16_S2048_D2048
-hf_kernels_layer_norm LN_B16_S2048_D4096
-hf_kernels_layer_norm LN_B16_S2048_D8192
-hf_kernels_layer_norm LN_B16_S512_D1024 0.
-hf_kernels_layer_norm LN_B16_S512_D2048 0.
-hf_kernels_layer_norm LN_B16_S512_D4096 0.
-hf_kernels_layer_norm LN_B16_S512_D8192
+hf_kernels_layer_norm LN_B16_S128_D4096 0.05 False
+hf_kernels_layer_norm LN_B16_S128_D8192 0.05 False
+hf_kernels_layer_norm LN_B16_S2048_D1024 0.21 False
+hf_kernels_layer_norm LN_B16_S2048_D2048 0.46 False
+hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 False
+hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 False
+hf_kernels_layer_norm LN_B16_S512_D1024 0.05 False
+hf_kernels_layer_norm LN_B16_S512_D2048 0.05 False
+hf_kernels_layer_norm LN_B16_S512_D4096 0.21 False
+hf_kernels_layer_norm LN_B16_S512_D8192 0.43 False
 hf_kernels_layer_norm LN_B1_S1024_D1024 0.05 False
 hf_kernels_layer_norm LN_B1_S1024_D2048 0.05 False
 hf_kernels_layer_norm LN_B1_S1024_D4096 0.05 False
-hf_kernels_layer_norm LN_B1_S1024_D8192 0.
-hf_kernels_layer_norm LN_B1_S128_D1024 0.
+hf_kernels_layer_norm LN_B1_S1024_D8192 0.05 False
+hf_kernels_layer_norm LN_B1_S128_D1024 0.04 False
 hf_kernels_layer_norm LN_B1_S128_D2048 0.05 False
 hf_kernels_layer_norm LN_B1_S128_D4096 0.05 False
 hf_kernels_layer_norm LN_B1_S128_D8192 0.05 False
 hf_kernels_layer_norm LN_B1_S2048_D1024 0.05 False
 hf_kernels_layer_norm LN_B1_S2048_D2048 0.05 False
-hf_kernels_layer_norm LN_B1_S2048_D4096 0.
-hf_kernels_layer_norm LN_B1_S2048_D8192 0.
+hf_kernels_layer_norm LN_B1_S2048_D4096 0.05 False
+hf_kernels_layer_norm LN_B1_S2048_D8192 0.05 False
 hf_kernels_layer_norm LN_B1_S512_D1024 0.05 False
 hf_kernels_layer_norm LN_B1_S512_D2048 0.05 False
 hf_kernels_layer_norm LN_B1_S512_D4096 0.05 False
 hf_kernels_layer_norm LN_B1_S512_D8192 0.05 False
 hf_kernels_layer_norm LN_B4_S1024_D1024 0.05 False
-hf_kernels_layer_norm LN_B4_S1024_D2048 0.
-hf_kernels_layer_norm LN_B4_S1024_D4096 0.
-hf_kernels_layer_norm LN_B4_S1024_D8192 0.
+hf_kernels_layer_norm LN_B4_S1024_D2048 0.05 False
+hf_kernels_layer_norm LN_B4_S1024_D4096 0.05 False
+hf_kernels_layer_norm LN_B4_S1024_D8192 0.21 False
 hf_kernels_layer_norm LN_B4_S128_D1024 0.05 False
 hf_kernels_layer_norm LN_B4_S128_D2048 0.05 False
 hf_kernels_layer_norm LN_B4_S128_D4096 0.05 False
 hf_kernels_layer_norm LN_B4_S128_D8192 0.05 False
-hf_kernels_layer_norm LN_B4_S2048_D1024 0.
-hf_kernels_layer_norm LN_B4_S2048_D2048 0.
-hf_kernels_layer_norm LN_B4_S2048_D4096 0.
-hf_kernels_layer_norm LN_B4_S2048_D8192
+hf_kernels_layer_norm LN_B4_S2048_D1024 0.05 False
+hf_kernels_layer_norm LN_B4_S2048_D2048 0.06 False
+hf_kernels_layer_norm LN_B4_S2048_D4096 0.21 False
+hf_kernels_layer_norm LN_B4_S2048_D8192 0.44 False
 hf_kernels_layer_norm LN_B4_S512_D1024 0.05 False
 hf_kernels_layer_norm LN_B4_S512_D2048 0.05 False
-hf_kernels_layer_norm LN_B4_S512_D4096 0.
-hf_kernels_layer_norm LN_B4_S512_D8192 0.
-torch_layer_norm LN_B16_S1024_D1024 0.
-torch_layer_norm LN_B16_S1024_D2048 0.
-torch_layer_norm LN_B16_S1024_D4096
-torch_layer_norm LN_B16_S1024_D8192
+hf_kernels_layer_norm LN_B4_S512_D4096 0.05 False
+hf_kernels_layer_norm LN_B4_S512_D8192 0.05 False
+torch_layer_norm LN_B16_S1024_D1024 0.05 False
+torch_layer_norm LN_B16_S1024_D2048 0.21 False
+torch_layer_norm LN_B16_S1024_D4096 0.42 False
+torch_layer_norm LN_B16_S1024_D8192 0.85 False
 torch_layer_norm LN_B16_S128_D1024 0.03 False
-torch_layer_norm LN_B16_S128_D2048 0.
-torch_layer_norm LN_B16_S128_D4096 0.
-torch_layer_norm LN_B16_S128_D8192 0.
-torch_layer_norm LN_B16_S2048_D1024 0.
-torch_layer_norm LN_B16_S2048_D2048
-torch_layer_norm LN_B16_S2048_D4096
-torch_layer_norm LN_B16_S2048_D8192
-torch_layer_norm LN_B16_S512_D1024 0.
-torch_layer_norm LN_B16_S512_D2048 0.
-torch_layer_norm LN_B16_S512_D4096 0.
-torch_layer_norm LN_B16_S512_D8192
+torch_layer_norm LN_B16_S128_D2048 0.03 False
+torch_layer_norm LN_B16_S128_D4096 0.04 False
+torch_layer_norm LN_B16_S128_D8192 0.05 False
+torch_layer_norm LN_B16_S2048_D1024 0.21 False
+torch_layer_norm LN_B16_S2048_D2048 0.42 False
+torch_layer_norm LN_B16_S2048_D4096 0.82 False
+torch_layer_norm LN_B16_S2048_D8192 1.68 False
+torch_layer_norm LN_B16_S512_D1024 0.04 False
+torch_layer_norm LN_B16_S512_D2048 0.05 False
+torch_layer_norm LN_B16_S512_D4096 0.21 False
+torch_layer_norm LN_B16_S512_D8192 0.43 False
 torch_layer_norm LN_B1_S1024_D1024 0.03 False
 torch_layer_norm LN_B1_S1024_D2048 0.03 False
-torch_layer_norm LN_B1_S1024_D4096 0.
-torch_layer_norm LN_B1_S1024_D8192 0.
-torch_layer_norm LN_B1_S128_D1024 0.
+torch_layer_norm LN_B1_S1024_D4096 0.03 False
+torch_layer_norm LN_B1_S1024_D8192 0.04 False
+torch_layer_norm LN_B1_S128_D1024 0.02 False
 torch_layer_norm LN_B1_S128_D2048 0.03 False
 torch_layer_norm LN_B1_S128_D4096 0.03 False
 torch_layer_norm LN_B1_S128_D8192 0.03 False
-torch_layer_norm LN_B1_S2048_D1024 0.
-torch_layer_norm LN_B1_S2048_D2048 0.
-torch_layer_norm LN_B1_S2048_D4096 0.
-torch_layer_norm LN_B1_S2048_D8192 0.
+torch_layer_norm LN_B1_S2048_D1024 0.03 False
+torch_layer_norm LN_B1_S2048_D2048 0.03 False
+torch_layer_norm LN_B1_S2048_D4096 0.04 False
+torch_layer_norm LN_B1_S2048_D8192 0.05 False
 torch_layer_norm LN_B1_S512_D1024 0.03 False
 torch_layer_norm LN_B1_S512_D2048 0.03 False
 torch_layer_norm LN_B1_S512_D4096 0.03 False
-torch_layer_norm LN_B1_S512_D8192 0.
-torch_layer_norm LN_B4_S1024_D1024 0.
-torch_layer_norm LN_B4_S1024_D2048 0.
-torch_layer_norm LN_B4_S1024_D4096 0.
-torch_layer_norm LN_B4_S1024_D8192 0.
+torch_layer_norm LN_B1_S512_D8192 0.03 False
+torch_layer_norm LN_B4_S1024_D1024 0.03 False
+torch_layer_norm LN_B4_S1024_D2048 0.04 False
+torch_layer_norm LN_B4_S1024_D4096 0.05 False
+torch_layer_norm LN_B4_S1024_D8192 0.20 False
 torch_layer_norm LN_B4_S128_D1024 0.03 False
 torch_layer_norm LN_B4_S128_D2048 0.03 False
 torch_layer_norm LN_B4_S128_D4096 0.03 False
-torch_layer_norm LN_B4_S128_D8192 0.
-torch_layer_norm LN_B4_S2048_D1024 0.
-torch_layer_norm LN_B4_S2048_D2048 0.
-torch_layer_norm LN_B4_S2048_D4096 0.
-torch_layer_norm LN_B4_S2048_D8192
+torch_layer_norm LN_B4_S128_D8192 0.03 False
+torch_layer_norm LN_B4_S2048_D1024 0.04 False
+torch_layer_norm LN_B4_S2048_D2048 0.05 False
+torch_layer_norm LN_B4_S2048_D4096 0.21 False
+torch_layer_norm LN_B4_S2048_D8192 0.44 False
 torch_layer_norm LN_B4_S512_D1024 0.03 False
-torch_layer_norm LN_B4_S512_D2048 0.
-torch_layer_norm LN_B4_S512_D4096 0.
-torch_layer_norm LN_B4_S512_D8192 0.
+torch_layer_norm LN_B4_S512_D2048 0.03 False
+torch_layer_norm LN_B4_S512_D4096 0.04 False
+torch_layer_norm LN_B4_S512_D8192 0.05 False
 
 GENERATING COMBINED VISUALIZATION
 
@@ -4101,53 +4101,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-
-Downloading sympy (6.0MiB)
-Downloading pillow (6.7MiB)
-Downloading setuptools (1.1MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading networkx (1.9MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading numpy (15.9MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading torch (846.8MiB)
-Downloading triton (148.4MiB)
-Downloading nvidia-cufile-cu12
-Downloading kiwisolver
-Downloading setuptools
-Downloading networkx
-Downloading fonttools
-Downloading pillow
-Built kernels-benchmark-tools @ file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools
-Downloading matplotlib
-Downloading nvidia-cuda-cupti-cu12
-Downloading numpy
-Downloading nvidia-nvjitlink-cu12
-Downloading sympy
-Downloading nvidia-curand-cu12
-Downloading nvidia-cuda-nvrtc-cu12
-Downloading triton
-Downloading nvidia-cufft-cu12
-Downloading nvidia-cusolver-cu12
-Downloading nvidia-cusparse-cu12
-Downloading nvidia-cusparselt-cu12
-Downloading nvidia-nccl-cu12
-Downloading nvidia-cublas-cu12
-Downloading nvidia-cudnn-cu12
-Downloading torch
-Installed 37 packages in 205ms
+Installed 37 packages in 260ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4160,7 +4114,7 @@
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-
+<dc:date>2025-10-27T14:46:34.455868</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
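The combined summary above is assembled from the per-implementation layer_norm.jsonl artifacts. A minimal sketch of reproducing the impl / wl / p50(ms) / ok columns from one such file; the record field names are inferred from the table header, and the path is a placeholder:

# Hypothetical sketch: rebuild the "impl wl p50(ms) ok" summary from a
# benchmark JSONL artifact (field names inferred; path is a placeholder).
import json

def summarize(path: str) -> None:
    with open(path) as f:
        for line in f:
            rec = json.loads(line)
            print(f"{rec['impl']:>24} {rec['wl']['name']:>20} "
                  f"{rec['lat_ms']['p50']:8.2f} {rec['ok']}")

summarize("layer_norm/impls/artifacts/benchmark/layer_norm.jsonl")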