+title: "Flash Attention Benchmark"
+author: "uvnote"
+theme: "dark"
+syntax_theme: "monokai"
+show_line_numbers: true
+collapse_code: false
+custom_css: |
+  #output-setup {
+    overflow-x: auto;
+  }
+  .cell-output {
+    overflow: scroll;
+  }
+  .cell-stdout {
+    width: max-content;
+    overflow: scroll;
+  }
+  .cell-stderr {
+    width: max-content;
+    overflow: scroll;
+    max-height: 300px;
+  }
+
+
+
+
+Cell: benchmark | 50.28s | FAILED
+
+# /// script
+# dependencies = [
+# "numpy",
+# "torch",
+# "kernels",
+# "pandas",
+# "matplotlib"
+# ]
+# ///
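+# Note: the optional attention backends exercised below (flash-attn, flash_attn_interface,
+# sageattention, transformer_engine, xformers) are not part of the inline script metadata above;
+# the try/except imports below simply fall back to None when they are not installed.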
+# Benchmarking common shapes for Flux 1024x1024px image + varying text sequence lengths
+
+import functools
+import os
+import pathlib
+
+import matplotlib.pyplot as plt
+import torch
+import torch._dynamo.config
+import triton
+import triton.language as tl
+
+try:
+ from flash_attn import flash_attn_func
+except Exception:
+ flash_attn_func = None
+ print("Flash Attention 2 not found.")
+
+try:
+ from flash_attn_interface import flash_attn_func as flash_attn_3_func
+except Exception:
+ flash_attn_3_func = None
+ print("Flash Attention 3 not found.")
+
+try:
+ from kernels import get_kernel
+ hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn")
+ hf_kernels_flash_attn_3 = get_kernel("kernels-community/flash-attn3")
+except Exception:
+ hf_kernels_flash_attn = None
+ hf_kernels_flash_attn_3 = None
+ print("HF Kernels not found.")
+
+try:
+ from sageattention import sageattn_qk_int8_pv_fp16_cuda, sageattn_qk_int8_pv_fp16_triton, sageattn_qk_int8_pv_fp8_cuda_sm90
+except Exception:
+ sageattn_qk_int8_pv_fp16_cuda = None
+ sageattn_qk_int8_pv_fp16_triton = None
+ sageattn_qk_int8_pv_fp8_cuda_sm90 = None
+ print("SageAttention not found.")
+
+try:
+ from transformer_engine.pytorch.attention import DotProductAttention
+except Exception:
+ DotProductAttention = None
+ print("Transformer Engine not found.")
+
+try:
+ import xformers.ops as xops
+except Exception:
+ xops = None
+ print("xFormers not found.")
+
+
+plt.rcParams.update({
+ "figure.figsize": (12, 10),
+ "figure.dpi": 120,
+ "font.size": 10,
+ "axes.titlesize": 12,
+ "axes.labelsize": 14,
+ "xtick.labelsize": 10,
+ "ytick.labelsize": 10,
+ "legend.fontsize": 8,
+ "axes.grid": True,
+ "grid.alpha": 0.3,
+ "grid.linestyle": "--",
+ "lines.linewidth": 2.0,
+ "lines.markersize": 6,
+ "legend.frameon": True,
+ "legend.framealpha": 0.9,
+ "legend.loc": "best",
+ "axes.spines.top": False,
+ "axes.spines.right": False,
+})
+
+
+# We want to compare the best compiled version for each specific shape (dynamic=False)
+torch._dynamo.config.cache_size_limit = 10000
+
+# We need suppress_errors for FA3 to work under torch.compile: when compilation fails, it falls back to eager mode.
+# I can't seem to get it to work any other way, so any suggestions are welcome!
+torch._dynamo.config.suppress_errors = True
+
+output_dir = pathlib.Path("dump_attention_benchmark")
+output_dir.mkdir(parents=True, exist_ok=True)
+
+batch_size = 1
+num_attention_heads = 24
+attention_head_dim = 128
+image_sequence_length = 4096 # 1024x1024px
+text_sequence_lengths = [128, 256, 320, 384, 448, 512]
+sequence_lengths = [image_sequence_length + i for i in text_sequence_lengths]
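+# Rough sanity check on the image token count (assuming Flux's 8x VAE downsampling and
+# 2x2 patchification): 1024 / 8 / 2 = 64 latent patches per side, and 64 * 64 = 4096 tokens,
+# so each benchmarked sequence is 4096 image tokens plus the text tokens above.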
+
+
+def _attention_torch(query, key, value, *, backend):
+    query, key, value = (x.transpose(1, 2).contiguous() for x in (query, key, value))
+    with torch.nn.attention.sdpa_kernel(backend):
+        out = torch.nn.functional.scaled_dot_product_attention(query, key, value)
+    out = out.transpose(1, 2).contiguous()
+    return out
+
+
+_compiled_attention_torch_default = torch.compile(_attention_torch, mode="default", fullgraph=True, dynamic=False)
+def _attention_torch_compile_default(query, key, value, *, backend):
+ return _compiled_attention_torch_default(query, key, value, backend=backend)
+
+
+_compiled_attention_torch_max_autotune = torch.compile(_attention_torch, mode="max-autotune", fullgraph=True, dynamic=False)
+def _attention_torch_compile_max_autotune(query, key, value, *, backend):
+ return _compiled_attention_torch_max_autotune(query, key, value, backend=backend)
+
+
+def _attention_flash_attn_2(query, key, value):
+ return flash_attn_func(query, key, value)
+
+
+_compiled_flash_attn_2_default = torch.compile(_attention_flash_attn_2, mode="default", fullgraph=True, dynamic=False)
+def _attention_flash_attn_2_compile_default(query, key, value):
+ return _compiled_flash_attn_2_default(query, key, value)
+
+
+_compiled_flash_attn_2_max_autotune = torch.compile(_attention_flash_attn_2, mode="max-autotune", fullgraph=True, dynamic=False)
+def _attention_flash_attn_2_compile_max_autotune(query, key, value):
+ return _compiled_flash_attn_2_max_autotune(query, key, value)
+
+
+# Wrap FA3 in a torch.library custom op so it can be traced with fullgraph=True
+@torch.library.custom_op("flash_attn_3::_flash_attn_forward", mutates_args=(), device_types="cuda")
+def _wrapped_flash_attn_3(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
+ out, lse = flash_attn_3_func(query, key, value)
+ return out
+
+
+@torch.library.register_fake("flash_attn_3::_flash_attn_forward")
+def _(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
+ return torch.empty_like(query)
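+# The fake (meta) registration above only reports the output shape/dtype; it lets dynamo trace
+# through the opaque FA3 extension call without executing it, which is what the fullgraph=True
+# wrappers below rely on.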
+
+
+def _attention_flash_attn_3(query, key, value):
+ out = _wrapped_flash_attn_3(query, key, value)
+ return out
+
+
+_compiled_flash_attn_3_default = torch.compile(_attention_flash_attn_3, mode="default", fullgraph=True, dynamic=False)
+def _attention_flash_attn_3_compile_default(query, key, value):
+ return _compiled_flash_attn_3_default(query, key, value)
+
+
+_compiled_flash_attn_3_max_autotune = torch.compile(_attention_flash_attn_3, mode="max-autotune", fullgraph=True, dynamic=False)
+def _attention_flash_attn_3_compile_max_autotune(query, key, value):
+ return _compiled_flash_attn_3_max_autotune(query, key, value)
+
+
+def _attention_hf_kernels_flash_attn(query, key, value):
+ return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0]
+
+
+def _attention_hf_kernels_flash_attn3(query, key, value):
+ return hf_kernels_flash_attn_3.flash_attn_func(query, key, value, causal=False)[0]
+
+
+def _attention_sageattn_qk_int8_pv_fp16_cuda(query, key, value):
+ return sageattn_qk_int8_pv_fp16_cuda(query, key, value, tensor_layout="NHD")
+
+
+def _attention_sageattn_qk_int8_pv_fp16_triton(query, key, value):
+ return sageattn_qk_int8_pv_fp16_triton(query, key, value, tensor_layout="NHD")
+
+
+def _attention_sageattn_qk_int8_pv_fp8_cuda_sm90(query, key, value):
+ return sageattn_qk_int8_pv_fp8_cuda_sm90(query, key, value, tensor_layout="NHD")
+
+
+if DotProductAttention is not None:
+    def set_te_backend(backend):
+        # must be applied before first use of
+        # transformer_engine.pytorch.attention
+        os.environ["NVTE_FLASH_ATTN"] = '0'
+        os.environ["NVTE_FUSED_ATTN"] = '0'
+        os.environ["NVTE_UNFUSED_ATTN"] = '0'
+        if backend == 'flash':
+            os.environ["NVTE_FLASH_ATTN"] = '1'
+        if backend == 'fused':
+            os.environ["NVTE_FUSED_ATTN"] = '1'
+        if backend == 'unfused':
+            os.environ["NVTE_UNFUSED_ATTN"] = '1'
+
+    set_te_backend("fused")
+    te_attn_fn = DotProductAttention(
+        num_attention_heads=num_attention_heads,
+        kv_channels=attention_head_dim,
+        qkv_format="bshd",
+        attn_mask_type="no_mask",
+    )
+else:
+    def te_attn_fn(query, key, value):
+        raise RuntimeError("Transformer Engine is not available. Please install it for TE-based attention.")
+
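+# TE's DotProductAttention with qkv_format="bshd" takes (batch, seq, heads, head_dim) inputs and
+# returns the heads flattened into the last dimension, hence the unflatten below so the output
+# matches the layout of the other backends.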
+def _attention_te(query, key, value):
+ out = te_attn_fn(query, key, value)
+ out = out.unflatten(2, (num_attention_heads, attention_head_dim))
+ return out
+
+
+# Cannot fullgraph compile TE
+_compiled_te_attn_fn_default = torch.compile(_attention_te, mode="default", fullgraph=False, dynamic=False)
+def _attention_te_compile_default(query, key, value):
+ return _compiled_te_attn_fn_default(query, key, value)
+
+
+# Cannot fullgraph compile TE
+_compiled_te_attn_fn_max_autotune = torch.compile(_attention_te, mode="max-autotune", fullgraph=False, dynamic=False)
+def _attention_te_compile_max_autotune(query, key, value):
+ return _compiled_te_attn_fn_max_autotune(query, key, value)
+
+
+def _attention_xformers(query, key, value):
+ return xops.memory_efficient_attention(query, key, value)
+
+
+_compiled_xformers_default = torch.compile(_attention_xformers, mode="default", fullgraph=True, dynamic=False)
+def _attention_xformers_compile_default(query, key, value):
+ return _compiled_xformers_default(query, key, value)
+
+
+_compiled_xformers_max_autotune = torch.compile(_attention_xformers, mode="max-autotune", fullgraph=True, dynamic=False)
+def _attention_xformers_compile_max_autotune(query, key, value):
+ return _compiled_xformers_max_autotune(query, key, value)
+
+
+attention_ops = {}
+attention_ops["torch_cudnn"] = functools.partial(_attention_torch, backend=torch.nn.attention.SDPBackend.CUDNN_ATTENTION)
+attention_ops["torch_cudnn_compile_d"] = functools.partial(_attention_torch_compile_default, backend=torch.nn.attention.SDPBackend.CUDNN_ATTENTION)
+attention_ops["torch_cudnn_compile_ma"] = functools.partial(_attention_torch_compile_max_autotune, backend=torch.nn.attention.SDPBackend.CUDNN_ATTENTION)
+attention_ops["torch_flash"] = functools.partial(_attention_torch, backend=torch.nn.attention.SDPBackend.FLASH_ATTENTION)
+attention_ops["torch_flash_compile_d"] = functools.partial(_attention_torch_compile_default, backend=torch.nn.attention.SDPBackend.FLASH_ATTENTION)
+attention_ops["torch_flash_compile_ma"] = functools.partial(_attention_torch_compile_max_autotune, backend=torch.nn.attention.SDPBackend.FLASH_ATTENTION)
+if hf_kernels_flash_attn is not None:
+ attention_ops["hf_flash_attn"] = _attention_hf_kernels_flash_attn
+ attention_ops["hf_flash_attn3"] = _attention_hf_kernels_flash_attn3
+if flash_attn_func is not None:
+ attention_ops["flash_attn_2"] = _attention_flash_attn_2
+ attention_ops["flash_attn_2_compile_d"] = _attention_flash_attn_2_compile_default
+ attention_ops["flash_attn_2_compile_ma"] = _attention_flash_attn_2_compile_max_autotune
+if flash_attn_3_func is not None:
+ attention_ops["flash_attn_3"] = _attention_flash_attn_3
+ attention_ops["flash_attn_3_compile_d"] = _attention_flash_attn_3_compile_default
+ attention_ops["flash_attn_3_compile_ma"] = _attention_flash_attn_3_compile_max_autotune
+if sageattn_qk_int8_pv_fp16_cuda is not None:
+    attention_ops["sageattn_qk_int8_pv_fp16_cuda"] = _attention_sageattn_qk_int8_pv_fp16_cuda
+    attention_ops["sageattn_qk_int8_pv_fp16_triton"] = _attention_sageattn_qk_int8_pv_fp16_triton
+    if torch.cuda.get_device_capability()[0] >= 9:
+        attention_ops["sageattn_qk_int8_pv_fp8_cuda_sm90"] = _attention_sageattn_qk_int8_pv_fp8_cuda_sm90
+if DotProductAttention is not None:
+ attention_ops["te_fused"] = _attention_te
+ attention_ops["te_fused_compile_d"] = _attention_te_compile_default
+ attention_ops["te_fused_compile_ma"] = _attention_te_compile_max_autotune
+if xops is not None:
+ attention_ops["xformers"] = _attention_xformers
+ attention_ops["xformers_compile_d"] = _attention_xformers_compile_default
+ attention_ops["xformers_compile_ma"] = _attention_xformers_compile_max_autotune
+
+
+def get_color_and_linestyle(n: int) -> list[tuple[str, str]]:
+    colors = ["#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#a65628", "#f781bf", "#999999"]
+    line_styles = ["-", ":", "-.", "--"]
+    if n > len(colors) * len(line_styles):
+        raise ValueError(f"Required {n=} styles but maximum is {len(colors) * len(line_styles)}")
+    styles = []
+    for i in range(n):
+        color = colors[i % len(colors)]
+        linestyle = line_styles[i // len(colors)]
+        styles.append((color, linestyle))
+    return styles
+
+
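+# Correctness check: the reference output is computed in float32 with the MATH SDPA backend,
+# then the inputs are cast to bfloat16 for every benchmarked op, so absmax/MAE/MSE reflect each
+# kernel's numerical error against a higher-precision baseline.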
+def correctness():
+    for seq_len in sequence_lengths:
+        shape = (batch_size, seq_len, num_attention_heads, attention_head_dim)
+        print(f"\n\n===== Testing shape: {shape} =====")
+
+        query = torch.randn(shape, device="cuda", dtype=torch.float32)
+        key = torch.randn(shape, device="cuda", dtype=torch.float32)
+        value = torch.randn(shape, device="cuda", dtype=torch.float32)
+
+        golden_truth = _attention_torch(query, key, value, backend=torch.nn.attention.SDPBackend.MATH)
+        query, key, value = (x.bfloat16() for x in (query, key, value))
+
+        for name, fn in attention_ops.items():
+            out = fn(query, key, value)
+            absdiff = (out - golden_truth).abs()
+            absmax = torch.max(absdiff)
+            mae = torch.mean(absdiff)
+            mse = torch.mean((golden_truth - out) ** 2)
+            print(f"{name:<30}: absmax={absmax:.6f}, mae={mae:.6f}, mse={mse:.6f}")
+
+
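+# Benchmark harness: triton.testing.perf_report sweeps seq_len over sequence_lengths for every
+# registered provider, and do_bench with quantiles=[0.5, 0.2, 0.8] reports the median, 20th, and
+# 80th percentile runtimes in milliseconds for each (provider, seq_len) pair.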
+@triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["seq_len"],
+ x_vals=sequence_lengths,
+ x_log=False,
+ line_arg="provider",
+ line_vals=list(attention_ops.keys()),
+ line_names=list(attention_ops.keys()),
+ ylabel="Time (ms)",
+ styles=get_color_and_linestyle(len(attention_ops)),
+ plot_name="Attention Benchmark",
+ args={},
+ )
+)
+def benchmark_fn(seq_len: int, provider: str):
+ torch.manual_seed(0)
+
+ shape = (batch_size, seq_len, num_attention_heads, attention_head_dim)
+ query = torch.randn(shape, device="cuda", dtype=torch.bfloat16) * torch.randint(1, 5, shape, device="cuda", dtype=torch.bfloat16)
+ key = torch.randn(shape, device="cuda", dtype=torch.bfloat16) * torch.randint(1, 5, shape, device="cuda", dtype=torch.bfloat16)
+ value = torch.randn(shape, device="cuda", dtype=torch.bfloat16) * torch.randint(1, 5, shape, device="cuda", dtype=torch.bfloat16)
+
+ fn = attention_ops[provider]
+ ms, min_ms, max_ms = triton.testing.do_bench(
+ lambda: fn(query, key, value),
+ warmup=3,
+ rep=10,
+ quantiles=[0.5, 0.2, 0.8],
+ )
+ return ms, max_ms, min_ms
+
+
+with torch.inference_mode():
+ correctness()
+ benchmark_fn.run(print_data=True, save_path=output_dir.as_posix())
+
+
+Flash Attention 2 not found.
+Flash Attention 3 not found.
+SageAttention not found.
+Transformer Engine not found.
+xFormers not found.
+
+
+===== Testing shape: (1, 4224, 24, 128) =====
+torch_cudnn : absmax=0.001547, mae=0.000075, mse=0.000000
+
+
+
+UV Install Logs
+
+Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
+Fetching 20 files: 5%|▌ | 1/20 [00:00<00:08, 2.21it/s]
+Fetching 20 files: 10%|█ | 2/20 [00:02<00:21, 1.17s/it]
+Fetching 20 files: 100%|██████████| 20/20 [00:02<00:00, 9.41it/s]
+
+Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
+Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 5.28it/s]
+Fetching 4 files: 50%|█████ | 2/4 [00:02<00:02, 1.15s/it]
+Fetching 4 files: 100%|██████████| 4/4 [00:02<00:00, 1.99it/s]
+/tmp/tmpyw1le_3d/cuda_utils.c:5:10: fatal error: Python.h: No such file or directory
+ 5 | #include <Python.h>
+ | ^~~~~~~~~~
+compilation terminated.
+Traceback (most recent call last):
+ File "/repo/flash_attn/.uvnote/cells/benchmark.py", line 340, in <module>
+ correctness()
+ File "/repo/flash_attn/.uvnote/cells/benchmark.py", line 299, in correctness
+ out = fn(query, key, value)
+ ^^^^^^^^^^^^^^^^^^^^^
+ File "/repo/flash_attn/.uvnote/cells/benchmark.py", line 114, in _attention_torch_compile_default
+ return _compiled_attention_torch_default(query, key, value, backend=backend)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 749, in compile_wrapper
+ raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/compile_fx.py", line 923, in _compile_fx_inner
+ raise InductorError(e, currentframe()).with_traceback(
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/compile_fx.py", line 907, in _compile_fx_inner
+ mb_compiled_graph = fx_codegen_and_compile(
+ ^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/compile_fx.py", line 1578, in fx_codegen_and_compile
+ return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/compile_fx.py", line 1456, in codegen_and_compile
+ compiled_module = graph.compile_to_module()
+ ^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/graph.py", line 2293, in compile_to_module
+ return self._compile_to_module()
+ ^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/graph.py", line 2299, in _compile_to_module
+ self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
+ ^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/graph.py", line 2238, in codegen
+ self.scheduler.codegen()
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/scheduler.py", line 4598, in codegen
+ else self._codegen(self.nodes)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/scheduler.py", line 4750, in _codegen
+ self.get_backend(device).codegen_node(node)
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/codegen/cuda_combined_scheduling.py", line 107, in codegen_node
+ return self._triton_scheduling.codegen_node(node)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/codegen/simd.py", line 1371, in codegen_node
+ return self.codegen_node_schedule(
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/codegen/simd.py", line 1424, in codegen_node_schedule
+ src_code = kernel.codegen_kernel()
+ ^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/codegen/triton.py", line 3677, in codegen_kernel
+ **self.inductor_meta_common(),
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/codegen/triton.py", line 3501, in inductor_meta_common
+ "backend_hash": torch.utils._triton.triton_hash_with_backend(),
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/utils/_triton.py", line 165, in triton_hash_with_backend
+ backend = triton_backend()
+ ^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/utils/_triton.py", line 157, in triton_backend
+ target = driver.active.get_current_target()
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/runtime/driver.py", line 30, in __getattr__
+ return getattr(self._initialize_obj(), name)
+ ^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/runtime/driver.py", line 26, in _initialize_obj
+ self._obj = self._init_fn()
+ ^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/runtime/driver.py", line 12, in _create_driver
+ return active_drivers[0]()
+ ^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/backends/nvidia/driver.py", line 715, in __init__
+ self.utils = CudaUtils() # TODO: make static
+ ^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/backends/nvidia/driver.py", line 62, in __init__
+ mod = compile_module_from_src(
+ ^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/runtime/build.py", line 88, in compile_module_from_src
+ so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [])
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/runtime/build.py", line 51, in _build
+ subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL)
+ File "/usr/lib/python3.11/subprocess.py", line 413, in check_call
+ raise CalledProcessError(retcode, cmd)
+torch._inductor.exc.InductorError: CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmpyw1le_3d/cuda_utils.c', '-O3', '-shared', '-fPIC', '-Wno-psabi', '-o', '/tmp/tmpyw1le_3d/cuda_utils.cpython-311-x86_64-linux-gnu.so', '-lcuda', '-L/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/backends/nvidia/lib', '-L/usr/lib/x86_64-linux-gnu', '-I/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/backends/nvidia/include', '-I/tmp/tmpyw1le_3d', '-I/usr/include/python3.11']' returned non-zero exit status 1.
+
+Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
+