# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels",
#     "kernels-benchmark-tools",
#     "sageattention",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch

from kernels import get_kernel
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark

# Load the SageAttention kernel
hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention")


def sage_attention(query, key, value):
    """SageAttention with INT8 Q/K quantization and FP16 P/V."""
    return hf_kernels_sage_attn.fwd(query, key, value, is_causal=False)[0]


run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_name="sage_int8_fp16",
    impl_tags={"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"},
    impl_func=sage_attention,
)
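

# Optional sanity check (a minimal sketch, not invoked by the benchmark):
# compares the kernel output against PyTorch's scaled_dot_product_attention on a
# small random CUDA batch. The (batch, heads, seq_len, head_dim) layout and the
# fp16 dtype are assumptions about the fwd() wrapper, not documented guarantees;
# adjust the shapes if the kernel expects (batch, seq_len, heads, head_dim).
def _sanity_check():
    q = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.float16)
    k = torch.randn_like(q)
    v = torch.randn_like(q)
    ref = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=False)
    out = sage_attention(q, k, v)
    # INT8 quantization of Q/K introduces some error, so expect a small nonzero diff.
    print("max abs diff vs. SDPA:", (out - ref).abs().max().item())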