File size: 1,199 Bytes
e8e4be6 1c22380 e8e4be6 1c22380 e8e4be6 1c22380 e8e4be6 1c22380 e8e4be6 1c22380 e8e4be6 1c22380 e8e4be6 1c22380 e8e4be6 1c22380 e8e4be6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Load the rotary kernel once at import time; `hf_kernels_rotary` below calls
# `rotary.apply_rotary` on this module-level object.
# NOTE(review): presumably resolved from the "kernels-community/rotary" repo on
# the Hugging Face Hub — confirm network/cache requirements before running.
rotary = get_kernel("kernels-community/rotary")
def hf_kernels_rotary(query, key, cos, sin, conj=False):
    """Run the loaded `rotary` kernel on copies of `query` and `key`.

    The chunk width is read from the last dimension of `cos`. For each
    tensor, the first two consecutive chunks of that width along the last
    dimension are handed to `rotary.apply_rotary`; the same two views are
    also passed as the call's fifth and sixth arguments. Elements past
    `2 * cos.shape[-1]` in the last dimension are never passed to the kernel.

    Args:
        query: Tensor whose clone is passed (as views) to the kernel.
        key: Tensor whose clone is passed (as views) to the kernel.
        cos: Forwarded to the kernel as-is; its last dimension sets the
            chunk width.
        sin: Forwarded to the kernel as-is.
        conj: Forwarded to the kernel as-is.
            NOTE(review): presumably selects the conjugate (inverse)
            rotation — confirm against the kernel's documentation.

    Returns:
        A tuple `(q_out, k_out)` holding the two clones after the kernel
        calls.
    """
    half_width = cos.shape[-1]

    # Work on copies so the caller's tensors are not the ones handed to the
    # kernel.
    q_out, k_out = query.clone(), key.clone()

    # Query first, then key — same call order for both tensors.
    for target in (q_out, k_out):
        first_half = target[..., :half_width]
        second_half = target[..., half_width : 2 * half_width]
        # The views are supplied both as the leading arguments and as the
        # fifth/sixth arguments.
        # NOTE(review): presumably (x1, x2, cos, sin, out1, out2, conj), i.e.
        # results are written back into the clone — confirm.
        rotary.apply_rotary(
            first_half, second_half, cos, sin, first_half, second_half, conj
        )

    return q_out, k_out
# Hand the wrapper above to the shared benchmark harness. This call sits at
# module level, so executing the script starts the benchmark immediately.
# The implementation is registered under the name "hf_kernels_rotary" with
# its family/backend tags, using the "float32" dtype setting.
run_benchmark(
    kernel_type=KernelTypeEnum.ROTARY,
    impl_name="hf_kernels_rotary",
    impl_tags={"family": "hf-kernels", "backend": "cuda"},
    impl_func=hf_kernels_rotary,
    dtype="float32",
)