drbh committed
Commit 30c62e2 · Parent: 08478da

fix: remove debug build
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. flash_attn/artifacts/benchmark/Attention Benchmark.csv +0 -7
  2. flash_attn/artifacts/benchmark/Attention Benchmark.png +0 -3
  3. flash_attn/artifacts/benchmark/results.html +0 -3
  4. flash_attn/benchmark.html +0 -0
  5. flash_attn/cells/benchmark.py +0 -343
  6. flash_attn/cells/nv.py +0 -3
  7. flash_attn/impls/artifacts/benchmark/attn.jsonl +0 -6
  8. flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl +0 -6
  9. flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl +0 -6
  10. flash_attn/impls/cells/benchmark.py +0 -71
  11. flash_attn/impls/cells/benchmark_default.py +0 -70
  12. flash_attn/impls/cells/benchmark_max_autotune.py +0 -70
  13. flash_attn/impls/cells/nv.py +0 -3
  14. flash_attn/impls/compiled_variants.html +0 -0
  15. flash_attn/impls/flash_attention.html +0 -0
  16. flash_attn/impls/hf_kernels_flash_attn.html +0 -0
  17. flash_attn/impls/hf_kernels_flash_attn3.html +0 -0
  18. flash_attn/impls/index.html +0 -94
  19. flash_attn/impls/mem_efficient_attention.html +0 -0
  20. flash_attn/impls/sage_attention.html +0 -0
  21. flash_attn/impls/xformers.html +0 -0
  22. flash_attn/index.html +0 -89
  23. flash_attn/results/artifacts/combine/latency.csv +0 -43
  24. flash_attn/results/artifacts/combine/latency.png +0 -3
  25. flash_attn/results/artifacts/combine/latency.svg +0 -3
  26. flash_attn/results/cells/combine.py +0 -319
  27. flash_attn/results/combined_results.html +0 -0
  28. flash_attn/results/index.html +0 -88
  29. index.html +0 -85
  30. megablocks/cells/forward_and_backward.py +0 -196
  31. megablocks/cells/forward_and_backward_no_kernel.py +0 -196
  32. megablocks/cells/forward_only.py +0 -101
  33. megablocks/cells/no_kernels.py +0 -98
  34. megablocks/cells/nv.py +0 -3
  35. megablocks/index.html +0 -24
  36. megablocks/megablocks_only.html +0 -0
  37. megablocks_yamoe/artifacts/binned_run/binned_results.json +0 -24
  38. megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json +0 -24
  39. megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json +0 -24
  40. megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json +0 -24
  41. megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc +0 -0
  42. megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc +0 -0
  43. megablocks_yamoe/cells/bench_utils.py +0 -241
  44. megablocks_yamoe/cells/binned_run.py +0 -195
  45. megablocks_yamoe/cells/config.py +0 -27
  46. megablocks_yamoe/cells/gptoss_run.py +0 -147
  47. megablocks_yamoe/cells/gptoss_training_run.py +0 -138
  48. megablocks_yamoe/cells/megablocks_run.py +0 -103
  49. megablocks_yamoe/cells/nv.py +0 -3
  50. megablocks_yamoe/cells/save_data.py +0 -42
flash_attn/artifacts/benchmark/Attention Benchmark.csv DELETED
@@ -1,7 +0,0 @@
- seq_len,torch_cudnn,torch_cudnn_compile_d,torch_cudnn_compile_ma,torch_flash,torch_flash_compile_d,torch_flash_compile_ma,hf_flash_attn,hf_flash_attn3
- 4224.000000,3.801472,3.790064,4.182320,3.968000,3.957824,4.311152,3.398160,3.330400
- 4352.000000,4.082944,4.082912,4.413488,4.400000,4.391936,4.738048,3.837424,3.758208
- 4416.000000,4.142624,4.135648,4.484160,4.452304,4.446096,4.792480,3.892064,3.864128
- 4480.000000,4.206144,4.198752,4.551808,4.530752,4.522944,4.873760,3.949344,3.870224
- 4544.000000,4.438320,4.433104,4.787584,4.584160,4.576640,4.934304,4.008960,3.974672
- 4608.000000,4.502432,4.495456,4.871872,4.660192,4.651040,5.029792,4.065616,3.984160
flash_attn/artifacts/benchmark/Attention Benchmark.png DELETED

Git LFS Details

  • SHA256: 69a5d2d4ac33fa06e77a599eab6cadcddb77c15ad7bde323bb07849e2aa3ac14
  • Pointer size: 131 Bytes
  • Size of remote file: 142 kB
flash_attn/artifacts/benchmark/results.html DELETED
@@ -1,3 +0,0 @@
- <html><body>
- <image src="Attention Benchmark.png"/>
- </body></html>
flash_attn/benchmark.html DELETED
The diff for this file is too large to render. See raw diff
 
flash_attn/cells/benchmark.py DELETED
@@ -1,343 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "numpy",
4
- # "torch",
5
- # "kernels",
6
- # "pandas",
7
- # "matplotlib"
8
- # ]
9
- # ///
10
- # Benchmarking common shapes for Flux 1024x1024px image + varying text sequence lengths
11
-
12
- import functools
13
- import os
14
- import pathlib
15
-
16
- import matplotlib.pyplot as plt
17
- import torch
18
- import torch._dynamo.config
19
- import triton
20
- import triton.language as tl
21
-
22
- try:
23
- from flash_attn import flash_attn_func
24
- except:
25
- flash_attn_func = None
26
- print("Flash Attention 2 not found.")
27
-
28
- try:
29
- from flash_attn_interface import flash_attn_func as flash_attn_3_func
30
- except:
31
- flash_attn_3_func = None
32
- print("Flash Attention 3 not found.")
33
-
34
- try:
35
- from kernels import get_kernel
36
- hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn")
37
- hf_kernels_flash_attn_3 = get_kernel("kernels-community/flash-attn3")
38
- except:
39
- hf_kernels_flash_attn = None
40
- hf_kernels_flash_attn_3 = None
41
- print("HF Kernels not found.")
42
-
43
- try:
44
- from sageattention import sageattn_qk_int8_pv_fp16_cuda, sageattn_qk_int8_pv_fp16_triton, sageattn_qk_int8_pv_fp8_cuda_sm90
45
- except:
46
- sageattn_qk_int8_pv_fp16_cuda = None
47
- sageattn_qk_int8_pv_fp16_triton = None
48
- sageattn_qk_int8_pv_fp8_cuda_sm90 = None
49
- print("SageAttention not found.")
50
-
51
- try:
52
- from transformer_engine.pytorch.attention import DotProductAttention
53
- except:
54
- DotProductAttention = None
55
- print("Transformer Engine not found.")
56
-
57
- try:
58
- import xformers.ops as xops
59
- except:
60
- xops = None
61
- print("xFormers not found.")
62
-
63
-
64
- plt.rcParams.update({
65
- "figure.figsize": (12, 10),
66
- "figure.dpi": 120,
67
- "font.size": 10,
68
- "axes.titlesize": 12,
69
- "axes.labelsize": 14,
70
- "xtick.labelsize": 10,
71
- "ytick.labelsize": 10,
72
- "legend.fontsize": 8,
73
- "axes.grid": True,
74
- "grid.alpha": 0.3,
75
- "grid.linestyle": "--",
76
- "lines.linewidth": 2.0,
77
- "lines.markersize": 6,
78
- "legend.frameon": True,
79
- "legend.framealpha": 0.9,
80
- "legend.loc": "best",
81
- "axes.spines.top": False,
82
- "axes.spines.right": False,
83
- })
84
-
85
-
86
- # We want to compare the best compiled version for each specific shape (dynamic=False)
87
- torch._dynamo.config.cache_size_limit = 10000
88
-
89
- # We need to suppress_errors for FA3 to work. It makes it run in eager mode.
90
- # I can't seem to get it to work any other way under torch.compile, so any suggestions are welcome!
91
- torch._dynamo.config.suppress_errors = True
92
-
93
- # output_dir = pathlib.Path("dump_attention_benchmark")
94
- # output_dir.mkdir(parents=True, exist_ok=True)
95
-
96
- output_dir = pathlib.Path(".") # output to current directory for upload
97
-
98
- batch_size = 1
99
- num_attention_heads = 24
100
- attention_head_dim = 128
101
- image_sequence_length = 4096 # 1024x1024px
102
- text_sequence_lengths = [128, 256, 320, 384, 448, 512]
103
- sequence_lengths = [image_sequence_length + i for i in text_sequence_lengths]
104
-
105
-
106
- def _attention_torch(query, key, value, *, backend):
107
- query, key, value = (x.transpose(1, 2).contiguous() for x in (query, key, value))
108
- with torch.nn.attention.sdpa_kernel(backend):
109
- out = torch.nn.functional.scaled_dot_product_attention(query, key, value)
110
- out = out.transpose(1, 2).contiguous()
111
- return out
112
-
113
-
114
- _compiled_attention_torch_default = torch.compile(_attention_torch, mode="default", fullgraph=True, dynamic=False)
115
- def _attention_torch_compile_default(query, key, value, *, backend):
116
- return _compiled_attention_torch_default(query, key, value, backend=backend)
117
-
118
-
119
- _compiled_attention_torch_max_autotune = torch.compile(_attention_torch, mode="max-autotune", fullgraph=True, dynamic=False)
120
- def _attention_torch_compile_max_autotune(query, key, value, *, backend):
121
- return _compiled_attention_torch_max_autotune(query, key, value, backend=backend)
122
-
123
-
124
- def _attention_flash_attn_2(query, key, value):
125
- return flash_attn_func(query, key, value)
126
-
127
-
128
- _compiled_flash_attn_2_default = torch.compile(_attention_flash_attn_2, mode="default", fullgraph=True, dynamic=False)
129
- def _attention_flash_attn_2_compile_default(query, key, value):
130
- return _compiled_flash_attn_2_default(query, key, value)
131
-
132
-
133
- _compiled_flash_attn_2_max_autotune = torch.compile(_attention_flash_attn_2, mode="max-autotune", fullgraph=True, dynamic=False)
134
- def _attention_flash_attn_2_compile_max_autotune(query, key, value):
135
- return _compiled_flash_attn_2_max_autotune(query, key, value)
136
-
137
-
138
- # For fullgraph=True tracing to be compatible
139
- @torch.library.custom_op("flash_attn_3::_flash_attn_forward", mutates_args=(), device_types="cuda")
140
- def _wrapped_flash_attn_3(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
141
- out, lse = flash_attn_3_func(query, key, value)
142
- return out
143
-
144
-
145
- @torch.library.register_fake("flash_attn_3::_flash_attn_forward")
146
- def _(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
147
- return torch.empty_like(query)
148
-
149
-
150
- def _attention_flash_attn_3(query, key, value):
151
- out = _wrapped_flash_attn_3(query, key, value)
152
- return out
153
-
154
-
155
- _compiled_flash_attn_3_default = torch.compile(_attention_flash_attn_3, mode="default", fullgraph=True, dynamic=False)
156
- def _attention_flash_attn_3_compile_default(query, key, value):
157
- return _compiled_flash_attn_3_default(query, key, value)
158
-
159
-
160
- _compiled_flash_attn_3_max_autotune = torch.compile(_attention_flash_attn_3, mode="max-autotune", fullgraph=True, dynamic=False)
161
- def _attention_flash_attn_3_compile_max_autotune(query, key, value):
162
- return _compiled_flash_attn_3_max_autotune(query, key, value)
163
-
164
-
165
- def _attention_hf_kernels_flash_attn(query, key, value):
166
- return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0]
167
-
168
-
169
- def _attention_hf_kernels_flash_attn3(query, key, value):
170
- return hf_kernels_flash_attn_3.flash_attn_func(query, key, value, causal=False)[0]
171
-
172
-
173
- def _attention_sageattn_qk_int8_pv_fp16_cuda(query, key, value):
174
- return sageattn_qk_int8_pv_fp16_cuda(query, key, value, tensor_layout="NHD")
175
-
176
-
177
- def _attention_sageattn_qk_int8_pv_fp16_triton(query, key, value):
178
- return sageattn_qk_int8_pv_fp16_triton(query, key, value, tensor_layout="NHD")
179
-
180
-
181
- def _attention_sageattn_qk_int8_pv_fp8_cuda_sm90(query, key, value):
182
- return sageattn_qk_int8_pv_fp8_cuda_sm90(query, key, value, tensor_layout="NHD")
183
-
184
-
185
- if DotProductAttention is not None:
186
- def set_te_backend(backend):
187
- # must be applied before first use of
188
- # transformer_engine.pytorch.attention
189
- os.environ["NVTE_FLASH_ATTN"] = '0'
190
- os.environ["NVTE_FUSED_ATTN"] = '0'
191
- os.environ["NVTE_UNFUSED_ATTN"] = '0'
192
- if backend == 'flash':
193
- os.environ["NVTE_FLASH_ATTN"] = '1'
194
- if backend == 'fused':
195
- os.environ["NVTE_FUSED_ATTN"] = '1'
196
- if backend == 'unfused':
197
- os.environ["NVTE_UNFUSED_ATTN"] = '1'
198
-
199
- set_te_backend("fused")
200
- te_attn_fn = DotProductAttention(
201
- num_attention_heads=num_attention_heads,
202
- kv_channels=attention_head_dim,
203
- qkv_format="bshd",
204
- attn_mask_type="no_mask",
205
- )
206
- else:
207
- def te_attn_fn(query, key, value):
208
- raise RuntimeError("Transformer Engine is not available. Please install it for TE-based attention.")
209
-
210
- def _attention_te(query, key, value):
211
- out = te_attn_fn(query, key, value)
212
- out = out.unflatten(2, (num_attention_heads, attention_head_dim))
213
- return out
214
-
215
-
216
- # Cannot fullgraph compile TE
217
- _compiled_te_attn_fn_default = torch.compile(_attention_te, mode="default", fullgraph=False, dynamic=False)
218
- def _attention_te_compile_default(query, key, value):
219
- return _compiled_te_attn_fn_default(query, key, value)
220
-
221
-
222
- # Cannot fullgraph compile TE
223
- _compiled_te_attn_fn_max_autotune = torch.compile(_attention_te, mode="max-autotune", fullgraph=False, dynamic=False)
224
- def _attention_te_compile_max_autotune(query, key, value):
225
- return _compiled_te_attn_fn_max_autotune(query, key, value)
226
-
227
-
228
- def _attention_xformers(query, key, value):
229
- return xops.memory_efficient_attention(query, key, value)
230
-
231
-
232
- _compiled_xformers_default = torch.compile(_attention_xformers, mode="default", fullgraph=True, dynamic=False)
233
- def _attention_xformers_compile_default(query, key, value):
234
- return _compiled_xformers_default(query, key, value)
235
-
236
-
237
- _compiled_xformers_max_autotune = torch.compile(_attention_xformers, mode="max-autotune", fullgraph=True, dynamic=False)
238
- def _attention_xformers_compile_max_autotune(query, key, value):
239
- return _compiled_xformers_max_autotune(query, key, value)
240
-
241
-
242
- attention_ops = {}
243
- attention_ops["torch_cudnn"] = functools.partial(_attention_torch, backend=torch.nn.attention.SDPBackend.CUDNN_ATTENTION)
244
- attention_ops["torch_cudnn_compile_d"] = functools.partial(_attention_torch_compile_default, backend=torch.nn.attention.SDPBackend.CUDNN_ATTENTION)
245
- attention_ops["torch_cudnn_compile_ma"] = functools.partial(_attention_torch_compile_max_autotune, backend=torch.nn.attention.SDPBackend.CUDNN_ATTENTION)
246
- attention_ops["torch_flash"] = functools.partial(_attention_torch, backend=torch.nn.attention.SDPBackend.FLASH_ATTENTION)
247
- attention_ops["torch_flash_compile_d"] = functools.partial(_attention_torch_compile_default, backend=torch.nn.attention.SDPBackend.FLASH_ATTENTION)
248
- attention_ops["torch_flash_compile_ma"] = functools.partial(_attention_torch_compile_max_autotune, backend=torch.nn.attention.SDPBackend.FLASH_ATTENTION)
249
- if hf_kernels_flash_attn is not None:
250
- attention_ops["hf_flash_attn"] = _attention_hf_kernels_flash_attn
251
- attention_ops["hf_flash_attn3"] = _attention_hf_kernels_flash_attn3
252
- if flash_attn_func is not None:
253
- attention_ops["flash_attn_2"] = _attention_flash_attn_2
254
- attention_ops["flash_attn_2_compile_d"] = _attention_flash_attn_2_compile_default
255
- attention_ops["flash_attn_2_compile_ma"] = _attention_flash_attn_2_compile_max_autotune
256
- if flash_attn_3_func is not None:
257
- attention_ops["flash_attn_3"] = _attention_flash_attn_3
258
- attention_ops["flash_attn_3_compile_d"] = _attention_flash_attn_3_compile_default
259
- attention_ops["flash_attn_3_compile_ma"] = _attention_flash_attn_3_compile_max_autotune
260
- if sageattn_qk_int8_pv_fp16_cuda is not None:
261
- attention_ops["sageattn_qk_int8_pv_fp16_cuda"] = _attention_sageattn_qk_int8_pv_fp16_cuda
262
- attention_ops["sageattn_qk_int8_pv_fp16_triton"] = _attention_sageattn_qk_int8_pv_fp16_triton
263
- if torch.cuda.get_device_capability()[0] >= 9:
264
- attention_ops["sageattn_qk_int8_pv_fp8_cuda_sm90"] = _attention_sageattn_qk_int8_pv_fp8_cuda_sm90
265
- if DotProductAttention is not None:
266
- attention_ops["te_fused"] = _attention_te
267
- attention_ops["te_fused_compile_d"] = _attention_te_compile_default
268
- attention_ops["te_fused_compile_ma"] = _attention_te_compile_max_autotune
269
- if xops is not None:
270
- attention_ops["xformers"] = _attention_xformers
271
- attention_ops["xformers_compile_d"] = _attention_xformers_compile_default
272
- attention_ops["xformers_compile_ma"] = _attention_xformers_compile_max_autotune
273
-
274
-
275
- def get_color_and_linestyle(n: int) -> tuple[str, str]:
276
- colors = ["#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#a65628", "#f781bf", "#999999"]
277
- line_styles = ["-", ":", "-.", "--"]
278
- if n > len(colors) * len(line_styles):
279
- raise ValueError(f"Required {n=} styles but maximum is {len(colors) * len(line_styles)}")
280
- styles = []
281
- for i in range(n):
282
- color = colors[i % len(colors)]
283
- linestyle = line_styles[i // len(colors)]
284
- styles.append((color, linestyle))
285
- return styles
286
-
287
-
288
- def correctness():
289
- for seq_len in sequence_lengths:
290
- shape = (batch_size, seq_len, num_attention_heads, attention_head_dim)
291
- print(f"\n\n===== Testing shape: {shape} =====")
292
-
293
- query = torch.randn(shape, device="cuda", dtype=torch.float32)
294
- key = torch.randn(shape, device="cuda", dtype=torch.float32)
295
- value = torch.randn(shape, device="cuda", dtype=torch.float32)
296
-
297
- golden_truth = _attention_torch(query, key, value, backend=torch.nn.attention.SDPBackend.MATH)
298
- query, key, value = (x.bfloat16() for x in (query, key, value))
299
-
300
- for name, fn in attention_ops.items():
301
- out = fn(query, key, value)
302
- absdiff = (out - golden_truth).abs()
303
- absmax = torch.max(absdiff)
304
- mae = torch.mean(absdiff)
305
- mse = torch.mean((golden_truth - out) ** 2)
306
- print(f"{name:<30}: absmax={absmax:.6f}, mae={mae:.6f}, mse={mse:.6f}")
307
-
308
-
309
- @triton.testing.perf_report(
310
- triton.testing.Benchmark(
311
- x_names=["seq_len"],
312
- x_vals=sequence_lengths,
313
- x_log=False,
314
- line_arg="provider",
315
- line_vals=list(attention_ops.keys()),
316
- line_names=[x.removeprefix("solution_") for x in attention_ops.keys()],
317
- ylabel="Time (ms)",
318
- styles=get_color_and_linestyle(len(attention_ops)),
319
- plot_name="Attention Benchmark",
320
- args={},
321
- )
322
- )
323
- def benchmark_fn(seq_len: int, provider: str):
324
- torch.manual_seed(0)
325
-
326
- shape = (batch_size, seq_len, num_attention_heads, attention_head_dim)
327
- query = torch.randn(shape, device="cuda", dtype=torch.bfloat16) * torch.randint(1, 5, shape, device="cuda", dtype=torch.bfloat16)
328
- key = torch.randn(shape, device="cuda", dtype=torch.bfloat16) * torch.randint(1, 5, shape, device="cuda", dtype=torch.bfloat16)
329
- value = torch.randn(shape, device="cuda", dtype=torch.bfloat16) * torch.randint(1, 5, shape, device="cuda", dtype=torch.bfloat16)
330
-
331
- fn = attention_ops[provider]
332
- ms, min_ms, max_ms = triton.testing.do_bench(
333
- lambda: fn(query, key, value),
334
- warmup=3,
335
- rep=10,
336
- quantiles=[0.5, 0.2, 0.8],
337
- )
338
- return ms, max_ms, min_ms
339
-
340
-
341
- with torch.inference_mode():
342
- correctness()
343
- fig = benchmark_fn.run(print_data=True, save_path=output_dir.as_posix())
flash_attn/cells/nv.py DELETED
@@ -1,3 +0,0 @@
- import subprocess
-
- print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
flash_attn/impls/artifacts/benchmark/attn.jsonl DELETED
@@ -1,6 +0,0 @@
- {"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.3603839874267578, "p50": 0.361952006816864, "p90": 0.3624640107154846, "mean": 0.3619711995124817, "reps": 5, "warmup": 2}, "compile_ms": 1.5701119899749756, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.3892799913883209, "p50": 0.3909760117530823, "p90": 0.3922559916973114, "mean": 0.3912447988986969, "reps": 5, "warmup": 2}, "compile_ms": 0.35811200737953186, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5240640044212341, "p50": 0.5248960256576538, "p90": 0.5248960256576538, "mean": 0.5258048176765442, "reps": 5, "warmup": 2}, "compile_ms": 0.4891839921474457, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5265600085258484, "p50": 0.5277760028839111, "p90": 0.5282559990882874, "mean": 0.5276032090187073, "reps": 5, "warmup": 2}, "compile_ms": 0.4968000054359436, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5639039874076843, "p50": 0.5657920241355896, "p90": 0.5668479800224304, "mean": 0.5656383991241455, "reps": 5, "warmup": 2}, "compile_ms": 0.5312319993972778, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:59:35Z", "run": "8bc1bbc1e0504355abbb1f58e69828d3", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5689600110054016, "p50": 0.5698239803314209, "p90": 0.5713919997215271, "mean": 0.5789952039718628, "reps": 5, "warmup": 2}, "compile_ms": 0.5350080132484436, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl DELETED
@@ -1,6 +0,0 @@
- {"ts": "2025-10-02T19:58:18Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5141760110855103, "p50": 0.5175679922103882, "p90": 0.5197759866714478, "mean": 0.5181439876556396, "reps": 5, "warmup": 2}, "compile_ms": 3084.621826171875, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:58:19Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5549119710922241, "p50": 0.5582720041275024, "p90": 0.5598080158233643, "mean": 0.5579584002494812, "reps": 5, "warmup": 2}, "compile_ms": 270.21795654296875, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:58:19Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6853119730949402, "p50": 0.687391996383667, "p90": 0.6883519887924194, "mean": 0.6872959971427918, "reps": 5, "warmup": 2}, "compile_ms": 269.78741455078125, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:58:19Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7128639817237854, "p50": 0.7160959839820862, "p90": 0.7167680263519287, "mean": 0.716153597831726, "reps": 5, "warmup": 2}, "compile_ms": 269.8607177734375, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:58:19Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7386879920959473, "p50": 0.7400959730148315, "p90": 0.7415040135383606, "mean": 0.7418303966522217, "reps": 5, "warmup": 2}, "compile_ms": 269.20501708984375, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:58:20Z", "run": "9ebc449a917f4f2196503654e5549239", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7708160281181335, "p50": 0.7740799784660339, "p90": 0.7753919959068298, "mean": 0.7745471954345703, "reps": 5, "warmup": 2}, "compile_ms": 270.93829345703125, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl DELETED
@@ -1,6 +0,0 @@
- {"ts": "2025-10-02T19:57:25Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6144000291824341, "p50": 0.6245759725570679, "p90": 0.6483200192451477, "mean": 0.6468096017837525, "reps": 5, "warmup": 2}, "compile_ms": 4407.3388671875, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:57:27Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6689280271530151, "p50": 0.6851199865341187, "p90": 0.7184960246086121, "mean": 0.7060160160064697, "reps": 5, "warmup": 2}, "compile_ms": 1686.2735595703125, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:57:29Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7953600287437439, "p50": 0.8155840039253235, "p90": 0.8403519988059998, "mean": 0.8332608103752136, "reps": 5, "warmup": 2}, "compile_ms": 1462.938232421875, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:57:31Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8470720052719116, "p50": 0.849727988243103, "p90": 0.8745279908180237, "mean": 0.8719295978546142, "reps": 5, "warmup": 2}, "compile_ms": 1689.3455810546875, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:57:33Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8677120208740234, "p50": 0.8835520148277283, "p90": 0.9034240245819092, "mean": 0.9034304022789001, "reps": 5, "warmup": 2}, "compile_ms": 1693.035888671875, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
- {"ts": "2025-10-02T19:57:34Z", "run": "edb73be653834cdf8524ee78b403db7f", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9154239892959595, "p50": 0.9213759899139404, "p90": 0.9359679818153381, "mean": 0.9387519836425782, "reps": 5, "warmup": 2}, "compile_ms": 1689.36279296875, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py DELETED
@@ -1,71 +0,0 @@
- # /// script
- # requires-python = ">=3.10"
- # dependencies = [
- # "numpy",
- # "torch",
- # "kernels-benchmark-tools",
- # "kernels",
- # ]
- #
- # [tool.uv.sources]
- # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
- # ///
- import torch
- import sys
- import os
- import kernels_benchmark_tools as kbt
- from kernels import get_kernel
-
- hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
-
-
- def hf_flash_attention3(query, key, value):
-     return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
-
-
- kbt.add(
-     "hf_kernels_flash_attn3",
-     hf_flash_attention3,
-     tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
- )
-
- if __name__ == "__main__":
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-
-     if device == "cpu":
-         print("HF Kernels Flash Attention 3 requires CUDA - skipping benchmark")
-         sys.exit(0)
-
-     dtype = "bfloat16"
-
-     # Flux-like workloads
-     base = 1024
-     flux_sizes = [128, 256, 320, 384, 448, 512]
-     heads = 24
-     head_dim = 128
-
-     wl = []
-     for L in flux_sizes:
-         wl.append(
-             {
-                 "name": f"flux_L{L}",
-                 "batch": 1,
-                 "seq_len": base + L,
-                 "heads": heads,
-                 "head_dim": head_dim,
-                 "dtype": dtype,
-                 "device": device,
-                 "seed": 0,
-             }
-         )
-
-     kbt.run(
-         wl,
-         jsonl="attn.jsonl",
-         reps=5,
-         warmup=2,
-         gen=kbt.attn.gen_qkv,
-         ref=kbt.attn.ref_math,
-         cmp=kbt.attn.cmp_allclose,
-     )
-     kbt.summarize(["attn.jsonl"])
flash_attn/impls/cells/benchmark_default.py DELETED
@@ -1,70 +0,0 @@
- # /// script
- # requires-python = ">=3.10"
- # dependencies = [
- # "numpy",
- # "torch",
- # "kernels-benchmark-tools",
- # ]
- #
- # [tool.uv.sources]
- # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
- # ///
- import torch
- import sys
- import os
- import kernels_benchmark_tools as kbt
-
-
- def torch_flash_base(q, k, v):
-     qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
-     with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
-         o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
-     return o.transpose(1, 2).contiguous()
-
-
- # Compile with default mode
- compiled_flash_default = torch.compile(torch_flash_base, mode="default", fullgraph=True, dynamic=False)
-
- kbt.add(
-     "torch_flash_compiled_default",
-     compiled_flash_default,
-     tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "default"},
- )
-
- if __name__ == "__main__":
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-     dtype = "float32" if device == "cpu" else "bfloat16"
-
-     # Flux-like workloads
-     base = 1024 if device == "cuda" else 512
-     flux_sizes = (
-         [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
-     )
-     heads = 24 if device == "cuda" else 8
-     head_dim = 128 if device == "cuda" else 64
-
-     wl = []
-     for L in flux_sizes:
-         wl.append(
-             {
-                 "name": f"flux_L{L}",
-                 "batch": 1,
-                 "seq_len": base + L,
-                 "heads": heads,
-                 "head_dim": head_dim,
-                 "dtype": dtype,
-                 "device": device,
-                 "seed": 0,
-             }
-         )
-
-     kbt.run(
-         wl,
-         jsonl="attn_default.jsonl",
-         reps=5,
-         warmup=2,
-         gen=kbt.attn.gen_qkv,
-         ref=kbt.attn.ref_math,
-         cmp=kbt.attn.cmp_allclose,
-     )
-     kbt.summarize(["attn_default.jsonl"])
flash_attn/impls/cells/benchmark_max_autotune.py DELETED
@@ -1,70 +0,0 @@
- # /// script
- # requires-python = ">=3.10"
- # dependencies = [
- # "numpy",
- # "torch",
- # "kernels-benchmark-tools",
- # ]
- #
- # [tool.uv.sources]
- # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
- # ///
- import torch
- import sys
- import os
- import kernels_benchmark_tools as kbt
-
-
- def torch_flash_base(q, k, v):
-     qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
-     with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
-         o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
-     return o.transpose(1, 2).contiguous()
-
-
- # Compile with max-autotune mode
- compiled_flash_max_autotune = torch.compile(torch_flash_base, mode="max-autotune", fullgraph=True, dynamic=False)
-
- kbt.add(
-     "torch_flash_compiled_max_autotune",
-     compiled_flash_max_autotune,
-     tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
- )
-
- if __name__ == "__main__":
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-     dtype = "float32" if device == "cpu" else "bfloat16"
-
-     # Flux-like workloads
-     base = 1024 if device == "cuda" else 512
-     flux_sizes = (
-         [128, 256, 320, 384, 448, 512] if device == "cuda" else [64, 128, 192, 256]
-     )
-     heads = 24 if device == "cuda" else 8
-     head_dim = 128 if device == "cuda" else 64
-
-     wl = []
-     for L in flux_sizes:
-         wl.append(
-             {
-                 "name": f"flux_L{L}",
-                 "batch": 1,
-                 "seq_len": base + L,
-                 "heads": heads,
-                 "head_dim": head_dim,
-                 "dtype": dtype,
-                 "device": device,
-                 "seed": 0,
-             }
-         )
-
-     kbt.run(
-         wl,
-         jsonl="attn_max_autotune.jsonl",
-         reps=5,
-         warmup=2,
-         gen=kbt.attn.gen_qkv,
-         ref=kbt.attn.ref_math,
-         cmp=kbt.attn.cmp_allclose,
-     )
-     kbt.summarize(["attn_max_autotune.jsonl"])
flash_attn/impls/cells/nv.py DELETED
@@ -1,3 +0,0 @@
- import subprocess
-
- print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
flash_attn/impls/compiled_variants.html DELETED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/flash_attention.html DELETED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/hf_kernels_flash_attn.html DELETED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/hf_kernels_flash_attn3.html DELETED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/index.html DELETED
@@ -1,94 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
- <title>Index of /flash_attn/impls</title>
7
- <style>
8
- :root {
9
- --bg-primary: #0a0a0a;
10
- --bg-secondary: #121212;
11
- --bg-tertiary: #181818;
12
- --text-primary: #e0e0e0;
13
- --text-secondary: #888888;
14
- --text-link: #64b5f6;
15
- --border-primary: #2a2a2a;
16
- }
17
- body {
18
- font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
- background: var(--bg-primary);
20
- color: var(--text-primary);
21
- margin: 0;
22
- padding: 16px;
23
- max-width: 900px;
24
- margin: 0 auto;
25
- }
26
- .controls {
27
- display: flex;
28
- justify-content: flex-end;
29
- margin-bottom: 1rem;
30
- }
31
- .back-button {
32
- background: var(--bg-secondary);
33
- border: 1px solid var(--border-primary);
34
- padding: 8px 12px;
35
- border-radius: 4px;
36
- color: var(--text-secondary);
37
- cursor: pointer;
38
- font-size: 0.9rem;
39
- text-decoration: none;
40
- display: inline-block;
41
- }
42
- .back-button:hover {
43
- color: var(--text-primary);
44
- background: var(--bg-tertiary);
45
- }
46
- h1 {
47
- font-size: 1.5em;
48
- margin: 1rem 0;
49
- color: var(--text-primary);
50
- border-bottom: 1px solid var(--border-primary);
51
- padding-bottom: 0.5rem;
52
- }
53
- ul {
54
- list-style-type: none;
55
- padding: 0;
56
- }
57
- li {
58
- margin: 0;
59
- border-bottom: 1px solid var(--border-primary);
60
- }
61
- li:last-child {
62
- border-bottom: none;
63
- }
64
- a {
65
- display: block;
66
- padding: 0.75rem 0.5rem;
67
- text-decoration: none;
68
- color: var(--text-link);
69
- transition: background 0.2s ease;
70
- }
71
- a:hover {
72
- background: var(--bg-secondary);
73
- }
74
- .dir {
75
- font-weight: 500;
76
- }
77
- </style>
78
- </head>
79
- <body>
80
- <div class='controls'>
81
- <a href='../index.html' class='back-button'>← back</a>
82
- </div>
83
- <h1>Index of /flash_attn/impls</h1>
84
- <ul>
85
- <li><a href='compiled_variants.html' class='file'>compiled_variants.html</a></li>
86
- <li><a href='flash_attention.html' class='file'>flash_attention.html</a></li>
87
- <li><a href='hf_kernels_flash_attn.html' class='file'>hf_kernels_flash_attn.html</a></li>
88
- <li><a href='hf_kernels_flash_attn3.html' class='file'>hf_kernels_flash_attn3.html</a></li>
89
- <li><a href='mem_efficient_attention.html' class='file'>mem_efficient_attention.html</a></li>
90
- <li><a href='sage_attention.html' class='file'>sage_attention.html</a></li>
91
- <li><a href='xformers.html' class='file'>xformers.html</a></li>
92
- </ul>
93
- </body>
94
- </html>
flash_attn/impls/mem_efficient_attention.html DELETED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/sage_attention.html DELETED
The diff for this file is too large to render. See raw diff
 
flash_attn/impls/xformers.html DELETED
The diff for this file is too large to render. See raw diff
 
flash_attn/index.html DELETED
@@ -1,89 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
- <title>Index of /flash_attn</title>
7
- <style>
8
- :root {
9
- --bg-primary: #0a0a0a;
10
- --bg-secondary: #121212;
11
- --bg-tertiary: #181818;
12
- --text-primary: #e0e0e0;
13
- --text-secondary: #888888;
14
- --text-link: #64b5f6;
15
- --border-primary: #2a2a2a;
16
- }
17
- body {
18
- font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
- background: var(--bg-primary);
20
- color: var(--text-primary);
21
- margin: 0;
22
- padding: 16px;
23
- max-width: 900px;
24
- margin: 0 auto;
25
- }
26
- .controls {
27
- display: flex;
28
- justify-content: flex-end;
29
- margin-bottom: 1rem;
30
- }
31
- .back-button {
32
- background: var(--bg-secondary);
33
- border: 1px solid var(--border-primary);
34
- padding: 8px 12px;
35
- border-radius: 4px;
36
- color: var(--text-secondary);
37
- cursor: pointer;
38
- font-size: 0.9rem;
39
- text-decoration: none;
40
- display: inline-block;
41
- }
42
- .back-button:hover {
43
- color: var(--text-primary);
44
- background: var(--bg-tertiary);
45
- }
46
- h1 {
47
- font-size: 1.5em;
48
- margin: 1rem 0;
49
- color: var(--text-primary);
50
- border-bottom: 1px solid var(--border-primary);
51
- padding-bottom: 0.5rem;
52
- }
53
- ul {
54
- list-style-type: none;
55
- padding: 0;
56
- }
57
- li {
58
- margin: 0;
59
- border-bottom: 1px solid var(--border-primary);
60
- }
61
- li:last-child {
62
- border-bottom: none;
63
- }
64
- a {
65
- display: block;
66
- padding: 0.75rem 0.5rem;
67
- text-decoration: none;
68
- color: var(--text-link);
69
- transition: background 0.2s ease;
70
- }
71
- a:hover {
72
- background: var(--bg-secondary);
73
- }
74
- .dir {
75
- font-weight: 500;
76
- }
77
- </style>
78
- </head>
79
- <body>
80
- <div class='controls'>
81
- <a href='../index.html' class='back-button'>← back</a>
82
- </div>
83
- <h1>Index of /flash_attn</h1>
84
- <ul>
85
- <li><a href='impls/index.html' class='dir'>impls/</a></li>
86
- <li><a href='results/index.html' class='dir'>results/</a></li>
87
- </ul>
88
- </body>
89
- </html>
flash_attn/results/artifacts/combine/latency.csv DELETED
@@ -1,43 +0,0 @@
1
- Implementation,Impl ID,Workload,Batch,Seq Length,Heads,Head Dim,Dtype,Mean (ms),P10 (ms),P50 (ms),P90 (ms),Reps,Peak Mem (MB),Backend,Family
2
- Flash (PyTorch SDPA),torch_flash_ma,flux_L128,1,1152,24,128,bfloat16,0.49411200881004336,0.48844799399375916,0.4936000108718872,0.4944640100002289,5,83.38,FLASH,torch-sdpa
3
- Flash (PyTorch SDPA),torch_flash_ma,flux_L256,1,1280,24,128,bfloat16,0.5234112024307251,0.5224320292472839,0.5235199928283691,0.5235840082168579,5,90.62,FLASH,torch-sdpa
4
- Flash (PyTorch SDPA),torch_flash_ma,flux_L320,1,1344,24,128,bfloat16,0.6527232170104981,0.6503040194511414,0.6524800062179565,0.6545600295066833,5,95.06,FLASH,torch-sdpa
5
- Flash (PyTorch SDPA),torch_flash_ma,flux_L384,1,1408,24,128,bfloat16,0.682803213596344,0.6805760264396667,0.6828799843788147,0.6832640171051025,5,99.88,FLASH,torch-sdpa
6
- Flash (PyTorch SDPA),torch_flash_ma,flux_L448,1,1472,24,128,bfloat16,0.7075456142425537,0.7057600021362305,0.7063360214233398,0.7070720195770264,5,103.81,FLASH,torch-sdpa
7
- Flash (PyTorch SDPA),torch_flash_ma,flux_L512,1,1536,24,128,bfloat16,0.7379711985588073,0.7368639707565308,0.7372480034828186,0.7391039729118347,5,109.12,FLASH,torch-sdpa
8
- MemEff (PyTorch SDPA),torch_mem_eff,flux_L128,1,1152,24,128,bfloat16,0.5874239921569824,0.5861759781837463,0.5873280167579651,0.5877439975738525,5,83.38,EFFICIENT,torch-sdpa
9
- MemEff (PyTorch SDPA),torch_mem_eff,flux_L256,1,1280,24,128,bfloat16,0.6502719998359681,0.6490240097045898,0.649183988571167,0.6517760157585144,5,90.62,EFFICIENT,torch-sdpa
10
- MemEff (PyTorch SDPA),torch_mem_eff,flux_L320,1,1344,24,128,bfloat16,0.7812095880508423,0.7761600017547607,0.7803199887275696,0.7852799892425537,5,95.94,EFFICIENT,torch-sdpa
11
- MemEff (PyTorch SDPA),torch_mem_eff,flux_L384,1,1408,24,128,bfloat16,0.7948480010032654,0.7911999821662903,0.7935360074043274,0.7948480248451233,5,100.0,EFFICIENT,torch-sdpa
12
- MemEff (PyTorch SDPA),torch_mem_eff,flux_L448,1,1472,24,128,bfloat16,0.8463295936584473,0.8449919819831848,0.8459839820861816,0.8461120128631592,5,103.81,EFFICIENT,torch-sdpa
13
- MemEff (PyTorch SDPA),torch_mem_eff,flux_L512,1,1536,24,128,bfloat16,0.9538687944412232,0.9492800235748291,0.9518399834632874,0.9581760168075562,5,109.12,EFFICIENT,torch-sdpa
14
- xFormers,xformers_meff,flux_L128,1,1152,24,128,bfloat16,0.4515071928501129,0.44364801049232483,0.4524799883365631,0.4557119905948639,5,83.38,memory_efficient,xformers
15
- xFormers,xformers_meff,flux_L256,1,1280,24,128,bfloat16,0.46787199974060056,0.46489599347114563,0.4684160053730011,0.46908798813819885,5,90.62,memory_efficient,xformers
16
- xFormers,xformers_meff,flux_L320,1,1344,24,128,bfloat16,0.6001471996307373,0.596992015838623,0.5984640121459961,0.6016640067100525,5,95.06,memory_efficient,xformers
17
- xFormers,xformers_meff,flux_L384,1,1408,24,128,bfloat16,0.6023231983184815,0.5997440218925476,0.6031039953231812,0.6032639741897583,5,99.88,memory_efficient,xformers
18
- xFormers,xformers_meff,flux_L448,1,1472,24,128,bfloat16,0.6411136031150818,0.6381760239601135,0.6414719820022583,0.6421440243721008,5,103.81,memory_efficient,xformers
19
- xFormers,xformers_meff,flux_L512,1,1536,24,128,bfloat16,0.6594688057899475,0.6441280245780945,0.6496639847755432,0.6527680158615112,5,109.12,memory_efficient,xformers
20
- Compiled (default),torch_flash_compiled_default,flux_L128,1,1152,24,128,bfloat16,0.5181439876556396,0.5141760110855103,0.5175679922103882,0.5197759866714478,5,83.38,FLASH,torch-sdpa
21
- Compiled (default),torch_flash_compiled_default,flux_L256,1,1280,24,128,bfloat16,0.5579584002494812,0.5549119710922241,0.5582720041275024,0.5598080158233643,5,90.62,FLASH,torch-sdpa
22
- Compiled (default),torch_flash_compiled_default,flux_L320,1,1344,24,128,bfloat16,0.6872959971427918,0.6853119730949402,0.687391996383667,0.6883519887924194,5,95.25,FLASH,torch-sdpa
23
- Compiled (default),torch_flash_compiled_default,flux_L384,1,1408,24,128,bfloat16,0.716153597831726,0.7128639817237854,0.7160959839820862,0.7167680263519287,5,99.88,FLASH,torch-sdpa
24
- Compiled (default),torch_flash_compiled_default,flux_L448,1,1472,24,128,bfloat16,0.7418303966522217,0.7386879920959473,0.7400959730148315,0.7415040135383606,5,103.81,FLASH,torch-sdpa
25
- Compiled (default),torch_flash_compiled_default,flux_L512,1,1536,24,128,bfloat16,0.7745471954345703,0.7708160281181335,0.7740799784660339,0.7753919959068298,5,109.12,FLASH,torch-sdpa
26
- Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L128,1,1152,24,128,bfloat16,0.6468096017837525,0.6144000291824341,0.6245759725570679,0.6483200192451477,5,67.5,FLASH,torch-sdpa
27
- Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L256,1,1280,24,128,bfloat16,0.7060160160064697,0.6689280271530151,0.6851199865341187,0.7184960246086121,5,75.0,FLASH,torch-sdpa
28
- Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L320,1,1344,24,128,bfloat16,0.8332608103752136,0.7953600287437439,0.8155840039253235,0.8403519988059998,5,80.38,FLASH,torch-sdpa
29
- Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L384,1,1408,24,128,bfloat16,0.8719295978546142,0.8470720052719116,0.849727988243103,0.8745279908180237,5,82.5,FLASH,torch-sdpa
30
- Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L448,1,1472,24,128,bfloat16,0.9034304022789001,0.8677120208740234,0.8835520148277283,0.9034240245819092,5,86.25,FLASH,torch-sdpa
31
- Compiled (max-autotune),torch_flash_compiled_max_autotune,flux_L512,1,1536,24,128,bfloat16,0.9387519836425782,0.9154239892959595,0.9213759899139404,0.9359679818153381,5,90.0,FLASH,torch-sdpa
32
- HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L128,1,1152,24,128,bfloat16,0.3455295979976654,0.34355199337005615,0.34563198685646057,0.34643200039863586,5,83.38,flash-attn,hf-kernels
33
- HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L256,1,1280,24,128,bfloat16,0.3756160080432892,0.37411201000213623,0.3752000033855438,0.3770880103111267,5,90.62,flash-attn,hf-kernels
34
- HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L320,1,1344,24,128,bfloat16,0.4953216016292572,0.49324798583984375,0.49433600902557373,0.49663999676704407,5,95.06,flash-attn,hf-kernels
35
- HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L384,1,1408,24,128,bfloat16,0.5157055854797363,0.5142719745635986,0.516319990158081,0.516543984413147,5,99.88,flash-attn,hf-kernels
36
- HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L448,1,1472,24,128,bfloat16,0.5356672048568726,0.5346879959106445,0.5358080267906189,0.5361599922180176,5,103.81,flash-attn,hf-kernels
37
- HF Kernels Flash Attn,hf_kernels_flash_attn,flux_L512,1,1536,24,128,bfloat16,0.5587136030197144,0.5557760000228882,0.5574079751968384,0.5581120252609253,5,109.12,flash-attn,hf-kernels
38
- HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L128,1,1152,24,128,bfloat16,0.3619711995124817,0.3603839874267578,0.361952006816864,0.3624640107154846,5,83.38,flash-attn3,hf-kernels
39
- HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L256,1,1280,24,128,bfloat16,0.3912447988986969,0.3892799913883209,0.3909760117530823,0.3922559916973114,5,90.62,flash-attn3,hf-kernels
40
- HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L320,1,1344,24,128,bfloat16,0.5258048176765442,0.5240640044212341,0.5248960256576538,0.5248960256576538,5,95.06,flash-attn3,hf-kernels
41
- HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L384,1,1408,24,128,bfloat16,0.5276032090187073,0.5265600085258484,0.5277760028839111,0.5282559990882874,5,99.88,flash-attn3,hf-kernels
42
- HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L448,1,1472,24,128,bfloat16,0.5656383991241455,0.5639039874076843,0.5657920241355896,0.5668479800224304,5,103.81,flash-attn3,hf-kernels
43
- HF Kernels Flash Attn3,hf_kernels_flash_attn3,flux_L512,1,1536,24,128,bfloat16,0.5789952039718628,0.5689600110054016,0.5698239803314209,0.5713919997215271,5,109.12,flash-attn3,hf-kernels
flash_attn/results/artifacts/combine/latency.png DELETED

Git LFS Details

  • SHA256: 87dbea8f2773d7fcee9fd191cb6e67cd1e2ddd379cef90ee01bb4ac40a55b5f1
  • Pointer size: 131 Bytes
  • Size of remote file: 110 kB
flash_attn/results/artifacts/combine/latency.svg DELETED

Git LFS Details

  • SHA256: 2c1da56080e7fd1a85c14295083b11d6bac981f6fb3faef98b0753eb2c1676c7
  • Pointer size: 130 Bytes
  • Size of remote file: 28.2 kB
flash_attn/results/cells/combine.py DELETED
@@ -1,319 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.10"
3
- # dependencies = [
4
- # "numpy",
5
- # "torch",
6
- # "kernels-benchmark-tools",
7
- # "matplotlib",
8
- # ]
9
- #
10
- # [tool.uv.sources]
11
- # kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
12
- # ///
13
- import os
14
- import sys
15
- from pathlib import Path
16
- import json
17
- import torch # noqa: F401 # imported because upstream may expect torch to be importable
18
- import kernels_benchmark_tools as kbt
19
-
20
- # --- Matplotlib setup and helpers ------------------------------------------------
21
- import matplotlib as mpl
22
- import matplotlib.pyplot as plt
23
- import csv
24
-
25
-
26
- # Keep text as text (not paths) so CSS can style fonts, size, etc.
27
- mpl.rcParams["svg.fonttype"] = "none"
28
- # Make ids deterministic across builds
29
- mpl.rcParams["svg.hashsalt"] = "latency-benchmark-combined"
30
- # Avoid auto-closed figures interfering with our tagging
31
- mpl.rcParams["figure.autolayout"] = True
32
- # Make background transparent
33
- mpl.rcParams["figure.facecolor"] = "none"
34
- mpl.rcParams["axes.facecolor"] = "none"
35
- mpl.rcParams["savefig.facecolor"] = "none"
36
- mpl.rcParams["savefig.edgecolor"] = "none"
37
-
38
- def _slugify(s: str) -> str:
39
- s = (s or "").strip().lower()
40
- keep = []
41
- for ch in s:
42
- if ch.isalnum():
43
- keep.append(ch)
44
- elif ch in (" ", "-", "_", "/", ".", ":"):
45
- keep.append("-")
46
- else:
47
- keep.append("")
48
- out = "".join(keep)
49
- while "--" in out:
50
- out = out.replace("--", "-")
51
- return out.strip("-") or "unnamed"
52
-
53
- def _tag_current_figure(default_series_prefix="series"):
54
- """Attach SVG ids (gid) to key artists so they can be targeted from CSS."""
55
- fig = plt.gcf()
56
- if fig is None:
57
- return
58
-
59
- # Tag the figure itself
60
- fig.set_gid("figure--latency")
61
-
62
- for ax_idx, ax in enumerate(fig.get_axes(), start=1):
63
- ax.set_gid(f"axes--{ax_idx}")
64
-
65
- # Axis labels & title
66
- if ax.get_title():
67
- for t in ax.texts:
68
- if t.get_text() == ax.get_title():
69
- t.set_gid("title--main")
70
- if ax.xaxis and ax.xaxis.get_label():
71
- ax.xaxis.label.set_gid("label--x")
72
- if ax.yaxis and ax.yaxis.get_label():
73
- ax.yaxis.label.set_gid("label--y")
74
-
75
- # Gridlines
76
- for i, gl in enumerate(ax.get_xgridlines(), start=1):
77
- gl.set_gid(f"grid-x--{i}")
78
- for i, gl in enumerate(ax.get_ygridlines(), start=1):
79
- gl.set_gid(f"grid-y--{i}")
80
-
81
- # Legend block & entries
82
- leg = ax.get_legend()
83
- if leg is not None:
84
- leg.set_gid("legend")
85
- for i, txt in enumerate(leg.get_texts(), start=1):
86
- label_slug = _slugify(txt.get_text())
87
- txt.set_gid(f"legend-label--{label_slug or i}")
88
-
89
- # Series (lines, patches)
90
- # Lines
91
- line_seen = {}
92
- for ln in getattr(ax, "lines", []):
93
- raw_label = ln.get_label() or ""
94
- # Matplotlib uses labels beginning with "_" for non-legendable items
95
- label = raw_label if not raw_label.startswith("_") else f"{default_series_prefix}"
96
- slug = _slugify(label)
97
- line_seen[slug] = line_seen.get(slug, 0) + 1
98
- suffix = "" if line_seen[slug] == 1 else f"-{line_seen[slug]}"
99
- ln.set_gid(f"series--{slug}{suffix}")
100
-
101
- # Patches (bars, areas)
102
- patch_seen = {}
103
- for pt in getattr(ax, "patches", []):
104
- label = getattr(pt, "get_label", lambda: "")() or f"{default_series_prefix}"
105
- if isinstance(label, str) and label.startswith("_"):
106
- label = default_series_prefix
107
- slug = _slugify(label)
108
- patch_seen[slug] = patch_seen.get(slug, 0) + 1
109
- suffix = "" if patch_seen[slug] == 1 else f"-{patch_seen[slug]}"
110
- pt.set_gid(f"series--{slug}{suffix}")
111
-
112
- def _postprocess_svg_add_classes(svg_path: Path):
113
- """Add convenient CSS classes alongside ids (e.g., class='series grid grid-x')."""
114
- try:
115
- import xml.etree.ElementTree as ET
116
- ET.register_namespace("", "http://www.w3.org/2000/svg")
117
- tree = ET.parse(svg_path)
118
- root = tree.getroot()
119
- for el in root.iter():
120
- el_id = el.attrib.get("id", "")
121
- if not el_id:
122
- continue
123
- cls = []
124
- if el_id.startswith("figure--"):
125
- cls.append("figure")
126
- elif el_id.startswith("axes--"):
127
- cls.append("axes")
128
- elif el_id.startswith("grid-x--"):
129
- cls += ["grid", "grid-x"]
130
- elif el_id.startswith("grid-y--"):
131
- cls += ["grid", "grid-y"]
132
- elif el_id.startswith("legend"):
133
- cls.append("legend")
134
- elif el_id.startswith("label--x"):
135
- cls.append("xlabel")
136
- elif el_id.startswith("label--y"):
137
- cls.append("ylabel")
138
- elif el_id.startswith("title--"):
139
- cls.append("title")
140
- elif el_id.startswith("series--"):
141
- cls.append("series")
142
- if cls:
143
- # Preserve any existing class (unlikely from Matplotlib)
144
- existing = el.attrib.get("class", "")
145
- el.set("class", (existing + " " + " ".join(cls)).strip())
146
- tree.write(svg_path, encoding="utf-8", xml_declaration=True)
147
- except Exception as e:
148
- print(f"✗ SVG postprocess (classes) skipped: {e}")
149
-
150
- # Monkey-patch savefig to force SVG & ensure tagging occurs even if kbt.viz saves internally.
151
- _orig_savefig = plt.savefig
152
- def _savefig_svg(fname, *args, **kwargs):
153
- # Always save as SVG at a stable path for the artifact system
154
- out = Path("latency.svg")
155
- kwargs["format"] = "svg"
156
- # Ensure everything we care about has ids before export
157
- _tag_current_figure()
158
- res = _orig_savefig(out, *args, **kwargs)
159
- # Add helpful CSS classes on top of ids
160
- _postprocess_svg_add_classes(out)
161
- print(f"✓ Combined visualization saved as {out}")
162
- return res
163
-
164
- plt.savefig = _savefig_svg # apply patch
165
-
166
- # Capture close calls in case kbt.viz() closes figures before we re-save
167
- _orig_close = plt.close
168
- _last_closed = {"fig": None}
169
- def _capture_close(arg=None):
170
- try:
171
- if hasattr(arg, "savefig"): # looks like a Figure
172
- _last_closed["fig"] = arg
173
- else:
174
- _last_closed["fig"] = plt.gcf()
175
- finally:
176
- return _orig_close(arg)
177
- plt.close = _capture_close
178
-
179
- # --- Locate benchmark artifacts --------------------------------------------------
180
- cache_dirs = {
181
- "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
182
- "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
183
- "Flash Attn 2": os.environ.get('UVNOTE_FILE_FLASH_ATTN2_BENCHMARK'),
184
- "xFormers": os.environ.get('UVNOTE_FILE_XFORMERS_BENCHMARK'),
185
- "SageAttention": os.environ.get('UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK'),
186
- "Compiled (default)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT'),
187
- "Compiled (max-autotune)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE'),
188
- "HF Kernels Flash Attn": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK'),
189
- "HF Kernels Flash Attn3": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK'),
190
- }
191
-
192
- print("LOADING BENCHMARK DATA")
193
- for name, cache_dir in cache_dirs.items():
194
- print(f"{name:30s}: {cache_dir}")
195
- print()
196
-
197
- file_mapping = {
198
- "Flash (PyTorch SDPA)": "attn.jsonl",
199
- "MemEff (PyTorch SDPA)": "attn.jsonl",
200
- "Flash Attn 2": "attn.jsonl",
201
- "xFormers": "attn.jsonl",
202
- "SageAttention": "attn.jsonl",
203
- "Compiled (default)": "attn_default.jsonl",
204
- "Compiled (max-autotune)": "attn_max_autotune.jsonl",
205
- "HF Kernels Flash Attn": "attn.jsonl",
206
- "HF Kernels Flash Attn3": "attn.jsonl",
207
- }
208
-
209
- all_paths = []
210
- for name, cache_dir in cache_dirs.items():
211
- if cache_dir:
212
- path = Path(cache_dir) / file_mapping[name]
213
- if path.exists() and path.stat().st_size > 0:
214
- all_paths.append(str(path))
215
- print(f"✓ Found {name}: {path}")
216
- else:
217
- print(f"⊘ Empty/Missing {name}: {path}")
218
- else:
219
- print(f"✗ No cache dir for {name}")
220
- print()
221
-
222
- if not all_paths:
223
- print("ERROR: No benchmark data files found!")
224
- # restore patched functions before exiting
225
- plt.savefig = _orig_savefig
226
- plt.close = _orig_close
227
- sys.exit(1)
228
-
229
- # --- Summary + Visualization -----------------------------------------------------
230
- print("COMBINED BENCHMARK SUMMARY\n")
231
- kbt.summarize(all_paths)
232
- print("\nGENERATING COMBINED VISUALIZATION\n")
233
-
234
- try:
235
- # If kbt.viz saves internally, our patched savefig ensures SVG gets written,
236
- # and it will carry ids/classes for CSS styling.
237
- kbt.viz(all_paths)
238
- # Safety net: if kbt.viz didn't save, save now.
239
- # if not Path("latency.svg").exists():
240
- # _tag_current_figure()
241
- # plt.savefig("latency.svg")
242
-
243
- plt.savefig("latency.svg") # ensure saved with tagging
244
-
245
- print("✓ SVG visualization ready: latency.svg!")
246
- except ImportError as e:
247
- print(f"✗ Visualization requires matplotlib: {e}")
248
- except Exception as e:
249
- print(f"✗ Visualization failed: {e}")
250
- finally:
251
- # Clean up patches to avoid side effects in later cells
252
- plt.savefig = _orig_savefig
253
- plt.close = _orig_close
254
-
255
- print()
256
- print("ANALYSIS COMPLETE")
257
- print(f"Total implementations analyzed: {len(all_paths)}")
258
- print(f"\nImplementations included:")
259
- for name, cache_dir in cache_dirs.items():
260
- if cache_dir:
261
- path = Path(cache_dir) / file_mapping[name]
262
- if path.exists() and path.stat().st_size > 0:
263
- print(f" ✓ {name}")
264
-
265
-
266
-
267
- # Collect all benchmark data and export to CSV
268
- all_data = {}
269
- for name, cache_dir in cache_dirs.items():
270
- if cache_dir:
271
- path = Path(cache_dir) / file_mapping[name]
272
- if path.exists() and path.stat().st_size > 0:
273
- with open(path, 'r') as f:
274
- records = [json.loads(line) for line in f]
275
- all_data[name] = records
276
-
277
- # Export to CSV
278
- csv_path = Path("latency.csv")
279
- with open(csv_path, 'w', newline='') as csvfile:
280
- writer = csv.writer(csvfile)
281
-
282
- # Write header
283
- header = ["Implementation", "Impl ID", "Workload", "Batch", "Seq Length", "Heads", "Head Dim", "Dtype",
284
- "Mean (ms)", "P10 (ms)", "P50 (ms)", "P90 (ms)", "Reps",
285
- # "Compile (ms)",
286
- "Peak Mem (MB)", "Backend", "Family"]
287
- writer.writerow(header)
288
-
289
- # Write data rows
290
- for impl_name, records in all_data.items():
291
- for record in records:
292
- wl = record.get('wl', {})
293
- lat = record.get('lat_ms', {})
294
- tags = record.get('tags', {})
295
-
296
- row = [
297
- impl_name,
298
- record.get('impl', ''),
299
- wl.get('name', ''),
300
- wl.get('batch', ''),
301
- wl.get('seq_len', ''),
302
- wl.get('heads', ''),
303
- wl.get('head_dim', ''),
304
- wl.get('dtype', ''),
305
- lat.get('mean', ''),
306
- lat.get('p10', ''),
307
- lat.get('p50', ''),
308
- lat.get('p90', ''),
309
- lat.get('reps', ''),
310
- # record.get('compile_ms', ''),
311
- round(record.get('peak_bytes', 0) / 1024 / 1024, 2) if record.get('peak_bytes') else '',
312
- tags.get('backend', ''),
313
- tags.get('family', ''),
314
- ]
315
- writer.writerow(row)
316
-
317
- print(f"✓ CSV export complete: {csv_path}")
318
- print(f"Total implementations: {len(all_data)}")
319
- print(f"Total records: {sum(len(records) for records in all_data.values())}")
flash_attn/results/combined_results.html DELETED
The diff for this file is too large to render. See raw diff
 
flash_attn/results/index.html DELETED
@@ -1,88 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
- <title>Index of /flash_attn/results</title>
7
- <style>
8
- :root {
9
- --bg-primary: #0a0a0a;
10
- --bg-secondary: #121212;
11
- --bg-tertiary: #181818;
12
- --text-primary: #e0e0e0;
13
- --text-secondary: #888888;
14
- --text-link: #64b5f6;
15
- --border-primary: #2a2a2a;
16
- }
17
- body {
18
- font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
- background: var(--bg-primary);
20
- color: var(--text-primary);
21
- margin: 0;
22
- padding: 16px;
23
- max-width: 900px;
24
- margin: 0 auto;
25
- }
26
- .controls {
27
- display: flex;
28
- justify-content: flex-end;
29
- margin-bottom: 1rem;
30
- }
31
- .back-button {
32
- background: var(--bg-secondary);
33
- border: 1px solid var(--border-primary);
34
- padding: 8px 12px;
35
- border-radius: 4px;
36
- color: var(--text-secondary);
37
- cursor: pointer;
38
- font-size: 0.9rem;
39
- text-decoration: none;
40
- display: inline-block;
41
- }
42
- .back-button:hover {
43
- color: var(--text-primary);
44
- background: var(--bg-tertiary);
45
- }
46
- h1 {
47
- font-size: 1.5em;
48
- margin: 1rem 0;
49
- color: var(--text-primary);
50
- border-bottom: 1px solid var(--border-primary);
51
- padding-bottom: 0.5rem;
52
- }
53
- ul {
54
- list-style-type: none;
55
- padding: 0;
56
- }
57
- li {
58
- margin: 0;
59
- border-bottom: 1px solid var(--border-primary);
60
- }
61
- li:last-child {
62
- border-bottom: none;
63
- }
64
- a {
65
- display: block;
66
- padding: 0.75rem 0.5rem;
67
- text-decoration: none;
68
- color: var(--text-link);
69
- transition: background 0.2s ease;
70
- }
71
- a:hover {
72
- background: var(--bg-secondary);
73
- }
74
- .dir {
75
- font-weight: 500;
76
- }
77
- </style>
78
- </head>
79
- <body>
80
- <div class='controls'>
81
- <a href='../index.html' class='back-button'>← back</a>
82
- </div>
83
- <h1>Index of /flash_attn/results</h1>
84
- <ul>
85
- <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
86
- </ul>
87
- </body>
88
- </html>
index.html DELETED
@@ -1,85 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
- <title>Index of /</title>
7
- <style>
8
- :root {
9
- --bg-primary: #0a0a0a;
10
- --bg-secondary: #121212;
11
- --bg-tertiary: #181818;
12
- --text-primary: #e0e0e0;
13
- --text-secondary: #888888;
14
- --text-link: #64b5f6;
15
- --border-primary: #2a2a2a;
16
- }
17
- body {
18
- font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
- background: var(--bg-primary);
20
- color: var(--text-primary);
21
- margin: 0;
22
- padding: 16px;
23
- max-width: 900px;
24
- margin: 0 auto;
25
- }
26
- .controls {
27
- display: flex;
28
- justify-content: flex-end;
29
- margin-bottom: 1rem;
30
- }
31
- .back-button {
32
- background: var(--bg-secondary);
33
- border: 1px solid var(--border-primary);
34
- padding: 8px 12px;
35
- border-radius: 4px;
36
- color: var(--text-secondary);
37
- cursor: pointer;
38
- font-size: 0.9rem;
39
- text-decoration: none;
40
- display: inline-block;
41
- }
42
- .back-button:hover {
43
- color: var(--text-primary);
44
- background: var(--bg-tertiary);
45
- }
46
- h1 {
47
- font-size: 1.5em;
48
- margin: 1rem 0;
49
- color: var(--text-primary);
50
- border-bottom: 1px solid var(--border-primary);
51
- padding-bottom: 0.5rem;
52
- }
53
- ul {
54
- list-style-type: none;
55
- padding: 0;
56
- }
57
- li {
58
- margin: 0;
59
- border-bottom: 1px solid var(--border-primary);
60
- }
61
- li:last-child {
62
- border-bottom: none;
63
- }
64
- a {
65
- display: block;
66
- padding: 0.75rem 0.5rem;
67
- text-decoration: none;
68
- color: var(--text-link);
69
- transition: background 0.2s ease;
70
- }
71
- a:hover {
72
- background: var(--bg-secondary);
73
- }
74
- .dir {
75
- font-weight: 500;
76
- }
77
- </style>
78
- </head>
79
- <body>
80
- <h1>Index of /</h1>
81
- <ul>
82
- <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
83
- </ul>
84
- </body>
85
- </html>
megablocks/cells/forward_and_backward.py DELETED
@@ -1,196 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
24
-
25
- # remove liger kernel for testing
26
- replace_kernel_forward_from_hub(GptOssRMSNorm, None)
27
-
28
- # set to debug logging
29
- logging.basicConfig(level=logging.INFO)
30
-
31
- def reset_peak_memory_stats():
32
- """Clear CUDA cache and reset memory allocation counters."""
33
- torch.cuda.empty_cache()
34
- if torch.cuda.is_available():
35
- torch.cuda.reset_peak_memory_stats()
36
- gc.collect()
37
-
38
- def get_memory_stats():
39
- """Get current and peak CUDA memory usage."""
40
- if not torch.cuda.is_available():
41
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
42
- return {
43
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
44
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
45
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
46
- }
47
-
48
- def override_kernel_layer_name(cls_name: str, value) -> bool:
49
- """Helper to dynamically override the kernel_layer_name in a model class."""
50
- for mod in sys.modules.values():
51
- if mod is None:
52
- continue
53
- obj = getattr(mod, cls_name, None)
54
- if isinstance(obj, type) and issubclass(obj, nn.Module):
55
- setattr(obj, "kernel_layer_name", value)
56
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
57
- return True
58
- return False
59
-
60
-
61
- # Init the model the normal way
62
- model_id = "openai/gpt-oss-20b"
63
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
64
- quantization_config = Mxfp4Config(dequantize=True)
65
-
66
- model = GptOssForCausalLM.from_pretrained(
67
- model_id,
68
- dtype="bfloat16",
69
- device_map="auto",
70
- use_kernels=True,
71
- quantization_config=quantization_config,
72
- ).eval()
73
-
74
- messages = [
75
- {"role": "system", "content": "What is Tensor Parallelism?"},
76
- ]
77
-
78
- inputs = tokenizer.apply_chat_template(
79
- messages,
80
- add_generation_prompt=True,
81
- return_tensors="pt",
82
- return_dict=True,
83
- reasoning_effort="low",
84
- ).to("cuda")
85
-
86
- max_tokens = 128 # Reduced to help with memory usage
87
-
88
- # Clear memory before backward pass
89
- reset_peak_memory_stats()
90
- print(f"Pre-generation memory: {get_memory_stats()}")
91
-
92
- # forward and backward pass
93
- with torch.autograd.set_grad_enabled(True):
94
- start_time = time.perf_counter()
95
- generated = model.generate(
96
- **inputs,
97
- max_new_tokens=max_tokens,
98
- do_sample=False,
99
- temperature=None,
100
- )
101
- end_time = time.perf_counter()
102
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
103
- print(f"Generation took {end_time - start_time:.2f} seconds")
104
- print(f"Post-generation memory: {get_memory_stats()}")
105
-
106
- # Use gradient checkpointing to reduce memory usage
107
- if hasattr(model, 'gradient_checkpointing_enable'):
108
- model.gradient_checkpointing_enable()
109
- print("Enabled gradient checkpointing")
110
-
111
- # Reduce sequence length if needed for memory
112
- max_seq_len = 512 # Limit sequence length for backward pass
113
- if generated.size(1) > max_seq_len:
114
- print(f"Truncating sequence from {generated.size(1)} to {max_seq_len} tokens")
115
- full_sequence = generated[:, -max_seq_len:]
116
- else:
117
- full_sequence = generated
118
-
119
- # Get model outputs for the full sequence
120
- model.train() # Enable dropout and other training behaviors
121
-
122
- try:
123
- outputs = model(
124
- input_ids=full_sequence,
125
- labels=full_sequence, # This will compute loss internally
126
- return_dict=True
127
- )
128
- print(f"Post-forward memory: {get_memory_stats()}")
129
-
130
- # If model doesn't compute loss, compute it manually
131
- if outputs.loss is None:
132
- shift_logits = outputs.logits[..., :-1, :].contiguous()
133
- shift_labels = full_sequence[..., 1:].contiguous()
134
-
135
- # Use CrossEntropyLoss with ignore_index for padding tokens
136
- loss_fct = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -100)
137
- loss = loss_fct(
138
- shift_logits.view(-1, shift_logits.size(-1)),
139
- shift_labels.view(-1)
140
- )
141
- else:
142
- loss = outputs.loss
143
-
144
- print(f"Loss: {loss.item():.4f}")
145
-
146
- # Clear intermediate tensors to save memory
147
- del outputs
148
- torch.cuda.empty_cache()
149
-
150
- # Perform backward pass with memory management
151
- print("Running backward pass...")
152
- print(f"Pre-backward memory: {get_memory_stats()}")
153
-
154
- loss.backward()
155
- print(f"Post-backward memory: {get_memory_stats()}")
156
-
157
- except torch.cuda.OutOfMemoryError as e:
158
- print(f"OOM during forward/backward pass: {e}")
159
- print("Try reducing max_tokens or max_seq_len")
160
- raise
161
-
162
- # Calculate gradient statistics and print sample gradients
163
- total_norm = 0.0
164
- param_count = 0
165
- grad_samples = {}
166
-
167
- for name, p in model.named_parameters():
168
- if p.grad is not None:
169
- param_count += 1
170
- grad_norm = p.grad.data.norm(2).item()
171
- total_norm += grad_norm ** 2
172
-
173
- # Collect gradient statistics for key layers
174
- if any(key in name for key in ['embed', 'lm_head', 'mlp.up', 'mlp.down', 'self_attn.q_proj', 'norm']):
175
- grad_samples[name] = {
176
- 'norm': grad_norm,
177
- 'mean': p.grad.data.mean().item(),
178
- 'std': p.grad.data.std().item(),
179
- 'max': p.grad.data.max().item(),
180
- 'min': p.grad.data.min().item(),
181
- }
182
-
183
- total_norm = total_norm ** 0.5
184
-
185
- print(f"\nGradient norm: {total_norm:.4f}")
186
- print(f"Parameters with gradients: {param_count}")
187
-
188
- # Print sample gradients from important layers
189
- print("\nSample gradient statistics:")
190
- for i, (name, stats) in enumerate(list(grad_samples.items())[:10]):
191
- print(f" {name[:60]:<60} | norm: {stats['norm']:.4e} | mean: {stats['mean']:.4e} | std: {stats['std']:.4e}")
192
-
193
- # Optional: zero gradients for next iteration
194
- model.zero_grad()
195
- model.eval() # Switch back to eval mode
196
-
megablocks/cells/forward_and_backward_no_kernel.py DELETED
@@ -1,196 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
24
-
25
- # remove liger kernel for testing
26
- replace_kernel_forward_from_hub(GptOssRMSNorm, None)
27
-
28
- # set to debug logging
29
- logging.basicConfig(level=logging.INFO)
30
-
31
- def reset_peak_memory_stats():
32
- """Clear CUDA cache and reset memory allocation counters."""
33
- torch.cuda.empty_cache()
34
- if torch.cuda.is_available():
35
- torch.cuda.reset_peak_memory_stats()
36
- gc.collect()
37
-
38
- def get_memory_stats():
39
- """Get current and peak CUDA memory usage."""
40
- if not torch.cuda.is_available():
41
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
42
- return {
43
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
44
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
45
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
46
- }
47
-
48
- def override_kernel_layer_name(cls_name: str, value) -> bool:
49
- """Helper to dynamically override the kernel_layer_name in a model class."""
50
- for mod in sys.modules.values():
51
- if mod is None:
52
- continue
53
- obj = getattr(mod, cls_name, None)
54
- if isinstance(obj, type) and issubclass(obj, nn.Module):
55
- setattr(obj, "kernel_layer_name", value)
56
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
57
- return True
58
- return False
59
-
60
-
61
- # Init the model the normal way
62
- model_id = "openai/gpt-oss-20b"
63
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
64
- quantization_config = Mxfp4Config(dequantize=True)
65
-
66
- model = GptOssForCausalLM.from_pretrained(
67
- model_id,
68
- dtype="bfloat16",
69
- device_map="auto",
70
- use_kernels=False,
71
- quantization_config=quantization_config,
72
- ).eval()
73
-
74
- messages = [
75
- {"role": "system", "content": "What is Tensor Parallelism?"},
76
- ]
77
-
78
- inputs = tokenizer.apply_chat_template(
79
- messages,
80
- add_generation_prompt=True,
81
- return_tensors="pt",
82
- return_dict=True,
83
- reasoning_effort="low",
84
- ).to("cuda")
85
-
86
- max_tokens = 128 # Reduced to help with memory usage
87
-
88
- # Clear memory before backward pass
89
- reset_peak_memory_stats()
90
- print(f"Pre-generation memory: {get_memory_stats()}")
91
-
92
- # forward and backward pass
93
- with torch.autograd.set_grad_enabled(True):
94
- start_time = time.perf_counter()
95
- generated = model.generate(
96
- **inputs,
97
- max_new_tokens=max_tokens,
98
- do_sample=False,
99
- temperature=None,
100
- )
101
- end_time = time.perf_counter()
102
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
103
- print(f"Generation took {end_time - start_time:.2f} seconds")
104
- print(f"Post-generation memory: {get_memory_stats()}")
105
-
106
- # Use gradient checkpointing to reduce memory usage
107
- if hasattr(model, 'gradient_checkpointing_enable'):
108
- model.gradient_checkpointing_enable()
109
- print("Enabled gradient checkpointing")
110
-
111
- # Reduce sequence length if needed for memory
112
- max_seq_len = 512 # Limit sequence length for backward pass
113
- if generated.size(1) > max_seq_len:
114
- print(f"Truncating sequence from {generated.size(1)} to {max_seq_len} tokens")
115
- full_sequence = generated[:, -max_seq_len:]
116
- else:
117
- full_sequence = generated
118
-
119
- # Get model outputs for the full sequence
120
- model.train() # Enable dropout and other training behaviors
121
-
122
- try:
123
- outputs = model(
124
- input_ids=full_sequence,
125
- labels=full_sequence, # This will compute loss internally
126
- return_dict=True
127
- )
128
- print(f"Post-forward memory: {get_memory_stats()}")
129
-
130
- # If model doesn't compute loss, compute it manually
131
- if outputs.loss is None:
132
- shift_logits = outputs.logits[..., :-1, :].contiguous()
133
- shift_labels = full_sequence[..., 1:].contiguous()
134
-
135
- # Use CrossEntropyLoss with ignore_index for padding tokens
136
- loss_fct = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -100)
137
- loss = loss_fct(
138
- shift_logits.view(-1, shift_logits.size(-1)),
139
- shift_labels.view(-1)
140
- )
141
- else:
142
- loss = outputs.loss
143
-
144
- print(f"Loss: {loss.item():.4f}")
145
-
146
- # Clear intermediate tensors to save memory
147
- del outputs
148
- torch.cuda.empty_cache()
149
-
150
- # Perform backward pass with memory management
151
- print("Running backward pass...")
152
- print(f"Pre-backward memory: {get_memory_stats()}")
153
-
154
- loss.backward()
155
- print(f"Post-backward memory: {get_memory_stats()}")
156
-
157
- except torch.cuda.OutOfMemoryError as e:
158
- print(f"OOM during forward/backward pass: {e}")
159
- print("Try reducing max_tokens or max_seq_len")
160
- raise
161
-
162
- # Calculate gradient statistics and print sample gradients
163
- total_norm = 0.0
164
- param_count = 0
165
- grad_samples = {}
166
-
167
- for name, p in model.named_parameters():
168
- if p.grad is not None:
169
- param_count += 1
170
- grad_norm = p.grad.data.norm(2).item()
171
- total_norm += grad_norm ** 2
172
-
173
- # Collect gradient statistics for key layers
174
- if any(key in name for key in ['embed', 'lm_head', 'mlp.up', 'mlp.down', 'self_attn.q_proj', 'norm']):
175
- grad_samples[name] = {
176
- 'norm': grad_norm,
177
- 'mean': p.grad.data.mean().item(),
178
- 'std': p.grad.data.std().item(),
179
- 'max': p.grad.data.max().item(),
180
- 'min': p.grad.data.min().item(),
181
- }
182
-
183
- total_norm = total_norm ** 0.5
184
-
185
- print(f"\nGradient norm: {total_norm:.4f}")
186
- print(f"Parameters with gradients: {param_count}")
187
-
188
- # Print sample gradients from important layers
189
- print("\nSample gradient statistics:")
190
- for i, (name, stats) in enumerate(list(grad_samples.items())[:10]):
191
- print(f" {name[:60]:<60} | norm: {stats['norm']:.4e} | mean: {stats['mean']:.4e} | std: {stats['std']:.4e}")
192
-
193
- # Optional: zero gradients for next iteration
194
- model.zero_grad()
195
- model.eval() # Switch back to eval mode
196
-
megablocks/cells/forward_only.py DELETED
@@ -1,101 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
24
-
25
-
26
- replace_kernel_forward_from_hub(GptOssRMSNorm, None)
27
-
28
- # set to debug logging
29
- logging.basicConfig(level=logging.INFO)
30
-
31
- def reset_peak_memory_stats():
32
- """Clear CUDA cache and reset memory allocation counters."""
33
- torch.cuda.empty_cache()
34
- if torch.cuda.is_available():
35
- torch.cuda.reset_peak_memory_stats()
36
- gc.collect()
37
-
38
- def get_memory_stats():
39
- """Get current and peak CUDA memory usage."""
40
- if not torch.cuda.is_available():
41
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
42
- return {
43
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
44
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
45
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
46
- }
47
-
48
- def override_kernel_layer_name(cls_name: str, value) -> bool:
49
- """Helper to dynamically override the kernel_layer_name in a model class."""
50
- for mod in sys.modules.values():
51
- if mod is None:
52
- continue
53
- obj = getattr(mod, cls_name, None)
54
- if isinstance(obj, type) and issubclass(obj, nn.Module):
55
- setattr(obj, "kernel_layer_name", value)
56
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
57
- return True
58
- return False
59
-
60
-
61
- # Init the model the normal way
62
- model_id = "openai/gpt-oss-20b"
63
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
64
- quantization_config = Mxfp4Config(dequantize=True)
65
-
66
-
67
-
68
- model = GptOssForCausalLM.from_pretrained(
69
- model_id,
70
- dtype="bfloat16",
71
- device_map="auto",
72
- use_kernels=True,
73
- quantization_config=quantization_config,
74
- ).eval()
75
-
76
- messages = [
77
- {"role": "system", "content": "What is Tensor Parallelism?"},
78
- ]
79
-
80
- inputs = tokenizer.apply_chat_template(
81
- messages,
82
- add_generation_prompt=True,
83
- return_tensors="pt",
84
- return_dict=True,
85
- reasoning_effort="low",
86
- ).to("cuda")
87
-
88
- max_tokens = 256
89
-
90
- with torch.inference_mode():
91
- start_time = time.perf_counter()
92
- generated = model.generate(
93
- **inputs,
94
- max_new_tokens=max_tokens,
95
- do_sample=False,
96
- temperature=None,
97
- )
98
- end_time = time.perf_counter()
99
-
100
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
101
- print(f"Generation took {end_time - start_time:.2f} seconds")
megablocks/cells/no_kernels.py DELETED
@@ -1,98 +0,0 @@
1
- # /// script
2
- # requires-python = ">=3.12"
3
- # dependencies = [
4
- # "accelerate>=1.10.1",
5
- # "torch>=2.7.0",
6
- # "kernels==0.10.0",
7
- # "transformers@https://github.com/huggingface/transformers.git",
8
- # "ipdb>=0.13.13",
9
- # "matplotlib>=3.7.2",
10
- # "numpy>=1.24.3",
11
- # ]
12
- # ///
13
-
14
- import torch
15
- from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
- import time
17
- import torch.nn as nn
18
- from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
19
- import sys
20
- import torch.profiler
21
- import gc
22
- import logging
23
- from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
24
-
25
- # set to debug logging
26
- logging.basicConfig(level=logging.INFO)
27
-
28
- def reset_peak_memory_stats():
29
- """Clear CUDA cache and reset memory allocation counters."""
30
- torch.cuda.empty_cache()
31
- if torch.cuda.is_available():
32
- torch.cuda.reset_peak_memory_stats()
33
- gc.collect()
34
-
35
- def get_memory_stats():
36
- """Get current and peak CUDA memory usage."""
37
- if not torch.cuda.is_available():
38
- return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
39
- return {
40
- "allocated_gb": torch.cuda.memory_allocated() / 1e9,
41
- "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
42
- "reserved_gb": torch.cuda.memory_reserved() / 1e9,
43
- }
44
-
45
- def override_kernel_layer_name(cls_name: str, value) -> bool:
46
- """Helper to dynamically override the kernel_layer_name in a model class."""
47
- for mod in sys.modules.values():
48
- if mod is None:
49
- continue
50
- obj = getattr(mod, cls_name, None)
51
- if isinstance(obj, type) and issubclass(obj, nn.Module):
52
- setattr(obj, "kernel_layer_name", value)
53
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
54
- return True
55
- return False
56
-
57
-
58
- # Init the model the normal way
59
- model_id = "openai/gpt-oss-20b"
60
- tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
61
- quantization_config = Mxfp4Config(dequantize=True)
62
-
63
-
64
-
65
- model = GptOssForCausalLM.from_pretrained(
66
- model_id,
67
- dtype="bfloat16",
68
- device_map="auto",
69
- use_kernels=False,
70
- quantization_config=quantization_config,
71
- ).eval()
72
-
73
- messages = [
74
- {"role": "system", "content": "What is Tensor Parallelism?"},
75
- ]
76
-
77
- inputs = tokenizer.apply_chat_template(
78
- messages,
79
- add_generation_prompt=True,
80
- return_tensors="pt",
81
- return_dict=True,
82
- reasoning_effort="low",
83
- ).to("cuda")
84
-
85
- max_tokens = 256
86
-
87
- with torch.inference_mode():
88
- start_time = time.perf_counter()
89
- generated = model.generate(
90
- **inputs,
91
- max_new_tokens=max_tokens,
92
- do_sample=False,
93
- temperature=None,
94
- )
95
- end_time = time.perf_counter()
96
-
97
- print(tokenizer.decode(generated[0], skip_special_tokens=False))
98
- print(f"Generation took {end_time - start_time:.2f} seconds")
megablocks/cells/nv.py DELETED
@@ -1,3 +0,0 @@
1
- import subprocess
2
-
3
- print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
megablocks/index.html DELETED
@@ -1,24 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset='UTF-8'>
5
- <title>Directory Index</title>
6
- <style>
7
- body { font-family: monospace; margin: 20px; }
8
- h1 { font-size: 1.5em; }
9
- ul { list-style-type: none; padding-left: 20px; }
10
- li { margin: 5px 0; }
11
- .dir { font-weight: bold; }
12
- .file { color: #0066cc; }
13
- a { text-decoration: none; }
14
- a:hover { text-decoration: underline; }
15
- </style>
16
- </head>
17
- <body>
18
- <h1>Index of /megablocks</h1>
19
- <ul>
20
- <li><a href='../index.html' class='dir'>../</a></li>
21
- <li><a href='megablocks_only.html' class='file'>megablocks_only.html</a></li>
22
- </ul>
23
- </body>
24
- </html>
megablocks/megablocks_only.html DELETED
The diff for this file is too large to render. See raw diff
 
megablocks_yamoe/artifacts/binned_run/binned_results.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "implementation": "binned_results",
3
- "config": {
4
- "warmup": 10,
5
- "iters": 50,
6
- "device": "cuda",
7
- "dtype": "torch.float32",
8
- "tokens": 100,
9
- "vary_inputs": true
10
- },
11
- "stats": {
12
- "avg_ms": 36.26809924006011,
13
- "min_ms": 34.103908000361116,
14
- "max_ms": 37.68557000057626,
15
- "std_ms": 1.1598518125118418,
16
- "p50_ms": 36.52223600056459,
17
- "p95_ms": 37.6427445000445,
18
- "p99_ms": 37.677440410316194,
19
- "num_iters": 50,
20
- "tokens_per_s": 2757.2440269917565,
21
- "throughput_variance": 89.13103199163609
22
- },
23
- "output_sum": 3.97190523147583
24
- }
megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "implementation": "gptoss_results",
3
- "config": {
4
- "warmup": 10,
5
- "iters": 50,
6
- "device": "cuda",
7
- "dtype": "torch.float32",
8
- "tokens": 100,
9
- "vary_inputs": true
10
- },
11
- "stats": {
12
- "avg_ms": 46.913985819956,
13
- "min_ms": 40.44806400088419,
14
- "max_ms": 51.07520399997156,
15
- "std_ms": 2.9921332618008196,
16
- "p50_ms": 47.418902999652346,
17
- "p95_ms": 50.800493049837314,
18
- "p99_ms": 50.948625239852845,
19
- "num_iters": 50,
20
- "tokens_per_s": 2131.560519794133,
21
- "throughput_variance": 139.93911554997217
22
- },
23
- "output_sum": 11.53223705291748
24
- }
megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "implementation": "gptoss_training_results",
3
- "config": {
4
- "warmup": 10,
5
- "iters": 50,
6
- "device": "cuda",
7
- "dtype": "torch.float32",
8
- "tokens": 100,
9
- "vary_inputs": true
10
- },
11
- "stats": {
12
- "avg_ms": 46.289439859992854,
13
- "min_ms": 39.97907499979192,
14
- "max_ms": 50.58144600025116,
15
- "std_ms": 2.9172154402078077,
16
- "p50_ms": 46.64785849990949,
17
- "p95_ms": 50.26727430031315,
18
- "p99_ms": 50.5162941305025,
19
- "num_iters": 50,
20
- "tokens_per_s": 2160.3199412751637,
21
- "throughput_variance": 139.86427060112865
22
- },
23
- "output_sum": 11.53223705291748
24
- }
megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "implementation": "yamoe_results",
3
- "config": {
4
- "warmup": 10,
5
- "iters": 50,
6
- "device": "cuda",
7
- "dtype": "torch.float32",
8
- "tokens": 100,
9
- "vary_inputs": true
10
- },
11
- "stats": {
12
- "avg_ms": 4.248197240067384,
13
- "min_ms": 4.136622000260104,
14
- "max_ms": 4.280714999367774,
15
- "std_ms": 0.02141682051311511,
16
- "p50_ms": 4.253484999935608,
17
- "p95_ms": 4.265540049709671,
18
- "p99_ms": 4.273649199667489,
19
- "num_iters": 50,
20
- "tokens_per_s": 23539.396677922097,
21
- "throughput_variance": 120.66648678204231
22
- },
23
- "output_sum": 3.97190523147583
24
- }
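The four *_results.json artifacts above share the same schema (a config block plus a stats block with avg_ms, p50_ms, tokens_per_s, and so on). A minimal sketch for comparing them — assuming the files sit in the current directory under the names shown; this is not part of the commit:

import json

files = {
    "yamoe": "yamoe_results.json",
    "binned": "binned_results.json",
    "gptoss": "gptoss_results.json",
    "gptoss_training": "gptoss_training_results.json",
}

# Read the "stats" block from each result file.
stats = {name: json.load(open(path))["stats"] for name, path in files.items()}
baseline = stats["yamoe"]["avg_ms"]

# Sort fastest-first and report average latency, throughput, and relative slowdown.
for name, s in sorted(stats.items(), key=lambda kv: kv[1]["avg_ms"]):
    print(f"{name:16s} avg {s['avg_ms']:7.3f} ms | "
          f"{s['tokens_per_s']:9.1f} tok/s | "
          f"{s['avg_ms'] / baseline:5.2f}x vs yamoe")

On the numbers recorded above, this puts the Yamoe run at roughly 8.5-11x lower average latency than the binned and GPT-OSS reference paths.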
megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc DELETED
Binary file (16.1 kB)
 
megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc DELETED
Binary file (680 Bytes)
 
megablocks_yamoe/cells/bench_utils.py DELETED
@@ -1,241 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "torch",
4
- # "numpy",
5
- # ]
6
- # ///
7
-
8
- """Reusable benchmarking utilities for performance testing."""
9
- import time
10
- import numpy as np
11
- from contextlib import contextmanager
12
- from typing import Callable, Dict, Tuple, Any, Optional
13
- import torch
14
-
15
- def to_dtype(dtype_str: str):
16
- """Convert string to torch dtype."""
17
- if dtype_str == "float16":
18
- return torch.float16
19
- if dtype_str == "bfloat16":
20
- return torch.bfloat16
21
- return torch.float32
22
-
23
- def _sync(device: str):
24
- """Synchronize device if CUDA."""
25
- if device == "cuda":
26
- torch.cuda.synchronize()
27
-
28
- def _compute_stats(times_s, tokens: Optional[int] = None) -> Dict[str, float]:
29
- """Compute comprehensive latency and throughput statistics."""
30
- lat_ms = np.array([t * 1000.0 for t in times_s])
31
- lat_ms_sorted = np.sort(lat_ms)
32
- n = len(lat_ms)
33
-
34
- stats = {
35
- "avg_ms": np.mean(lat_ms),
36
- "min_ms": np.min(lat_ms),
37
- "max_ms": np.max(lat_ms),
38
- "std_ms": np.std(lat_ms),
39
- "p50_ms": np.percentile(lat_ms, 50),
40
- "p95_ms": np.percentile(lat_ms, 95),
41
- "p99_ms": np.percentile(lat_ms, 99),
42
- "num_iters": n
43
- }
44
-
45
- if tokens is not None and n > 0:
46
- avg_s = np.mean(times_s)
47
- stats["tokens_per_s"] = tokens / avg_s if avg_s > 0 else float("inf")
48
- stats["throughput_variance"] = np.std([tokens / t for t in times_s if t > 0])
49
-
50
- return stats
51
-
52
- def _format_timing_stats(stats: Dict[str, float], tokens: Optional[int] = None) -> str:
53
- """Format timing statistics for display."""
54
- lines = [
55
- "\n━━━━━━━━━━━━━━━━━━━━ Benchmark Results ━━━━━━━━━━━━━━━━━━━━",
56
- f"Iterations: {stats.get('num_iters', 0)}",
57
- "\nLatency Statistics:",
58
- f" Average: {stats['avg_ms']:.3f} ms",
59
- f" Min: {stats['min_ms']:.3f} ms",
60
- f" Max: {stats['max_ms']:.3f} ms",
61
- f" Std Dev: {stats['std_ms']:.3f} ms",
62
- "\nPercentiles:",
63
- f" P50 (median): {stats['p50_ms']:.3f} ms",
64
- f" P95: {stats['p95_ms']:.3f} ms",
65
- f" P99: {stats['p99_ms']:.3f} ms",
66
- ]
67
-
68
- if tokens is not None and 'tokens_per_s' in stats:
69
- lines.extend([
70
- "\nThroughput:",
71
- f" Tokens/sec: {stats['tokens_per_s']:.1f}",
72
- f" Std Dev: {stats.get('throughput_variance', 0):.1f}",
73
- ])
74
-
75
- lines.append("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
76
- return "\n".join(lines)
77
-
78
- def _bench_engine(
79
- call: Callable[[], Any], *, warmup: int, iters: int, device: str, dtype, input_gen: Callable[[], Any] = None
80
- ) -> Tuple[Any, list]:
81
- """Core benchmarking engine with warmup and timing."""
82
- use_autocast = device == "cuda" and dtype in (torch.float16, torch.bfloat16)
83
-
84
- # Warmup phase
85
- print(f"\nWarming up ({warmup} iterations)...")
86
- with torch.inference_mode():
87
- for _ in range(max(0, warmup)):
88
- if use_autocast:
89
- with torch.autocast(device_type="cuda", dtype=dtype):
90
- if input_gen is not None:
91
- _ = call(input_gen())
92
- else:
93
- _ = call()
94
- else:
95
- if input_gen is not None:
96
- _ = call(input_gen())
97
- else:
98
- _ = call()
99
- _sync(device)
100
-
101
- # Measurement phase
102
- print(f"Benchmarking ({iters} iterations)...")
103
- times_s = []
104
- last = None
105
- with torch.inference_mode():
106
- for i in range(max(1, iters)):
107
- start = time.perf_counter()
108
- if use_autocast:
109
- with torch.autocast(device_type="cuda", dtype=dtype):
110
- if input_gen is not None:
111
- last = call(input_gen())
112
- else:
113
- last = call()
114
- else:
115
- if input_gen is not None:
116
- last = call(input_gen())
117
- else:
118
- last = call()
119
- _sync(device)
120
- end = time.perf_counter()
121
- times_s.append(end - start)
122
-
123
- # Progress indicator every 20% of iterations
124
- if i > 0 and i % max(1, iters // 5) == 0:
125
- pct = (i / iters) * 100
126
- avg_so_far = np.mean(times_s[:i]) * 1000
127
- print(f" Progress: {pct:.0f}% complete (avg: {avg_so_far:.3f} ms)")
128
-
129
- return last, times_s
130
-
131
- def tensor_stats(t: torch.Tensor) -> str:
132
- """Generate comprehensive stats string for a tensor."""
133
- return (f"shape={tuple(t.shape)}, "
134
- f"dtype={t.dtype}, "
135
- f"device={t.device}, "
136
- f"range=[{t.min().item():.6f}, {t.max().item():.6f}], "
137
- f"mean={t.mean().item():.6f}, "
138
- f"std={t.std().item():.6f}, "
139
- f"norm={t.norm().item():.6f}")
140
-
141
- @contextmanager
142
- def bench_context(
143
- *, warmup: int = 25, iters: int = 100, device: str = "cuda", dtype=torch.float32, tokens: Optional[int] = None, verbose: bool = True, save_json: Optional[str] = None, vary_inputs: bool = True
144
- ):
145
- """Context that yields a runner: runner(fn, *args, **kwargs) -> (result, stats).
146
-
147
- If vary_inputs=True, the first argument should be a base tensor that will be varied each iteration
148
- by adding a small deterministic increment to prevent caching artifacts.
149
- """
150
-
151
- def runner(fn: Callable[..., Any], *args, **kwargs) -> Tuple[Any, Dict[str, float]]:
152
- # Log configuration
153
- if verbose:
154
- print(f"\n┌─ Benchmark Configuration ─────────────────────────────┐")
155
- # print(f"│ Device: {device:<15} Dtype: {dtype} │")
156
- print(f"│ Warmup: {warmup:<15} Iters: {iters} │")
157
- if tokens:
158
- print(f"│ Tokens: {tokens} │")
159
- if vary_inputs:
160
- print(f"│ Input Variation: Enabled (prevents caching artifacts) │")
161
- print(f"└────────────────────────────────────────────────────────┘")
162
-
163
- # Set up input generation
164
- input_gen = None
165
- if vary_inputs and args and isinstance(args[0], torch.Tensor):
166
- base_input = args[0].clone()
167
- iteration_counter = [0] # Use list for mutable closure
168
-
169
- def generate_varied_input():
170
- """Generate input tensor varied by iteration to prevent caching."""
171
- # Add small deterministic increment: 0.001 * iteration_number
172
- varied_input = base_input + (iteration_counter[0] * 0.001)
173
- iteration_counter[0] += 1
174
- return varied_input
175
-
176
- input_gen = generate_varied_input
177
- call = lambda x: fn(x, *args[1:], **kwargs)
178
-
179
- # Log base input stats
180
- if verbose:
181
- print(f"\nBase Input: {tensor_stats(base_input)}")
182
- print(f"Input Variation: +{0.001:.3f} * iteration (deterministic)")
183
- else:
184
- # Legacy mode - static inputs
185
- call = lambda: fn(*args, **kwargs)
186
- if verbose and args and isinstance(args[0], torch.Tensor):
187
- print(f"\nInput: {tensor_stats(args[0])}")
188
-
189
- result, times_s = _bench_engine(call, warmup=warmup, iters=iters, device=device, dtype=dtype, input_gen=input_gen)
190
-
191
- # Log output if it's a tensor or tuple with tensors
192
- if verbose:
193
- print("\nOutput tensors:")
194
- if isinstance(result, torch.Tensor):
195
- print(f" Primary: {tensor_stats(result)}")
196
- elif isinstance(result, tuple) and len(result) > 0 and isinstance(result[0], torch.Tensor):
197
- print(f" Primary: {tensor_stats(result[0])}")
198
- if len(result) > 1:
199
- if isinstance(result[1], torch.Tensor):
200
- print(f" Auxiliary: {tensor_stats(result[1])}")
201
- else:
202
- print(f" Auxiliary: {type(result[1]).__name__}")
203
-
204
- # Compute and display statistics
205
- stats = _compute_stats(times_s, tokens=tokens)
206
- if verbose:
207
- print(_format_timing_stats(stats, tokens))
208
-
209
- # Save to JSON if requested
210
- if save_json:
211
- import json
212
- json_data = {
213
- "implementation": save_json.replace(".json", ""),
214
- "config": {
215
- "warmup": warmup,
216
- "iters": iters,
217
- "device": str(device), # Convert device to string
218
- "dtype": str(dtype),
219
- "tokens": tokens,
220
- "vary_inputs": vary_inputs
221
- },
222
- "stats": stats,
223
- "output_sum": float(result[0].sum().item()) if isinstance(result, tuple) and len(result) > 0 else float(result.sum().item()) if isinstance(result, torch.Tensor) else None
224
- }
225
- with open(save_json, 'w') as f:
226
- json.dump(json_data, f, indent=2)
227
- if verbose:
228
- print(f"\nSaved benchmark results to {save_json}")
229
-
230
- return result, stats
231
-
232
- yield runner
233
-
234
- def set_seed(seed: int):
235
- """Set seeds for reproducibility."""
236
- torch.manual_seed(seed)
237
- if torch.cuda.is_available():
238
- torch.cuda.manual_seed(seed)
239
- torch.cuda.manual_seed_all(seed)
240
- torch.backends.cudnn.deterministic = True
241
- torch.backends.cudnn.benchmark = False
megablocks_yamoe/cells/binned_run.py DELETED
@@ -1,195 +0,0 @@
1
- # /// script
2
- # dependencies = [
3
- # "torch",
4
- # "numpy",
5
- # ]
6
- # ///
7
-
8
- import torch
9
- from torch import nn
10
- from torch.nn import functional as F
11
- from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
12
- from config import (
13
- NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
14
- BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
15
- WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
16
- )
17
- from pathlib import Path
18
- import os
19
-
20
- # Discover the upstream artifact directory from env
21
- data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
22
-
23
- router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
24
- router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
25
- gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
26
- gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
27
- down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
28
- down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
29
-
30
- print("Loaded shared weights from artifacts")
31
- print(f"Router weight sum: {router_weight.sum().item():.6f}")
32
- print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
33
- print(f"Down sum: {down_proj.sum().item():.6f}")
34
-
35
- def binned_gather(x, indices, bins, expert_capacity, top_k):
36
- E, H = bins.shape[0], x.shape[1]
37
- out = torch.zeros((E, expert_capacity, H), device=x.device, dtype=x.dtype)
38
- for e in range(E):
39
- start = 0 if e == 0 else bins[e - 1]
40
- end = bins[e]
41
- n = min(end - start, expert_capacity)
42
- for i in range(n):
43
- flat_pos = indices[start + i]
44
- tok = flat_pos // top_k
45
- out[e, i] = x[tok]
46
- return out
47
-
48
- def binned_scatter(x, indices, weights, bins, expert_capacity, top_k):
-     E, C, H = x.shape
-     N = indices.shape[0] // top_k
-     out = torch.zeros((N, top_k, H), dtype=x.dtype, device=x.device)
-     for e in range(E):
-         start = 0 if e == 0 else bins[e - 1]
-         end = bins[e]
-         n = end - start
-         if n == 0:
-             continue
-         take = min(n, expert_capacity)
-         for i in range(take):
-             flat_pos = indices[start + i]
-             tok = flat_pos // top_k
-             slot = flat_pos % top_k
-             scale = weights[flat_pos] if weights is not None else 1.0
-             out[tok, slot] = x[e, i] * scale
-     return out.sum(dim=1)
-
- def sort_tokens_by_expert(router_indices, num_experts):
-     flat_indices = router_indices.flatten()
-     sorted_values, sorted_indices = torch.sort(flat_indices)
-     tokens_per_expert = torch.bincount(sorted_values, minlength=num_experts)
-     bins = torch.cumsum(tokens_per_expert, dim=0)
-     return sorted_indices, sorted_values, bins, tokens_per_expert
-
- def binned_experts_ref(
-     hidden_states,
-     router_indices,
-     routing_weights,
-     gate_up_proj,
-     gate_up_proj_bias,
-     down_proj,
-     down_proj_bias,
-     expert_capacity,
- ):
-     B, S, H = hidden_states.shape
-     E, K = routing_weights.shape[1], router_indices.shape[1]
-
-     indices, _, bins, _ = sort_tokens_by_expert(router_indices, E)
-     x = binned_gather(hidden_states.view(-1, H), indices, bins, expert_capacity, K)
-
-     gate_up = torch.bmm(x, gate_up_proj)
-     gate_up += gate_up_proj_bias[..., None, :]
-
-     gate, up = gate_up[..., ::2], gate_up[..., 1::2]
-
-     # clamp to limit
-     limit = 7.0
-     gate = gate.clamp(min=None, max=limit)
-     up = up.clamp(min=-limit, max=limit)
-
-     glu = gate * torch.sigmoid(gate * 1.702)
-     x = (up + 1) * glu
-     x = torch.bmm(x, down_proj) + down_proj_bias[..., None, :]
-
-     # build routing weights aligned to (token, slot)
-     flat_dense = routing_weights.view(-1, E)
-     flat_router = router_indices.view(-1, K)
-     selected = torch.gather(flat_dense, 1, flat_router).reshape(-1)
-
-     # scatter back
-     y = binned_scatter(x, indices, selected, bins, expert_capacity, K)
-
-     return y.view(B, S, H)
-
- class BinnedRouter(nn.Module):
-     def __init__(self, router_weight, router_bias):
-         super().__init__()
-         self.top_k = TOP_K
-         self.num_experts = NUM_EXPERTS
-         self.hidden_dim = HIDDEN_SIZE
-         self.weight = nn.Parameter(router_weight.clone())
-         self.bias = nn.Parameter(router_bias.clone())
-
-     def forward(self, hidden_states):
-         hidden_states = hidden_states.reshape(-1, self.hidden_dim)
-         router_logits = F.linear(hidden_states, self.weight, self.bias)
-         router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
-         router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
-         router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
-         return router_scores, router_indices
-
- def ceil_div(a, b):
-     return (a + b - 1) // b
-
- class BinnedMoEMLP(nn.Module):
-     def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
-         super().__init__()
-         self.router = BinnedRouter(router_weight, router_bias)
-         self.num_experts = NUM_EXPERTS
-         self.hidden_size = HIDDEN_SIZE
-         self.top_k = TOP_K
-
-         # Expert weights - use the loaded weights
-         self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
-         self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
-         self.down_proj = nn.Parameter(down_proj.clone())
-         self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
-
-     def forward(self, hidden_states):
-         router_scores, router_indices = self.router(hidden_states)
-         batch_size = hidden_states.shape[0]
-         expert_capacity = ceil_div(batch_size * self.top_k, self.num_experts)
-
-         output = binned_experts_ref(
-             hidden_states,
-             router_indices,
-             router_scores,
-             self.gate_up_proj,
-             self.gate_up_proj_bias,
-             self.down_proj,
-             self.down_proj_bias,
-             expert_capacity,
-         )
-
-         return output, router_scores
-
- # Run the model
- set_seed(GENERAL_SEED)
-
- device = torch.device(DEVICE)
- dtype = to_dtype(DTYPE)
-
- print("\n=== Binned Implementation ===")
- # Initialize model with loaded weights
- model = BinnedMoEMLP(
-     router_weight.to(device),
-     router_bias.to(device),
-     gate_up_proj.to(device),
-     gate_up_proj_bias.to(device),
-     down_proj.to(device),
-     down_proj_bias.to(device)
- ).to(device=device)
-
- print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
- print(f"Gate/up proj sum: {model.gate_up_proj.sum().item():.6f}")
- print(f"Down proj sum: {model.down_proj.sum().item():.6f}")
-
- # Generate the same input as Yamoe
- set_seed(INPUT_SEED)
- x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
-
- # Benchmark the model with varied inputs to prevent caching artifacts
- tokens = BATCH_SIZE * SEQ_LEN
- with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="binned_results.json", vary_inputs=True) as bench:
-     output, stats = bench(model, x)
-     print(f"\nOutput sum: {output[0].sum().item():.6f}")
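Note: a minimal sketch (not part of the deleted cell, assuming the definitions above) of what sort_tokens_by_expert computes for a toy routing; bincount gives per-expert token counts and the running cumsum gives the bin boundaries that binned_gather/binned_scatter walk per expert.

    import torch

    # 2 tokens, top_k = 2, 3 experts: token 0 -> experts {0, 2}, token 1 -> experts {1, 0}
    router_indices = torch.tensor([[0, 2], [1, 0]])
    flat = router_indices.flatten()                                 # tensor([0, 2, 1, 0])
    sorted_values, sorted_positions = torch.sort(flat)              # values [0, 0, 1, 2]
    tokens_per_expert = torch.bincount(sorted_values, minlength=3)  # tensor([2, 1, 1])
    bins = torch.cumsum(tokens_per_expert, dim=0)                   # tensor([2, 3, 4])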
megablocks_yamoe/cells/config.py DELETED
@@ -1,27 +0,0 @@
- # /// script
- # dependencies = [
- #     "torch",
- #     "numpy",
- # ]
- # ///
-
- """Shared configuration for both implementations."""
- import torch
-
- # Model configuration
- NUM_EXPERTS = 128
- HIDDEN_SIZE = 1152
- INTERMEDIATE_SIZE = 3072
- TOP_K = 4
-
- # Input configuration
- BATCH_SIZE = 1
- SEQ_LEN = 100
- DTYPE = "float32"
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
- # Seeds for reproducibility
- WEIGHT_SEED = 999
- EXPERT_SEED = 777
- INPUT_SEED = 123
- GENERAL_SEED = 42
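Note: a small, assumed usage sketch of how the benchmark cells consume this module; each cell imports the same constants and seeds, so every implementation sees an identical (1, 100, 1152) input.

    import torch
    from config import BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, DEVICE, INPUT_SEED

    torch.manual_seed(INPUT_SEED)
    x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=DEVICE) * 0.1  # shape (1, 100, 1152)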
megablocks_yamoe/cells/gptoss_run.py DELETED
@@ -1,147 +0,0 @@
- # /// script
- # dependencies = [
- #     "torch",
- #     "numpy",
- # ]
- # ///
-
- import torch
- from torch import nn
- from torch.nn import functional as F
- from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
- from config import (
-     NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
-     BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
-     WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
- )
- from pathlib import Path
- import os
-
- # Discover the upstream artifact directory from env
- data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
-
- router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
- router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
- gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
- gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
- down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
- down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
-
- print("Loaded shared weights from artifacts")
- print(f"Router weight sum: {router_weight.sum().item():.6f}")
- print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
- print(f"Down sum: {down_proj.sum().item():.6f}")
-
- class GptOssRouter(nn.Module):
-     def __init__(self, router_weight, router_bias):
-         super().__init__()
-         self.top_k = TOP_K
-         self.num_experts = NUM_EXPERTS
-         self.hidden_dim = HIDDEN_SIZE
-         self.weight = nn.Parameter(router_weight.clone())
-         self.bias = nn.Parameter(router_bias.clone())
-
-     def forward(self, hidden_states):
-         hidden_states = hidden_states.reshape(-1, self.hidden_dim)
-         router_logits = F.linear(hidden_states, self.weight, self.bias)
-         router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
-         router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
-         router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
-         return router_scores, router_indices
-
- class GptOssExperts(nn.Module):
-     def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
-         super().__init__()
-         self.num_experts = NUM_EXPERTS
-         self.hidden_size = HIDDEN_SIZE
-         self.expert_dim = self.hidden_size
-         self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
-         self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
-         self.down_proj = nn.Parameter(down_proj.clone())
-         self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
-         self.alpha = 1.702
-         self.limit = 7.0
-
-     def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
-         batch_size = hidden_states.shape[0]
-         hidden_states = hidden_states.reshape(-1, self.hidden_size)
-         num_experts = routing_weights.shape[1]
-
-         if hidden_states.device.type == "cpu" or self.training:
-             next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
-             with torch.no_grad():
-                 expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
-                 expert_mask = expert_mask.permute(2, 1, 0)
-                 expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-
-             for expert_idx in expert_hit[:]:
-                 expert_idx = expert_idx[0]
-                 with torch.no_grad():
-                     _, token_idx = torch.where(expert_mask[expert_idx])
-                 current_state = hidden_states[token_idx]
-                 gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
-                 gate, up = gate_up[..., ::2], gate_up[..., 1::2]
-                 gate = gate.clamp(min=None, max=self.limit)
-                 up = up.clamp(min=-self.limit, max=self.limit)
-                 glu = gate * torch.sigmoid(gate * self.alpha)
-                 gated_output = (up + 1) * glu
-                 out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
-                 weighted_output = out * routing_weights[token_idx, expert_idx, None]
-                 next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
-             next_states = next_states.view(batch_size, -1, self.hidden_size)
-         else:
-             hidden_states = hidden_states.repeat(num_experts, 1)
-             hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
-             gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
-             gate, up = gate_up[..., ::2], gate_up[..., 1::2]
-             gate = gate.clamp(min=None, max=self.limit)
-             up = up.clamp(min=-self.limit, max=self.limit)
-             glu = gate * torch.sigmoid(gate * self.alpha)
-             next_states = torch.bmm(((up + 1) * glu), self.down_proj)
-             next_states = next_states + self.down_proj_bias[..., None, :]
-             next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
-             next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
-             next_states = next_states.sum(dim=0)
-         return next_states
-
- class GptOssMoEMLP(nn.Module):
-     def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
-         super().__init__()
-         self.router = GptOssRouter(router_weight, router_bias)
-         self.experts = GptOssExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias)
-
-     def forward(self, hidden_states):
-         router_scores, router_indices = self.router(hidden_states)
-         routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
-         return routed_out, router_scores
-
- # Run the model
- set_seed(GENERAL_SEED)
-
- device = torch.device(DEVICE)
- dtype = to_dtype(DTYPE)
-
- print("\n=== GPT-OSS Implementation ===")
- # Initialize model with loaded weights
- model = GptOssMoEMLP(
-     router_weight.to(device),
-     router_bias.to(device),
-     gate_up_proj.to(device),
-     gate_up_proj_bias.to(device),
-     down_proj.to(device),
-     down_proj_bias.to(device)
- ).to(device=device)
-
- print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
- print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}")
- print(f"Down proj sum: {model.experts.down_proj.sum().item():.6f}")
-
- # Generate the same input as other implementations
- set_seed(INPUT_SEED)
- x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
-
- # Benchmark the model with varied inputs to prevent caching artifacts
- tokens = BATCH_SIZE * SEQ_LEN
- with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="gptoss_results.json", vary_inputs=True) as bench:
-     output, stats = bench(model, x)
-     print(f"\nOutput sum: {output[0].sum().item():.6f}")
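Note: because this cell and the binned cell load the same weight artifacts and reseed the input identically, a hypothetical follow-up cell (not part of this commit; binned_model and gptoss_model are assumed handles to the two modules in one process) could diff the two forward passes directly:

    with torch.no_grad():
        y_binned, _ = binned_model(x)
        y_gptoss, _ = gptoss_model(x)
    # max absolute gap: how far the capacity-limited binned reference drifts from the dense path
    print((y_binned - y_gptoss).abs().max().item())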
megablocks_yamoe/cells/gptoss_training_run.py DELETED
@@ -1,138 +0,0 @@
- # /// script
- # dependencies = [
- #     "torch",
- #     "numpy",
- # ]
- # ///
-
- import torch
- from torch import nn
- from torch.nn import functional as F
- from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
- from config import (
-     NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
-     BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
-     WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
- )
- from pathlib import Path
- import os
-
- # Discover the upstream artifact directory from env
- data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
-
- router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
- router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
- gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
- gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
- down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
- down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
-
- print("Loaded shared weights from artifacts")
- print(f"Router weight sum: {router_weight.sum().item():.6f}")
- print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
- print(f"Down sum: {down_proj.sum().item():.6f}")
-
- class GptOssTrainingRouter(nn.Module):
-     def __init__(self, router_weight, router_bias):
-         super().__init__()
-         self.top_k = TOP_K
-         self.num_experts = NUM_EXPERTS
-         self.hidden_dim = HIDDEN_SIZE
-         self.weight = nn.Parameter(router_weight.clone())
-         self.bias = nn.Parameter(router_bias.clone())
-
-     def forward(self, hidden_states):
-         hidden_states = hidden_states.reshape(-1, self.hidden_dim)
-         router_logits = F.linear(hidden_states, self.weight, self.bias)
-         router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
-         router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
-         router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
-         return router_scores, router_indices
-
- class GptOssTrainingExperts(nn.Module):
-     def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
-         super().__init__()
-         self.num_experts = NUM_EXPERTS
-         self.hidden_size = HIDDEN_SIZE
-         self.expert_dim = self.hidden_size
-         self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
-         self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
-         self.down_proj = nn.Parameter(down_proj.clone())
-         self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
-         self.alpha = 1.702
-         self.limit = 7.0
-
-     def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
-         batch_size = hidden_states.shape[0]
-         hidden_states = hidden_states.reshape(-1, self.hidden_size)
-         num_experts = routing_weights.shape[1]
-
-         # Force training mode path (expert loop instead of batched)
-         next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
-         with torch.no_grad():
-             expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
-             expert_mask = expert_mask.permute(2, 1, 0)
-             expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-
-         for expert_idx in expert_hit[:]:
-             expert_idx = expert_idx[0]
-             with torch.no_grad():
-                 _, token_idx = torch.where(expert_mask[expert_idx])
-             current_state = hidden_states[token_idx]
-             gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
-             gate, up = gate_up[..., ::2], gate_up[..., 1::2]
-             gate = gate.clamp(min=None, max=self.limit)
-             up = up.clamp(min=-self.limit, max=self.limit)
-             glu = gate * torch.sigmoid(gate * self.alpha)
-             gated_output = (up + 1) * glu
-             out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
-             weighted_output = out * routing_weights[token_idx, expert_idx, None]
-             next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
-         next_states = next_states.view(batch_size, -1, self.hidden_size)
-         return next_states
-
- class GptOssTrainingMoEMLP(nn.Module):
-     def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
-         super().__init__()
-         self.router = GptOssTrainingRouter(router_weight, router_bias)
-         self.experts = GptOssTrainingExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias)
-
-     def forward(self, hidden_states):
-         router_scores, router_indices = self.router(hidden_states)
-         routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
-         return routed_out, router_scores
-
- # Run the model
- set_seed(GENERAL_SEED)
-
- device = torch.device(DEVICE)
- dtype = to_dtype(DTYPE)
-
- print("\n=== GPT-OSS Implementation (Training Mode - Expert Loop) ===")
- # Initialize model with loaded weights and force training mode
- model = GptOssTrainingMoEMLP(
-     router_weight.to(device),
-     router_bias.to(device),
-     gate_up_proj.to(device),
-     gate_up_proj_bias.to(device),
-     down_proj.to(device),
-     down_proj_bias.to(device)
- ).to(device=device)
-
- # Set to training mode to force expert loop path
- model.train()
-
- print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
- print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}")
- print(f"Down proj sum: {model.experts.down_proj.sum().item():.6f}")
- print(f"Model training mode: {model.training}")
-
- # Generate the same input as other implementations
- set_seed(INPUT_SEED)
- x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
-
- # Benchmark the model with varied inputs to prevent caching artifacts
- tokens = BATCH_SIZE * SEQ_LEN
- with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="gptoss_training_results.json", vary_inputs=True) as bench:
-     output, stats = bench(model, x)
-     print(f"\nOutput sum: {output[0].sum().item():.6f}")
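Note: the only functional change versus the previous cell is that the expert loop runs unconditionally; in GptOssExperts above the same choice hinges on device and module mode (hidden_states.device.type == "cpu" or self.training), so on GPU the path can also be toggled like this:

    model.eval()   # GPU + eval  -> batched bmm over all experts
    model.train()  # training    -> per-expert gather/scatter loop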
megablocks_yamoe/cells/megablocks_run.py DELETED
@@ -1,103 +0,0 @@
- # /// script
- # dependencies = [
- #     "torch",
- #     "numpy",
- #     "kernels",
- # ]
- # ///
-
- import torch
- from torch import nn
- from torch.nn import functional as F
- from kernels import get_kernel, get_local_kernel
- from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
- from config import (
-     NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
-     BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
-     WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
- )
- from pathlib import Path
- from collections import namedtuple
- import os
-
- # Discover the upstream artifact directory from env
- data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
-
- print(f"Loading weights from: {data_dir}")
-
- router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
- router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
- gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
- gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
- down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
- down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
-
- print("Loaded shared weights from artifacts")
- print(f"Router weight sum: {router_weight.sum().item():.6f}")
- print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
- print(f"Down sum: {down_proj.sum().item():.6f}")
-
- def build_megablocks_model(device: torch.device):
-     # Download optimized kernels from the Hugging Face hub
-     megablocks = get_kernel("kernels-community/megablocks", revision="v0.0.2")
-     model = megablocks.layers.MegaBlocksMoeMLP()
-
-     # Create attribute container for expert weights
-     model.experts = namedtuple(
-         "Experts", ["gate_up_proj", "gate_up_proj_bias", "down_proj", "down_proj_bias", "hidden_size"]
-     )
-
-     # Use loaded router weights for consistency
-     model.router = torch.nn.Linear(HIDDEN_SIZE, NUM_EXPERTS, device=device)
-     with torch.no_grad():
-         model.router.weight.copy_(router_weight)
-         model.router.bias.copy_(router_bias)
-
-     # Attach loaded expert weights to the experts container
-     e = model.experts
-     e.alpha = 1.702
-     e.capacity_factor = 32
-     e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device))
-     e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device))
-     e.down_proj = torch.nn.Parameter(down_proj.clone().to(device))
-     e.down_proj_bias = torch.nn.Parameter(down_proj_bias.clone().to(device))
-     e.hidden_size = HIDDEN_SIZE
-
-     # Log weight statistics for comparison
-     print(f"[MegaBlocks] Router weight sum: {model.router.weight.sum().item():.6f}")
-     print(f"[MegaBlocks] Gate/up projection shape: {tuple(e.gate_up_proj.shape)}, sum: {e.gate_up_proj.sum().item():.6f}")
-     print(f"[MegaBlocks] Down projection shape: {tuple(e.down_proj.shape)}, sum: {e.down_proj.sum().item():.6f}")
-
-     return model
-
- # Create a wrapper to match the interface of other implementations
- class MegaBlocksMoEWrapper(nn.Module):
-     def __init__(self, megablocks_model):
-         super().__init__()
-         self.model = megablocks_model
-
-     def forward(self, hidden_states):
-         # MegaBlocks expects input in the format (batch, seq_len, hidden_dim)
-         output, dummy_routing_weights = self.model(hidden_states)
-         return output, dummy_routing_weights
-
- # Run the model
- set_seed(GENERAL_SEED)
-
- device = torch.device(DEVICE)
- dtype = to_dtype(DTYPE)
-
- print("\n=== MegaBlocks Implementation ===")
- # Build MegaBlocks model with loaded weights
- megablocks_model = build_megablocks_model(device)
- model = MegaBlocksMoEWrapper(megablocks_model).to(device=device)
-
- # Generate the same input as other implementations
- set_seed(INPUT_SEED)
- x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
-
- # Benchmark the model with varied inputs to prevent caching artifacts
- tokens = BATCH_SIZE * SEQ_LEN
- with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="megablocks_results.json", vary_inputs=True) as bench:
-     output, stats = bench(model, x)
-     print(f"\nOutput sum: {output[0].sum().item():.6f}")
megablocks_yamoe/cells/nv.py DELETED
@@ -1,3 +0,0 @@
- import subprocess
-
- print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
megablocks_yamoe/cells/save_data.py DELETED
@@ -1,42 +0,0 @@
- # /// script
- # dependencies = [
- #     "torch",
- #     "numpy",
- # ]
- # ///
-
- """
- Generate deterministic shared weights once and save as artifacts so
- both implementations load identical parameters.
- """
- import torch
- from config import NUM_EXPERTS, HIDDEN_SIZE, WEIGHT_SEED, EXPERT_SEED
-
- def save_shared_weights():
-     # Router: Kaiming uniform as used by both, bias zeros
-     torch.manual_seed(WEIGHT_SEED)
-     router_weight = torch.empty(NUM_EXPERTS, HIDDEN_SIZE)
-     torch.nn.init.kaiming_uniform_(router_weight)
-     router_bias = torch.zeros(NUM_EXPERTS)
-
-     # Experts: normal(0, 0.02), biases zeros
-     torch.manual_seed(EXPERT_SEED)
-     gate_up_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, 2 * HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
-     gate_up_proj_bias = torch.zeros(NUM_EXPERTS, 2 * HIDDEN_SIZE)
-     down_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
-     down_proj_bias = torch.zeros(NUM_EXPERTS, HIDDEN_SIZE)
-
-     # Save artifacts
-     torch.save(router_weight, 'router_weight.pt')
-     torch.save(router_bias, 'router_bias.pt')
-     torch.save(gate_up_proj, 'gate_up_proj.pt')
-     torch.save(gate_up_proj_bias, 'gate_up_proj_bias.pt')
-     torch.save(down_proj, 'down_proj.pt')
-     torch.save(down_proj_bias, 'down_proj_bias.pt')
-
-     print("Saved shared weights to artifacts")
-     print(f"Router weight sum: {router_weight.sum().item():.6f}")
-     print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
-     print(f"Down sum: {down_proj.sum().item():.6f}")
-
- save_shared_weights()
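Note: the run cells above locate these .pt artifacts through the UVNOTE_INPUT_SAVE_DATA environment variable; a minimal loader sketch mirroring them:

    import os
    from pathlib import Path
    import torch

    # Falls back to the current directory when the env var is unset, as in the run cells.
    data_dir = Path(os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.'))
    router_weight = torch.load(data_dir / 'router_weight.pt')
    gate_up_proj = torch.load(data_dir / 'gate_up_proj.pt')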