diff --git a/.venv/index.html b/.venv/index.html new file mode 100644 index 0000000000000000000000000000000000000000..f3cb6ee07e1711f67b6447c984a2998e866c84b7 --- /dev/null +++ b/.venv/index.html @@ -0,0 +1,24 @@ + + + + + Directory Index + + + +

Index of /.venv

+ + + \ No newline at end of file diff --git a/.venv/lib/index.html b/.venv/lib/index.html new file mode 100644 index 0000000000000000000000000000000000000000..ccdf4339e7d9656235b67279909e397bd4c1dd5e --- /dev/null +++ b/.venv/lib/index.html @@ -0,0 +1,24 @@ + + + + + Directory Index + + + +

Index of /.venv/lib

+ + + \ No newline at end of file diff --git a/.venv/lib/python3.11/index.html b/.venv/lib/python3.11/index.html new file mode 100644 index 0000000000000000000000000000000000000000..3af34a216f69970720906537015aa45ee791045c --- /dev/null +++ b/.venv/lib/python3.11/index.html @@ -0,0 +1,24 @@ + + + + + Directory Index + + + +

Index of /.venv/lib/python3.11

+ + + \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/flask/index.html b/.venv/lib/python3.11/site-packages/flask/index.html new file mode 100644 index 0000000000000000000000000000000000000000..b7ce8a2a5d485d91433d50deb142d19e909e1adb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/flask/index.html @@ -0,0 +1,24 @@ + + + + + Directory Index + + + +

Index of /.venv/lib/python3.11/site-packages/flask

+ + + \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/flask/sansio/index.html b/.venv/lib/python3.11/site-packages/flask/sansio/index.html new file mode 100644 index 0000000000000000000000000000000000000000..94bc5e014b7c53407b9c7c5f1848d268c3f3028f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/flask/sansio/index.html @@ -0,0 +1,24 @@ + + + + + Directory Index + + + +

Index of /.venv/lib/python3.11/site-packages/flask/sansio

+ + + \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/index.html b/.venv/lib/python3.11/site-packages/index.html new file mode 100644 index 0000000000000000000000000000000000000000..d4c40d702642ed2e26b172d36dac4831f914c234 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/index.html @@ -0,0 +1,26 @@ + + + + + Directory Index + + + +

Index of /.venv/lib/python3.11/site-packages

+ + + \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html b/.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html new file mode 100644 index 0000000000000000000000000000000000000000..7f69784006ffb575de6d1e146a4f7dbeca91a70d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html @@ -0,0 +1,24 @@ + + + + + Directory Index + + + +

Index of /.venv/lib/python3.11/site-packages/markdown-3.9.dist-info

+ + + \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html b/.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html new file mode 100644 index 0000000000000000000000000000000000000000..ea09a5b2b7d7c6a483abc292a6092f3adb6d0391 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html @@ -0,0 +1,24 @@ + + + + + Directory Index + + + +

Index of /.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses

+ + + \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/werkzeug/debug/index.html b/.venv/lib/python3.11/site-packages/werkzeug/debug/index.html new file mode 100644 index 0000000000000000000000000000000000000000..ac697e964e418006a7e0a563c4e536e12994224a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/werkzeug/debug/index.html @@ -0,0 +1,24 @@ + + + + + Directory Index + + + +

Index of /.venv/lib/python3.11/site-packages/werkzeug/debug

+ + + \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html b/.venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html new file mode 100644 index 0000000000000000000000000000000000000000..818ff7551c3a328860e310bc5e5ea4daf0bb9ae8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html @@ -0,0 +1,24 @@ + + + + + Directory Index + + + +

Index of /.venv/lib/python3.11/site-packages/werkzeug/debug/shared

+ + + \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/werkzeug/index.html b/.venv/lib/python3.11/site-packages/werkzeug/index.html new file mode 100644 index 0000000000000000000000000000000000000000..0deece299e75d2120c0b4c58b8f2d20ad8784c80 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/werkzeug/index.html @@ -0,0 +1,24 @@ + + + + + Directory Index + + + +

Index of /.venv/lib/python3.11/site-packages/werkzeug

+ + + \ No newline at end of file diff --git a/index.html b/index.html index 23c5a059d2fb98596e33a750a34dfa0b2253f0e8..8011fbdbc92ac521ef8d42fd61efa43c6163ee16 100644 --- a/index.html +++ b/index.html @@ -1,4593 +1,24 @@ - + - - - uvnote Integration Test Report - - - + + Directory Index + - - -
-
-
light
-
reset
- -
-
- -
-
Generated on:
-
- Linux x86_64 | Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36 -
-
- -
-
-
- -▼ code -▼ output - ▶ uv-logs - | -Cell: nv | 0.53s - | - -Raw -
-
-
-
-1 -2 -3 -
-
-
import subprocess
-
-print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
-
- -
-
-
-
-
-
Tue Sep 23 19:46:07 2025 -+-----------------------------------------------------------------------------------------+ -| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 | -|-----------------------------------------+------------------------+----------------------+ -| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | -| | | MIG M. | -|=========================================+========================+======================| -| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 | -| 0% 42C P0 71W / 300W | 0MiB / 23028MiB | 0% Default | -| | | N/A | -+-----------------------------------------+------------------------+----------------------+ -| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 | -| 0% 43C P0 44W / 300W | 0MiB / 23028MiB | 0% Default | -| | | N/A | -+-----------------------------------------+------------------------+----------------------+ -| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 | -| 0% 42C P0 46W / 300W | 0MiB / 23028MiB | 0% Default | -| | | N/A | -+-----------------------------------------+------------------------+----------------------+ -| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 | -| 0% 41C P0 43W / 300W | 0MiB / 23028MiB | 0% Default | -| | | N/A | -+-----------------------------------------+------------------------+----------------------+ - -+-----------------------------------------------------------------------------------------+ -| Processes: | -| GPU GI CI PID Type Process name GPU Memory | -| ID ID Usage | -|=========================================================================================| -| No running processes found | -+-----------------------------------------------------------------------------------------+ - -
-
-
- -
-
- -▼ code -▼ output - ▶ uv-logs - | -Cell: setup | 133.12s - | - -Raw -
-
-
-
-1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 -21 -22 -23 -24 -25 -26 -27 -28 -29 -30 -31 -32 -33 -34 -35 -36 -37 -38 -39 -40 -41 -42 -43 -44 -45 -46 -47 -48 -49 -50 -51 -52 -53 -54 -55 -56 -57 -58 -59 -60 -61 -62 -63 -64 -65 -66 -67 -68 -69 -70 -71 -72 -73 -74 -75 -76 -77 -78 -79 -80 -81 -82 -83 -84 -85 -86 -87 -88 -89 -90 -91 -92 -93 -94 -95 -96 -97 -98 -99 -100 -101 -102 -103 -104 -105 -106 -107 -108 -109 -110 -111 -112 -113 -114 -115 -116 -
-
-
# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-#     "accelerate>=1.10.1",
-#     "torch>=2.7.0",
-#     "kernels==0.10.0",
-#     "transformers@https://github.com/huggingface/transformers.git",
-#     "ipdb>=0.13.13",
-#     "matplotlib>=3.7.2",
-#     "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-import logging
-
-# set to debug logging
-logging.basicConfig(level=logging.INFO)
-
-def reset_peak_memory_stats():
-    """Clear CUDA cache and reset memory allocation counters."""
-    torch.cuda.empty_cache()
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-
-from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
-
-from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
-
-replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")  # direct, type-safe
-replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
-custom_mapping = {
-    "Yamoe": {
-        "cuda": {
-            Mode.INFERENCE: LayerRepository(
-                repo_id="drbh/yamoe",
-                layer_name="Yamoe",
-                revision="v0.3.0",
-            )
-        }
-    }
-}
-register_kernel_mapping(custom_mapping)
-
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-).eval()
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-max_tokens = 512
-
-with torch.inference_mode():
-    start_time = time.perf_counter()
-    generated = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=False,
-        temperature=None,
-    )
-    end_time = time.perf_counter()
-
-print(tokenizer.decode(generated[0], skip_special_tokens=False))
-print(f"Generation took {end_time - start_time:.2f} seconds")
-
- -
-
-
-
-
-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI. -Knowledge cutoff: 2024-06 -Current date: 2025-09-23 - -Reasoning: low - -# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions - -What is Tensor Parallelism? - -<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices, so each device holds a slice of the matrix. During forward/backward passes, each device computes partial results and then they are aggregated. It's used to scale up models beyond single device memory. Also mention pipeline parallelism, data parallelism. Provide details: e.g., for a linear layer weight matrix W of shape (out_features, in_features), we can split along out_features dimension across devices. Each device computes its part of the output. Then gather results. In backward, gradients are computed locally and then aggregated. Provide example: GPT-3 training uses tensor parallelism. Also mention frameworks: Megatron-LM, DeepSpeed, etc. Provide pros/cons. Provide code snippet maybe. Also mention that it's different from data parallelism. Provide explanation of how it works in practice. Provide mention of communication overhead. Provide mention of "tensor model parallelism" vs "tensor parallelism" synonyms. Provide mention of "tensor parallelism" in context of huggingface accelerate. Provide mention of "tensor parallelism" in context of DeepSpeed ZeRO stage 3. Provide mention of "tensor parallelism" in context of Megatron-LM. Provide mention of "tensor parallelism" in context of GPT-NeoX. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-Offload" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-2" maybe. 
Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor parallelism" in context of "DeepSpeed's ZeRO-3" maybe. Provide mention of "tensor -Generation took 51.92 seconds -
-
-
▶ UV Install Logs
- -
-
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s] -Fetching 3 files: 33%|███▎ | 1/3 [00:06<00:13, 6.78s/it] -Fetching 3 files: 67%|██████▋ | 2/3 [00:08<00:03, 3.65s/it] -Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00, 2.75s/it] -You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False - -Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s] -Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.34s/it] -Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.25s/it] -Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it] -Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it] -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` - -Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s] -Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:01, 3.89it/s] -Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 17.67it/s] -/tmp/uvnote-run-hvgovjfd/home/.cache/uv/environments-v2/setup-443c07e337d3be43/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning: -No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation. 
- warnings.warn( -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: 
v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -/tmp/uvnote-run-hvgovjfd/home/.cache/uv/environments-v2/setup-443c07e337d3be43/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning: -No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation. - warnings.warn( -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` 
(revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe` -INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
-
-
- -

Reference kernel

-
-
- -▼ code -▼ output - ▶ uv-logs - | -Cell: setup2 | 139.97s - | - -Raw -
-
-
-
-1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 -21 -22 -23 -24 -25 -26 -27 -28 -29 -30 -31 -32 -33 -34 -35 -36 -37 -38 -39 -40 -41 -42 -43 -44 -45 -46 -47 -48 -49 -50 -51 -52 -53 -54 -55 -56 -57 -58 -59 -60 -61 -62 -63 -64 -65 -66 -67 -68 -69 -70 -71 -72 -73 -74 -75 -76 -77 -78 -79 -80 -81 -82 -83 -84 -85 -86 -87 -88 -89 -90 -91 -92 -93 -94 -95 -96 -97 -98 -99 -100 -101 -102 -103 -104 -105 -106 -107 -108 -109 -110 -111 -112 -113 -114 -115 -
-
-
# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-#     "accelerate>=1.10.1",
-#     "torch>=2.7.0",
-#     "kernels==0.10.0",
-#     "transformers@https://github.com/huggingface/transformers.git",
-#     "ipdb>=0.13.13",
-#     "matplotlib>=3.7.2",
-#     "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-import logging
-
-# set to debug logging
-logging.basicConfig(level=logging.INFO)
-
-def reset_peak_memory_stats():
-    """Clear CUDA cache and reset memory allocation counters."""
-    torch.cuda.empty_cache()
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-
-from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
-
-from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
-
-replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
-custom_mapping = {
-    "Yamoe": {
-        "cuda": {
-            Mode.INFERENCE: LayerRepository(
-                repo_id="drbh/yamoe",
-                layer_name="Yamoe",
-                revision="v0.3.0",
-            )
-        }
-    }
-}
-register_kernel_mapping(custom_mapping)
-
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-).eval()
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-max_tokens = 512
-
-with torch.inference_mode():
-    start_time = time.perf_counter()
-    generated = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=False,
-        temperature=None,
-    )
-    end_time = time.perf_counter()
-
-print(tokenizer.decode(generated[0], skip_special_tokens=False))
-print(f"Generation took {end_time - start_time:.2f} seconds")
-
- -
-
-
-
-
-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI. -Knowledge cutoff: 2024-06 -Current date: 2025-09-23 - -Reasoning: low - -# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions - -What is Tensor Parallelism? - -<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it's used in large models like GPT-3, Megatron-LM, DeepSpeed, etc. Provide explanation of how it reduces memory usage, increases throughput. Provide mention of "tensor model parallelism" vs "tensor parallelism" synonyms. Provide mention of "tensor parallelism" in context of huggingface accelerate, DeepSpeed, Megatron. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in the "DeepSpeed ZeRO-Offload" or "ZeRO-3" etc. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed" and "Megatron-LM" and "DeepSpeed's ZeRO" and "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. 
Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. Provide mention of "tensor parallelism" in the context of "tensor parallelism" in "DeepSpeed's ZeRO-3" and "DeepSpeed's ZeRO-2" etc. Provide mention of "tensor parallelism" in the -Generation took 57.98 seconds -
-
-
▶ UV Install Logs
- -
-
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s] -Fetching 3 files: 33%|███▎ | 1/3 [00:06<00:12, 6.38s/it] -Fetching 3 files: 67%|██████▋ | 2/3 [00:08<00:03, 3.61s/it] -Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00, 2.69s/it] -You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False - -Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s] -Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.34s/it] -Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.25s/it] -Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it] -Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it] -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` - -Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s] -Fetching 66 files: 2%|▏ | 1/66 [00:00<00:10, 6.10it/s] -Fetching 66 files: 14%|█▎ | 9/66 [00:00<00:01, 30.47it/s] -Fetching 66 files: 24%|██▍ | 16/66 [00:00<00:01, 37.56it/s] -Fetching 66 files: 30%|███ | 20/66 [00:01<00:03, 14.24it/s] -Fetching 66 files: 67%|██████▋ | 44/66 [00:01<00:00, 37.14it/s] -Fetching 66 files: 91%|█████████ | 60/66 [00:01<00:00, 49.97it/s] -Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 36.02it/s] -/tmp/uvnote-run-nw4e52ut/home/.cache/uv/environments-v2/setup2-69adf76231e4ab4f/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning: -No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation. 
- warnings.warn( -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from 
repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -/tmp/uvnote-run-nw4e52ut/home/.cache/uv/environments-v2/setup2-69adf76231e4ab4f/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning: -No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation. 
- warnings.warn( -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from 
repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` -INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-
-
-
- +

Index of /

+ \ No newline at end of file diff --git a/moe_benchmarks/index.html b/moe_benchmarks/index.html new file mode 100644 index 0000000000000000000000000000000000000000..459357c72e0afcc10921ae5c13c251e999b35f06 --- /dev/null +++ b/moe_benchmarks/index.html @@ -0,0 +1,25 @@ + + + + + Directory Index + + + +

Index of /moe_benchmarks

+ + + \ No newline at end of file diff --git a/moe_benchmarks/megablocks/cells/forward_only.py b/moe_benchmarks/megablocks/cells/forward_only.py new file mode 100644 index 0000000000000000000000000000000000000000..c72358d0eef5e1f993aef1e76dfb0f26761c4881 --- /dev/null +++ b/moe_benchmarks/megablocks/cells/forward_only.py @@ -0,0 +1,101 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "accelerate>=1.10.1", +# "torch>=2.7.0", +# "kernels==0.10.0", +# "transformers@https://github.com/huggingface/transformers.git", +# "ipdb>=0.13.13", +# "matplotlib>=3.7.2", +# "numpy>=1.24.3", +# ] +# /// + +import torch +from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config +import time +import torch.nn as nn +from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub +import sys +import torch.profiler +import gc +import logging +from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm + + +replace_kernel_forward_from_hub(GptOssRMSNorm, None) + +# set to debug logging +logging.basicConfig(level=logging.INFO) + +def reset_peak_memory_stats(): + """Clear CUDA cache and reset memory allocation counters.""" + torch.cuda.empty_cache() + if torch.cuda.is_available(): + torch.cuda.reset_peak_memory_stats() + gc.collect() + +def get_memory_stats(): + """Get current and peak CUDA memory usage.""" + if not torch.cuda.is_available(): + return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} + return { + "allocated_gb": torch.cuda.memory_allocated() / 1e9, + "peak_gb": torch.cuda.max_memory_allocated() / 1e9, + "reserved_gb": torch.cuda.memory_reserved() / 1e9, + } + +def override_kernel_layer_name(cls_name: str, value) -> bool: + """Helper to dynamically override the kernel_layer_name in a model class.""" + for mod in sys.modules.values(): + if mod is None: + continue + obj = getattr(mod, cls_name, None) + if isinstance(obj, type) and issubclass(obj, nn.Module): + setattr(obj, 
"kernel_layer_name", value) + print(f"Overrode {cls_name}.kernel_layer_name to {value}") + return True + return False + + +# Init the model the normal way +model_id = "openai/gpt-oss-20b" +tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) +quantization_config = Mxfp4Config(dequantize=True) + + + +model = GptOssForCausalLM.from_pretrained( + model_id, + dtype="bfloat16", + device_map="auto", + use_kernels=True, + quantization_config=quantization_config, +).eval() + +messages = [ + {"role": "system", "content": "What is Tensor Parallelism?"}, +] + +inputs = tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + return_tensors="pt", + return_dict=True, + reasoning_effort="low", +).to("cuda") + +max_tokens = 256 + +with torch.inference_mode(): + start_time = time.perf_counter() + generated = model.generate( + **inputs, + max_new_tokens=max_tokens, + do_sample=False, + temperature=None, + ) + end_time = time.perf_counter() + +print(tokenizer.decode(generated[0], skip_special_tokens=False)) +print(f"Generation took {end_time - start_time:.2f} seconds") diff --git a/moe_benchmarks/megablocks/index.html b/moe_benchmarks/megablocks/index.html new file mode 100644 index 0000000000000000000000000000000000000000..5058977b1559a20266c8982c064cfc3de010bb13 --- /dev/null +++ b/moe_benchmarks/megablocks/index.html @@ -0,0 +1,24 @@ + + + + + Directory Index + + + +

Index of /moe_benchmarks/megablocks

+ + + \ No newline at end of file diff --git a/moe_benchmarks/megablocks/megablocks_only.html b/moe_benchmarks/megablocks/megablocks_only.html new file mode 100644 index 0000000000000000000000000000000000000000..8606aa9eddbd37a08f18ccfdeb910a8caa1cf0b5 --- /dev/null +++ b/moe_benchmarks/megablocks/megablocks_only.html @@ -0,0 +1,4084 @@ + + + + + + Megablocks Only Test + + + + + + + +
+
+
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-6.11.0-1018-azure-x86_64-with-glibc2.39 +
+
+ +
+

No Kernels

+

First, we run the model without any custom kernels to get a reference point.

+

Forward

+

Forward and Backward

+

Next, we'll attempt to run a forward and backward pass without any custom kernels. This will likely run out of memory since the default implementation is not optimized for memory usage.

+

Kernels

+

Next, we run the model with Megablocks kernels enabled.

+

Forward

+

First, we run a forward pass with Megablocks kernels.

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: forward_only | 118.48s | FAILED + | + +Raw +
+
+
+
+1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +
+
+
# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "accelerate>=1.10.1",
+#     "torch>=2.7.0",
+#     "kernels==0.10.0",
+#     "transformers@https://github.com/huggingface/transformers.git",
+#     "ipdb>=0.13.13",
+#     "matplotlib>=3.7.2",
+#     "numpy>=1.24.3",
+# ]
+# ///
+
+import torch
+from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+import time
+import torch.nn as nn
+from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
+import sys
+import torch.profiler
+import gc
+import logging
+from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
+
+
+replace_kernel_forward_from_hub(GptOssRMSNorm, None)
+
+# enable INFO-level logging so kernel-mapping messages are shown
+logging.basicConfig(level=logging.INFO)
+
+def reset_peak_memory_stats():
+    """Clear CUDA cache and reset memory allocation counters."""
+    torch.cuda.empty_cache()
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+    gc.collect()
+
+def get_memory_stats():
+    """Get current and peak CUDA memory usage."""
+    if not torch.cuda.is_available():
+        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
+    return {
+        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
+        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
+        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
+    }
+
+def override_kernel_layer_name(cls_name: str, value) -> bool:
+    """Helper to dynamically override the kernel_layer_name in a model class."""
+    for mod in sys.modules.values():
+        if mod is None:
+            continue
+        obj = getattr(mod, cls_name, None)
+        if isinstance(obj, type) and issubclass(obj, nn.Module):
+            setattr(obj, "kernel_layer_name", value)
+            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
+            return True
+    return False
+
+
+# Init the model the normal way
+model_id = "openai/gpt-oss-20b"
+tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
+quantization_config = Mxfp4Config(dequantize=True)
+
+
+
+model = GptOssForCausalLM.from_pretrained(
+    model_id,
+    dtype="bfloat16",
+    device_map="auto",
+    use_kernels=True,
+    quantization_config=quantization_config,
+).eval()
+
+messages = [
+    {"role": "system", "content": "What is Tensor Parallelism?"},
+]
+
+inputs = tokenizer.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    return_tensors="pt",
+    return_dict=True,
+    reasoning_effort="low",
+).to("cuda")
+
+max_tokens = 256
+
+with torch.inference_mode():
+    start_time = time.perf_counter()
+    generated = model.generate(
+        **inputs,
+        max_new_tokens=max_tokens,
+        do_sample=False,
+        temperature=None,
+    )
+    end_time = time.perf_counter()
+
+print(tokenizer.decode(generated[0], skip_special_tokens=False))
+print(f"Generation took {end_time - start_time:.2f} seconds")
+
+ +
+
+
+
+
+
+
▶ UV Install Logs
+ +
+
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s] +Fetching 3 files: 0%| | 0/3 [00:50<?, ?it/s] +Traceback (most recent call last): + File "/home/runner/work/kernels-uvnotes/kernels-uvnotes/moe_benchmarks/megablocks/.uvnote/cells/forward_only.py", line 68, in <module> + model = GptOssForCausalLM.from_pretrained( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/transformers/modeling_utils.py", line 285, in _wrapper + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/transformers/modeling_utils.py", line 4904, in from_pretrained + checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/transformers/modeling_utils.py", line 1239, in _get_resolved_checkpoint_files + checkpoint_files, sharded_metadata = get_checkpoint_shard_files( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/transformers/utils/hub.py", line 1116, in get_checkpoint_shard_files + cached_filenames = cached_files( + ^^^^^^^^^^^^^ + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/transformers/utils/hub.py", line 564, in cached_files + raise e + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/transformers/utils/hub.py", line 491, in cached_files + snapshot_download( + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, 
**kwargs) + ^^^^^^^^^^^^^^^^^^^ + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/huggingface_hub/_snapshot_download.py", line 332, in snapshot_download + thread_map( + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/tqdm/contrib/concurrent.py", line 69, in thread_map + return _executor_map(ThreadPoolExecutor, fn, *iterables, **tqdm_kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/tqdm/contrib/concurrent.py", line 51, in _executor_map + return list(tqdm_class(ex.map(fn, *iterables, chunksize=chunksize), **kwargs)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/tqdm/std.py", line 1181, in __iter__ + for obj in iterable: + File "/usr/lib/python3.12/concurrent/futures/_base.py", line 619, in result_iterator + yield _result_or_cancel(fs.pop()) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/concurrent/futures/_base.py", line 317, in _result_or_cancel + return fut.result(timeout) + ^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/concurrent/futures/_base.py", line 456, in result + return self.__get_result() + ^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result + raise self._exception + File "/usr/lib/python3.12/concurrent/futures/thread.py", line 58, in run + result = self.fn(*self.args, **self.kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/huggingface_hub/_snapshot_download.py", line 306, in _inner_hf_hub_download + return hf_hub_download( + ^^^^^^^^^^^^^^^^ 
+ File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^ + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1010, in hf_hub_download + return _hf_hub_download_to_cache_dir( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1171, in _hf_hub_download_to_cache_dir + _download_to_tmp_and_move( + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 1723, in _download_to_tmp_and_move + xet_get( + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/forward-only-b65004b2d0cb4ca8/lib/python3.12/site-packages/huggingface_hub/file_download.py", line 629, in xet_get + download_files( +RuntimeError: Data processing error: CAS service error : IO Error: No space left on device (os error 28)
+
+
+ +

Forward and Backward

+

Next, we run a forward and backward pass with Megablocks kernels enabled. This should be more memory efficient and allow us to complete the backward pass without running out of memory.

+
+ + + \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5df252f2e6edaec8717d49f0fe7d72b278c362e Binary files /dev/null and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc differ diff --git a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be5ea0a48cedabb22eac9d1ef3f5b0422d87c5c2 Binary files /dev/null and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc differ diff --git a/moe_benchmarks/megablocks_yamoe/cells/bench_utils.py b/moe_benchmarks/megablocks_yamoe/cells/bench_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6bb3706118149df02c1f7ebaaa6fbba84e71cd5e --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/cells/bench_utils.py @@ -0,0 +1,241 @@ +# /// script +# dependencies = [ +# "torch", +# "numpy", +# ] +# /// + +"""Reusable benchmarking utilities for performance testing.""" +import time +import numpy as np +from contextlib import contextmanager +from typing import Callable, Dict, Tuple, Any, Optional +import torch + +def to_dtype(dtype_str: str): + """Convert string to torch dtype.""" + if dtype_str == "float16": + return torch.float16 + if dtype_str == "bfloat16": + return torch.bfloat16 + return torch.float32 + +def _sync(device: str): + """Synchronize device if CUDA.""" + if device == "cuda": + torch.cuda.synchronize() + +def _compute_stats(times_s, tokens: Optional[int] = None) -> Dict[str, float]: + """Compute comprehensive latency and throughput statistics.""" + lat_ms = np.array([t * 1000.0 for t in times_s]) + lat_ms_sorted = np.sort(lat_ms) + n = len(lat_ms) + + stats = { + "avg_ms": 
np.mean(lat_ms), + "min_ms": np.min(lat_ms), + "max_ms": np.max(lat_ms), + "std_ms": np.std(lat_ms), + "p50_ms": np.percentile(lat_ms, 50), + "p95_ms": np.percentile(lat_ms, 95), + "p99_ms": np.percentile(lat_ms, 99), + "num_iters": n + } + + if tokens is not None and n > 0: + avg_s = np.mean(times_s) + stats["tokens_per_s"] = tokens / avg_s if avg_s > 0 else float("inf") + stats["throughput_variance"] = np.std([tokens / t for t in times_s if t > 0]) + + return stats + +def _format_timing_stats(stats: Dict[str, float], tokens: Optional[int] = None) -> str: + """Format timing statistics for display.""" + lines = [ + "\n━━━━━━━━━━━━━━━━━━━━ Benchmark Results ━━━━━━━━━━━━━━━━━━━━", + f"Iterations: {stats.get('num_iters', 0)}", + "\nLatency Statistics:", + f" Average: {stats['avg_ms']:.3f} ms", + f" Min: {stats['min_ms']:.3f} ms", + f" Max: {stats['max_ms']:.3f} ms", + f" Std Dev: {stats['std_ms']:.3f} ms", + "\nPercentiles:", + f" P50 (median): {stats['p50_ms']:.3f} ms", + f" P95: {stats['p95_ms']:.3f} ms", + f" P99: {stats['p99_ms']:.3f} ms", + ] + + if tokens is not None and 'tokens_per_s' in stats: + lines.extend([ + "\nThroughput:", + f" Tokens/sec: {stats['tokens_per_s']:.1f}", + f" Std Dev: {stats.get('throughput_variance', 0):.1f}", + ]) + + lines.append("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + return "\n".join(lines) + +def _bench_engine( + call: Callable[[], Any], *, warmup: int, iters: int, device: str, dtype, input_gen: Callable[[], Any] = None +) -> Tuple[Any, list]: + """Core benchmarking engine with warmup and timing.""" + use_autocast = device == "cuda" and dtype in (torch.float16, torch.bfloat16) + + # Warmup phase + print(f"\nWarming up ({warmup} iterations)...") + with torch.inference_mode(): + for _ in range(max(0, warmup)): + if use_autocast: + with torch.autocast(device_type="cuda", dtype=dtype): + if input_gen is not None: + _ = call(input_gen()) + else: + _ = call() + else: + if input_gen is not None: + _ = call(input_gen()) 
+ else: + _ = call() + _sync(device) + + # Measurement phase + print(f"Benchmarking ({iters} iterations)...") + times_s = [] + last = None + with torch.inference_mode(): + for i in range(max(1, iters)): + start = time.perf_counter() + if use_autocast: + with torch.autocast(device_type="cuda", dtype=dtype): + if input_gen is not None: + last = call(input_gen()) + else: + last = call() + else: + if input_gen is not None: + last = call(input_gen()) + else: + last = call() + _sync(device) + end = time.perf_counter() + times_s.append(end - start) + + # Progress indicator every 20% of iterations + if i > 0 and i % max(1, iters // 5) == 0: + pct = (i / iters) * 100 + avg_so_far = np.mean(times_s[:i]) * 1000 + print(f" Progress: {pct:.0f}% complete (avg: {avg_so_far:.3f} ms)") + + return last, times_s + +def tensor_stats(t: torch.Tensor) -> str: + """Generate comprehensive stats string for a tensor.""" + return (f"shape={tuple(t.shape)}, " + f"dtype={t.dtype}, " + f"device={t.device}, " + f"range=[{t.min().item():.6f}, {t.max().item():.6f}], " + f"mean={t.mean().item():.6f}, " + f"std={t.std().item():.6f}, " + f"norm={t.norm().item():.6f}") + +@contextmanager +def bench_context( + *, warmup: int = 25, iters: int = 100, device: str = "cuda", dtype=torch.float32, tokens: Optional[int] = None, verbose: bool = True, save_json: Optional[str] = None, vary_inputs: bool = True +): + """Context that yields a runner: runner(fn, *args, **kwargs) -> (result, stats). + + If vary_inputs=True, the first argument should be a base tensor that will be varied each iteration + by adding a small deterministic increment to prevent caching artifacts. 
+ """ + + def runner(fn: Callable[..., Any], *args, **kwargs) -> Tuple[Any, Dict[str, float]]: + # Log configuration + if verbose: + print(f"\n┌─ Benchmark Configuration ─────────────────────────────┐") + # print(f"│ Device: {device:<15} Dtype: {dtype} │") + print(f"│ Warmup: {warmup:<15} Iters: {iters} │") + if tokens: + print(f"│ Tokens: {tokens} │") + if vary_inputs: + print(f"│ Input Variation: Enabled (prevents caching artifacts) │") + print(f"└────────────────────────────────────────────────────────┘") + + # Set up input generation + input_gen = None + if vary_inputs and args and isinstance(args[0], torch.Tensor): + base_input = args[0].clone() + iteration_counter = [0] # Use list for mutable closure + + def generate_varied_input(): + """Generate input tensor varied by iteration to prevent caching.""" + # Add small deterministic increment: 0.001 * iteration_number + varied_input = base_input + (iteration_counter[0] * 0.001) + iteration_counter[0] += 1 + return varied_input + + input_gen = generate_varied_input + call = lambda x: fn(x, *args[1:], **kwargs) + + # Log base input stats + if verbose: + print(f"\nBase Input: {tensor_stats(base_input)}") + print(f"Input Variation: +{0.001:.3f} * iteration (deterministic)") + else: + # Legacy mode - static inputs + call = lambda: fn(*args, **kwargs) + if verbose and args and isinstance(args[0], torch.Tensor): + print(f"\nInput: {tensor_stats(args[0])}") + + result, times_s = _bench_engine(call, warmup=warmup, iters=iters, device=device, dtype=dtype, input_gen=input_gen) + + # Log output if it's a tensor or tuple with tensors + if verbose: + print("\nOutput tensors:") + if isinstance(result, torch.Tensor): + print(f" Primary: {tensor_stats(result)}") + elif isinstance(result, tuple) and len(result) > 0 and isinstance(result[0], torch.Tensor): + print(f" Primary: {tensor_stats(result[0])}") + if len(result) > 1: + if isinstance(result[1], torch.Tensor): + print(f" Auxiliary: {tensor_stats(result[1])}") + else: + 
print(f" Auxiliary: {type(result[1]).__name__}") + + # Compute and display statistics + stats = _compute_stats(times_s, tokens=tokens) + if verbose: + print(_format_timing_stats(stats, tokens)) + + # Save to JSON if requested + if save_json: + import json + json_data = { + "implementation": save_json.replace(".json", ""), + "config": { + "warmup": warmup, + "iters": iters, + "device": str(device), # Convert device to string + "dtype": str(dtype), + "tokens": tokens, + "vary_inputs": vary_inputs + }, + "stats": stats, + "output_sum": float(result[0].sum().item()) if isinstance(result, tuple) and len(result) > 0 else float(result.sum().item()) if isinstance(result, torch.Tensor) else None + } + with open(save_json, 'w') as f: + json.dump(json_data, f, indent=2) + if verbose: + print(f"\nSaved benchmark results to {save_json}") + + return result, stats + + yield runner + +def set_seed(seed: int): + """Set seeds for reproducibility.""" + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/cells/config.py b/moe_benchmarks/megablocks_yamoe/cells/config.py new file mode 100644 index 0000000000000000000000000000000000000000..747a7224106854e57904aa10edc15f4d5f0c4a17 --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/cells/config.py @@ -0,0 +1,27 @@ +# /// script +# dependencies = [ +# "torch", +# "numpy", +# ] +# /// + +"""Shared configuration for both implementations.""" +import torch + +# Model configuration +NUM_EXPERTS = 128 +HIDDEN_SIZE = 1152 +INTERMEDIATE_SIZE = 3072 +TOP_K = 4 + +# Input configuration +BATCH_SIZE = 1 +SEQ_LEN = 100 +DTYPE = "float32" +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + +# Seeds for reproducibility +WEIGHT_SEED = 999 +EXPERT_SEED = 777 +INPUT_SEED = 123 +GENERAL_SEED = 42 \ No newline at end of file 
diff --git a/moe_benchmarks/megablocks_yamoe/cells/nv.py b/moe_benchmarks/megablocks_yamoe/cells/nv.py new file mode 100644 index 0000000000000000000000000000000000000000..80eef60a7536ed875fb21731ab2d059458bd20b4 --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/cells/nv.py @@ -0,0 +1,3 @@ +import subprocess + +print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout) \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/cells/save_data.py b/moe_benchmarks/megablocks_yamoe/cells/save_data.py new file mode 100644 index 0000000000000000000000000000000000000000..b15750dce52da48651ccd9805cdab51af88503d5 --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/cells/save_data.py @@ -0,0 +1,42 @@ +# /// script +# dependencies = [ +# "torch", +# "numpy", +# ] +# /// + +""" +Generate deterministic shared weights once and save as artifacts so +both implementations load identical parameters. +""" +import torch +from config import NUM_EXPERTS, HIDDEN_SIZE, WEIGHT_SEED, EXPERT_SEED + +def save_shared_weights(): + # Router: Kaiming uniform as used by both, bias zeros + torch.manual_seed(WEIGHT_SEED) + router_weight = torch.empty(NUM_EXPERTS, HIDDEN_SIZE) + torch.nn.init.kaiming_uniform_(router_weight) + router_bias = torch.zeros(NUM_EXPERTS) + + # Experts: normal(0, 0.02), biases zeros + torch.manual_seed(EXPERT_SEED) + gate_up_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, 2 * HIDDEN_SIZE).normal_(mean=0.0, std=0.02) + gate_up_proj_bias = torch.zeros(NUM_EXPERTS, 2 * HIDDEN_SIZE) + down_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, HIDDEN_SIZE).normal_(mean=0.0, std=0.02) + down_proj_bias = torch.zeros(NUM_EXPERTS, HIDDEN_SIZE) + + # Save artifacts + torch.save(router_weight, 'router_weight.pt') + torch.save(router_bias, 'router_bias.pt') + torch.save(gate_up_proj, 'gate_up_proj.pt') + torch.save(gate_up_proj_bias, 'gate_up_proj_bias.pt') + torch.save(down_proj, 'down_proj.pt') + torch.save(down_proj_bias, 'down_proj_bias.pt') + + print("Saved 
shared weights to artifacts") + print(f"Router weight sum: {router_weight.sum().item():.6f}") + print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}") + print(f"Down sum: {down_proj.sum().item():.6f}") + +save_shared_weights() \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/cells/utils.py b/moe_benchmarks/megablocks_yamoe/cells/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f1f83f42e002602ff034c10cdc3f2f598c779e1f --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/cells/utils.py @@ -0,0 +1,34 @@ +# /// script +# dependencies = [ +# "torch", +# "numpy", +# ] +# /// + +"""Simple utilities for running the models.""" +import torch + +def to_dtype(dtype_str: str): + """Convert string to torch dtype.""" + if dtype_str == "float16": + return torch.float16 + if dtype_str == "bfloat16": + return torch.bfloat16 + return torch.float32 + +def tensor_stats(t: torch.Tensor) -> str: + """Generate stats string for a tensor.""" + return (f"shape={tuple(t.shape)}, " + f"dtype={t.dtype}, " + f"device={t.device}, " + f"mean={t.mean().item():.6f}, " + f"std={t.std().item():.6f}") + +def set_seed(seed: int): + """Set seeds for reproducibility.""" + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/cells/yamoe_run.py b/moe_benchmarks/megablocks_yamoe/cells/yamoe_run.py new file mode 100644 index 0000000000000000000000000000000000000000..b3e73c4cb44433286cab638f8faae2623c5a5030 --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/cells/yamoe_run.py @@ -0,0 +1,135 @@ +# /// script +# dependencies = [ +# "torch", +# "kernels", +# "numpy", +# ] +# /// + +import torch +from torch import nn +from torch.nn import functional as F +from kernels import get_kernel, get_local_kernel +from bench_utils import to_dtype, 
tensor_stats, set_seed, bench_context +from config import ( + NUM_EXPERTS, HIDDEN_SIZE, TOP_K, + BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE, + WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED +) +from pathlib import Path +import os + +# Discover the upstream artifact directory from env +data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.') +print(f"Loading weights from: {data_dir}") + +router_weight = torch.load(Path(data_dir) / 'router_weight.pt') +router_bias = torch.load(Path(data_dir) / 'router_bias.pt') +gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt') +gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt') +down_proj = torch.load(Path(data_dir) / 'down_proj.pt') +down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt') + +print("Loaded shared weights from artifacts") +print(f"Router weight sum: {router_weight.sum().item():.6f}") +print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}") +print(f"Down sum: {down_proj.sum().item():.6f}") + +class YamoeRouter(nn.Module): + def __init__(self, router_weight, router_bias): + super().__init__() + self.top_k = TOP_K + self.num_experts = NUM_EXPERTS + self.hidden_dim = HIDDEN_SIZE + self.weight = nn.Parameter(router_weight.clone()) + self.bias = nn.Parameter(router_bias.clone()) + + def forward(self, hidden_states): + hidden_states = hidden_states.reshape(-1, self.hidden_dim) + router_logits = F.linear(hidden_states, self.weight, self.bias) + router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1) + router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype) + router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value) + return router_scores, router_indices + +def ceil_div(a, b): + return (a + b - 1) // b + +class YamoeMoEMLP(nn.Module): + def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias): + super().__init__() + self.router = 
YamoeRouter(router_weight, router_bias) + self.num_experts = NUM_EXPERTS + self.hidden_size = HIDDEN_SIZE + self.top_k = TOP_K + + # Load Yamoe kernel + # self.yamoe = get_local_kernel(Path("/home/ubuntu/Projects/yamoe/result"), "yamoe") + self.yamoe = get_kernel("drbh/yamoe", revision="v0.2.0") + + # Expert weights - use the loaded weights + self.gate_up_proj = nn.Parameter(gate_up_proj.clone()) + self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone()) + self.down_proj = nn.Parameter(down_proj.clone()) + self.down_proj_bias = nn.Parameter(down_proj_bias.clone()) + + def forward(self, hidden_states): + batch_size, seq_len, hidden_dim = hidden_states.shape + + # Get routing decisions + routing_weights, router_indices = self.router(hidden_states) + + # Reshape for Yamoe kernel + hidden_states_flat = hidden_states.view(-1, hidden_dim) + routing_weights_flat = routing_weights.view(-1, self.num_experts) + expert_capacity = ceil_div(batch_size * self.top_k, self.num_experts) + + # Call Yamoe optimized kernel + output = self.yamoe.experts( + hidden_states_flat, + router_indices, + routing_weights_flat, + self.gate_up_proj, + self.gate_up_proj_bias, + self.down_proj, + self.down_proj_bias, + expert_capacity, + self.num_experts, + self.top_k, + ) + + # Reshape output back + output = output.view(batch_size, seq_len, hidden_dim) + + return output, routing_weights + +# Run the model +set_seed(GENERAL_SEED) + +device = torch.device(DEVICE if DEVICE == "cuda" else "cuda") +dtype = to_dtype(DTYPE) + +print("\n=== Yamoe Implementation ===") +# Initialize model with loaded weights +model = YamoeMoEMLP( + router_weight.to(device), + router_bias.to(device), + gate_up_proj.to(device), + gate_up_proj_bias.to(device), + down_proj.to(device), + down_proj_bias.to(device) +).to(device=device) + +print(f"Router weight sum: {model.router.weight.sum().item():.6f}") +print(f"Gate/up proj sum: {model.gate_up_proj.sum().item():.6f}") +print(f"Down proj sum: 
{model.down_proj.sum().item():.6f}") + +# Generate input +set_seed(INPUT_SEED) +x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1 + +# Benchmark the model with varied inputs to prevent caching artifacts +tokens = BATCH_SIZE * SEQ_LEN +with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="yamoe_results.json", vary_inputs=True) as bench: + output, stats = bench(model, x) + print(f"\nOutput sum: {output[0].sum().item():.6f}") \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/index.html b/moe_benchmarks/megablocks_yamoe/index.html new file mode 100644 index 0000000000000000000000000000000000000000..eb7b2fa8f6dfc6dacc0572fe71072184ac1d81ea --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/index.html @@ -0,0 +1,25 @@ + + + + + Directory Index + + + +

Index of /moe_benchmarks/megablocks_yamoe

+ + + \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html b/moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html new file mode 100644 index 0000000000000000000000000000000000000000..d483be109634d9f2c6ca41723356d82e1bf2cfa1 --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html @@ -0,0 +1,3771 @@ + + + + + + uvnote Integration Test Report + + + + + + + +
+
+
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-6.11.0-1018-azure-x86_64-with-glibc2.39 +
+
+ +
+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.07s | FAILED + | + +Raw +
+
+
+
+1 +2 +3 +
+
+
import subprocess
+
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
+
Traceback (most recent call last): + File "/home/runner/work/kernels-uvnotes/kernels-uvnotes/moe_benchmarks/megablocks_yamoe/.uvnote/cells/nv.py", line 3, in <module> + print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/hostedtoolcache/Python/3.11.13/x64/lib/python3.11/subprocess.py", line 548, in run + with Popen(*popenargs, **kwargs) as process: + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/opt/hostedtoolcache/Python/3.11.13/x64/lib/python3.11/subprocess.py", line 1026, in __init__ + self._execute_child(args, executable, preexec_fn, close_fds, + File "/opt/hostedtoolcache/Python/3.11.13/x64/lib/python3.11/subprocess.py", line 1955, in _execute_child + raise child_exception_type(errno_num, err_msg, err_filename) +FileNotFoundError: [Errno 2] No such file or directory: 'nvidia-smi' +
+
+
+ +

Comparison of MegaBlocks and Yamoe Kernels

+

This note compares the performance of the MegaBlocks and Yamoe kernels on the GPT-OSS-20B model.

+

MegaBlocks Kernel

+

Yamoe Kernel

+
+ + + \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/torch_profile.html b/moe_benchmarks/megablocks_yamoe/torch_profile.html new file mode 100644 index 0000000000000000000000000000000000000000..03274be1af151bba4833da45e7954d7de1f9a558 --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/torch_profile.html @@ -0,0 +1,4818 @@ + + + + + + Compare Yamoe and Binned MoE Implementations + + + + + + + +
+
+
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-6.11.0-1018-azure-x86_64-with-glibc2.39 +
+
+ +
+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: utils | deps: torch, numpy | 3.06s + | + +Raw +
+
+
+
+1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +
+
+
"""Simple utilities for running the models."""
+import torch
+
+def to_dtype(dtype_str: str):
+    """Convert string to torch dtype."""
+    if dtype_str == "float16":
+        return torch.float16
+    if dtype_str == "bfloat16":
+        return torch.bfloat16
+    return torch.float32
+
+def tensor_stats(t: torch.Tensor) -> str:
+    """Generate stats string for a tensor."""
+    return (f"shape={tuple(t.shape)}, "
+            f"dtype={t.dtype}, "
+            f"device={t.device}, "
+            f"mean={t.mean().item():.6f}, "
+            f"std={t.std().item():.6f}")
+
+def set_seed(seed: int):
+    """Set seeds for reproducibility."""
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+ +
+
+
+
+
+
+
▶ UV Install Logs
+ +
+
+
+ +
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: bench_utils | deps: torch, numpy | 13.67s + | + +Raw +
+
+
+
+1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +
+
+
"""Reusable benchmarking utilities for performance testing."""
+import time
+import numpy as np
+from contextlib import contextmanager
+from typing import Callable, Dict, Tuple, Any, Optional
+import torch
+
+def to_dtype(dtype_str: str):
+    """Convert string to torch dtype."""
+    if dtype_str == "float16":
+        return torch.float16
+    if dtype_str == "bfloat16":
+        return torch.bfloat16
+    return torch.float32
+
+def _sync(device: str):
+    """Synchronize device if CUDA."""
+    if device == "cuda":
+        torch.cuda.synchronize()
+
+def _compute_stats(times_s, tokens: Optional[int] = None) -> Dict[str, float]:
+    """Compute comprehensive latency and throughput statistics."""
+    lat_ms = np.array([t * 1000.0 for t in times_s])
+    lat_ms_sorted = np.sort(lat_ms)
+    n = len(lat_ms)
+
+    stats = {
+        "avg_ms": np.mean(lat_ms),
+        "min_ms": np.min(lat_ms),
+        "max_ms": np.max(lat_ms),
+        "std_ms": np.std(lat_ms),
+        "p50_ms": np.percentile(lat_ms, 50),
+        "p95_ms": np.percentile(lat_ms, 95),
+        "p99_ms": np.percentile(lat_ms, 99),
+        "num_iters": n
+    }
+
+    if tokens is not None and n > 0:
+        avg_s = np.mean(times_s)
+        stats["tokens_per_s"] = tokens / avg_s if avg_s > 0 else float("inf")
+        stats["throughput_variance"] = np.std([tokens / t for t in times_s if t > 0])
+
+    return stats
+
+def _format_timing_stats(stats: Dict[str, float], tokens: Optional[int] = None) -> str:
+    """Format timing statistics for display."""
+    lines = [
+        "\n━━━━━━━━━━━━━━━━━━━━ Benchmark Results ━━━━━━━━━━━━━━━━━━━━",
+        f"Iterations: {stats.get('num_iters', 0)}",
+        "\nLatency Statistics:",
+        f"  Average: {stats['avg_ms']:.3f} ms",
+        f"  Min:     {stats['min_ms']:.3f} ms",
+        f"  Max:     {stats['max_ms']:.3f} ms", 
+        f"  Std Dev: {stats['std_ms']:.3f} ms",
+        "\nPercentiles:",
+        f"  P50 (median): {stats['p50_ms']:.3f} ms",
+        f"  P95:          {stats['p95_ms']:.3f} ms",
+        f"  P99:          {stats['p99_ms']:.3f} ms",
+    ]
+
+    if tokens is not None and 'tokens_per_s' in stats:
+        lines.extend([
+            "\nThroughput:",
+            f"  Tokens/sec: {stats['tokens_per_s']:.1f}",
+            f"  Std Dev:    {stats.get('throughput_variance', 0):.1f}",
+        ])
+
+    lines.append("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
+    return "\n".join(lines)
+
+def _bench_engine(
+    call: Callable[[], Any], *, warmup: int, iters: int, device: str, dtype, input_gen: Callable[[], Any] = None
+) -> Tuple[Any, list]:
+    """Core benchmarking engine with warmup and timing."""
+    use_autocast = device == "cuda" and dtype in (torch.float16, torch.bfloat16)
+
+    # Warmup phase
+    print(f"\nWarming up ({warmup} iterations)...")
+    with torch.inference_mode():
+        for _ in range(max(0, warmup)):
+            if use_autocast:
+                with torch.autocast(device_type="cuda", dtype=dtype):
+                    if input_gen is not None:
+                        _ = call(input_gen())
+                    else:
+                        _ = call()
+            else:
+                if input_gen is not None:
+                    _ = call(input_gen())
+                else:
+                    _ = call()
+        _sync(device)
+
+    # Measurement phase
+    print(f"Benchmarking ({iters} iterations)...")
+    times_s = []
+    last = None
+    with torch.inference_mode():
+        for i in range(max(1, iters)):
+            start = time.perf_counter()
+            if use_autocast:
+                with torch.autocast(device_type="cuda", dtype=dtype):
+                    if input_gen is not None:
+                        last = call(input_gen())
+                    else:
+                        last = call()
+            else:
+                if input_gen is not None:
+                    last = call(input_gen())
+                else:
+                    last = call()
+            _sync(device)
+            end = time.perf_counter()
+            times_s.append(end - start)
+
+            # Progress indicator every 20% of iterations
+            if i > 0 and i % max(1, iters // 5) == 0:
+                pct = (i / iters) * 100
+                avg_so_far = np.mean(times_s[:i]) * 1000
+                print(f"  Progress: {pct:.0f}% complete (avg: {avg_so_far:.3f} ms)")
+
+    return last, times_s
+
+def tensor_stats(t: torch.Tensor) -> str:
+    """Generate comprehensive stats string for a tensor."""
+    return (f"shape={tuple(t.shape)}, "
+            f"dtype={t.dtype}, "
+            f"device={t.device}, "
+            f"range=[{t.min().item():.6f}, {t.max().item():.6f}], "
+            f"mean={t.mean().item():.6f}, "
+            f"std={t.std().item():.6f}, "
+            f"norm={t.norm().item():.6f}")
+
+@contextmanager
+def bench_context(
+    *, warmup: int = 25, iters: int = 100, device: str = "cuda", dtype=torch.float32, tokens: Optional[int] = None, verbose: bool = True, save_json: Optional[str] = None, vary_inputs: bool = True
+):
+    """Context that yields a runner: runner(fn, *args, **kwargs) -> (result, stats).
+
+    If vary_inputs=True, the first argument should be a base tensor that will be varied each iteration
+    by adding a small deterministic increment to prevent caching artifacts.
+    """
+
+    def runner(fn: Callable[..., Any], *args, **kwargs) -> Tuple[Any, Dict[str, float]]:
+        # Log configuration
+        if verbose:
+            print(f"\n┌─ Benchmark Configuration ─────────────────────────────┐")
+            # print(f"│ Device: {device:<15} Dtype: {dtype}              │")
+            print(f"│ Warmup: {warmup:<15} Iters: {iters}              │")
+            if tokens:
+                print(f"│ Tokens: {tokens}                                        │")
+            if vary_inputs:
+                print(f"│ Input Variation: Enabled (prevents caching artifacts)  │")
+            print(f"└────────────────────────────────────────────────────────┘")
+
+        # Set up input generation
+        input_gen = None
+        if vary_inputs and args and isinstance(args[0], torch.Tensor):
+            base_input = args[0].clone()
+            iteration_counter = [0]  # Use list for mutable closure
+
+            def generate_varied_input():
+                """Generate input tensor varied by iteration to prevent caching."""
+                # Add small deterministic increment: 0.001 * iteration_number
+                varied_input = base_input + (iteration_counter[0] * 0.001)
+                iteration_counter[0] += 1
+                return varied_input
+
+            input_gen = generate_varied_input
+            call = lambda x: fn(x, *args[1:], **kwargs)
+
+            # Log base input stats
+            if verbose:
+                print(f"\nBase Input: {tensor_stats(base_input)}")
+                print(f"Input Variation: +{0.001:.3f} * iteration (deterministic)")
+        else:
+            # Legacy mode - static inputs
+            call = lambda: fn(*args, **kwargs)
+            if verbose and args and isinstance(args[0], torch.Tensor):
+                print(f"\nInput: {tensor_stats(args[0])}")
+
+        result, times_s = _bench_engine(call, warmup=warmup, iters=iters, device=device, dtype=dtype, input_gen=input_gen)
+
+        # Log output if it's a tensor or tuple with tensors
+        if verbose:
+            print("\nOutput tensors:")
+            if isinstance(result, torch.Tensor):
+                print(f"  Primary: {tensor_stats(result)}")
+            elif isinstance(result, tuple) and len(result) > 0 and isinstance(result[0], torch.Tensor):
+                print(f"  Primary: {tensor_stats(result[0])}")
+                if len(result) > 1:
+                    if isinstance(result[1], torch.Tensor):
+                        print(f"  Auxiliary: {tensor_stats(result[1])}")
+                    else:
+                        print(f"  Auxiliary: {type(result[1]).__name__}")
+
+        # Compute and display statistics
+        stats = _compute_stats(times_s, tokens=tokens)
+        if verbose:
+            print(_format_timing_stats(stats, tokens))
+
+        # Save to JSON if requested
+        if save_json:
+            import json
+            json_data = {
+                "implementation": save_json.replace(".json", ""),
+                "config": {
+                    "warmup": warmup,
+                    "iters": iters,
+                    "device": str(device),  # Convert device to string
+                    "dtype": str(dtype),
+                    "tokens": tokens,
+                    "vary_inputs": vary_inputs
+                },
+                "stats": stats,
+                "output_sum": float(result[0].sum().item()) if isinstance(result, tuple) and len(result) > 0 else float(result.sum().item()) if isinstance(result, torch.Tensor) else None
+            }
+            with open(save_json, 'w') as f:
+                json.dump(json_data, f, indent=2)
+            if verbose:
+                print(f"\nSaved benchmark results to {save_json}")
+
+        return result, stats
+
+    yield runner
+
+def set_seed(seed: int):
+    """Set seeds for reproducibility."""
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+ +
+
+
+
+
+
+
▶ UV Install Logs
+ +
+
+
+ +

This notebook benchmarks multiple MoE implementations with varied inputs across iterations to prevent unrealistic caching artifacts and measure true performance characteristics.

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: config | deps: torch, numpy | 3.02s + | + +Raw +
+
+
+
+1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +
+
+
"""Shared configuration for both implementations."""
+import torch
+
+# Model configuration
+# NUM_EXPERTS and HIDDEN_SIZE give the router weight its shape; TOP_K is the
+# number of experts the router keeps per token.
+NUM_EXPERTS = 128
+HIDDEN_SIZE = 1152
+# NOTE(review): INTERMEDIATE_SIZE is not referenced by the cells shown in this
+# report (expert projections are sized from HIDDEN_SIZE) - confirm intended use.
+INTERMEDIATE_SIZE = 3072
+TOP_K = 4
+
+# Input configuration
+# The benchmark input is a random tensor of shape (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE).
+BATCH_SIZE = 1
+SEQ_LEN = 100
+# Dtype name; resolved to a torch dtype by to_dtype() (unknown names mean float32).
+DTYPE = "float32"
+# Falls back to "cpu" when no CUDA device is visible to torch.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Seeds for reproducibility
+WEIGHT_SEED = 999   # router weight initialization
+EXPERT_SEED = 777   # expert projection initialization
+INPUT_SEED = 123    # benchmark input tensor
+GENERAL_SEED = 42   # set before the model is constructed
+
+ +
+
+
+
+
+
+
▶ UV Install Logs
+ +
+
+
+ +
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: save_data | deps: torch, numpy | 11.90s + | + +Raw +
+
+
+
+1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +
+
+
"""
+Generate deterministic shared weights once and save as artifacts so
+both implementations load identical parameters.
+"""
+import torch
+from config import NUM_EXPERTS, HIDDEN_SIZE, WEIGHT_SEED, EXPERT_SEED
+
+def save_shared_weights():
+    # Router: Kaiming uniform as used by both, bias zeros
+    torch.manual_seed(WEIGHT_SEED)
+    router_weight = torch.empty(NUM_EXPERTS, HIDDEN_SIZE)
+    torch.nn.init.kaiming_uniform_(router_weight)
+    router_bias = torch.zeros(NUM_EXPERTS)
+
+    # Experts: normal(0, 0.02), biases zeros
+    torch.manual_seed(EXPERT_SEED)
+    gate_up_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, 2 * HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
+    gate_up_proj_bias = torch.zeros(NUM_EXPERTS, 2 * HIDDEN_SIZE)
+    down_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
+    down_proj_bias = torch.zeros(NUM_EXPERTS, HIDDEN_SIZE)
+
+    # Save artifacts
+    torch.save(router_weight, 'router_weight.pt')
+    torch.save(router_bias, 'router_bias.pt')
+    torch.save(gate_up_proj, 'gate_up_proj.pt')
+    torch.save(gate_up_proj_bias, 'gate_up_proj_bias.pt')
+    torch.save(down_proj, 'down_proj.pt')
+    torch.save(down_proj_bias, 'down_proj_bias.pt')
+
+    print("Saved shared weights to artifacts")
+    print(f"Router weight sum: {router_weight.sum().item():.6f}")
+    print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+    print(f"Down sum: {down_proj.sum().item():.6f}")
+
+save_shared_weights()
+
+ +
+
+
+
+
+
Saved shared weights to artifacts +Router weight sum: 12.588735 +Gate/up sum: 1026.601807 +Down sum: 206.729279 +
+
+
▶ UV Install Logs
+ +
+ +
+
+ +

Yamoe Implementation

+

This section runs the Yamoe MoE implementation with optimized Triton kernels.

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: yamoe_run | deps: torch, kernels, numpy | 4.02s | FAILED + | + +Raw +
+
+
+
+1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +
+
+
import torch
+from torch import nn
+from torch.nn import functional as F
+from kernels import get_kernel, get_local_kernel
+from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
+from config import (
+    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+)
+from pathlib import Path
+import os
+
+# Discover the upstream artifact directory from env
+data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+print(f"Loading weights from: {data_dir}")
+
+router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+print("Loaded shared weights from artifacts")
+print(f"Router weight sum: {router_weight.sum().item():.6f}")
+print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+print(f"Down sum: {down_proj.sum().item():.6f}")
+
+class YamoeRouter(nn.Module):
+    def __init__(self, router_weight, router_bias):
+        super().__init__()
+        self.top_k = TOP_K
+        self.num_experts = NUM_EXPERTS
+        self.hidden_dim = HIDDEN_SIZE
+        self.weight = nn.Parameter(router_weight.clone())
+        self.bias = nn.Parameter(router_bias.clone())
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+        router_logits = F.linear(hidden_states, self.weight, self.bias)
+        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
+        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+        return router_scores, router_indices
+
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+class YamoeMoEMLP(nn.Module):
+    def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+        super().__init__()
+        self.router = YamoeRouter(router_weight, router_bias)
+        self.num_experts = NUM_EXPERTS
+        self.hidden_size = HIDDEN_SIZE
+        self.top_k = TOP_K
+
+        # Load Yamoe kernel
+        # self.yamoe = get_local_kernel(Path("/home/ubuntu/Projects/yamoe/result"), "yamoe")
+        self.yamoe = get_kernel("drbh/yamoe", revision="v0.2.0")
+
+        # Expert weights - use the loaded weights
+        self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
+        self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
+        self.down_proj = nn.Parameter(down_proj.clone())
+        self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
+
+    def forward(self, hidden_states):
+        batch_size, seq_len, hidden_dim = hidden_states.shape
+
+        # Get routing decisions
+        routing_weights, router_indices = self.router(hidden_states)
+
+        # Reshape for Yamoe kernel
+        hidden_states_flat = hidden_states.view(-1, hidden_dim)
+        routing_weights_flat = routing_weights.view(-1, self.num_experts)
+        expert_capacity = ceil_div(batch_size * self.top_k, self.num_experts)
+
+        # Call Yamoe optimized kernel
+        output = self.yamoe.experts(
+            hidden_states_flat,
+            router_indices,
+            routing_weights_flat,
+            self.gate_up_proj,
+            self.gate_up_proj_bias,
+            self.down_proj,
+            self.down_proj_bias,
+            expert_capacity,
+            self.num_experts,
+            self.top_k,
+        )
+
+        # Reshape output back
+        output = output.view(batch_size, seq_len, hidden_dim)
+
+        return output, routing_weights
+
+# Run the model
+set_seed(GENERAL_SEED)
+
+device = torch.device(DEVICE if DEVICE == "cuda" else "cuda")
+dtype = to_dtype(DTYPE)
+
+print("\n=== Yamoe Implementation ===")
+# Initialize model with loaded weights
+model = YamoeMoEMLP(
+    router_weight.to(device),
+    router_bias.to(device),
+    gate_up_proj.to(device),
+    gate_up_proj_bias.to(device),
+    down_proj.to(device),
+    down_proj_bias.to(device)
+).to(device=device)
+
+print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
+print(f"Gate/up proj sum: {model.gate_up_proj.sum().item():.6f}")
+print(f"Down proj sum: {model.down_proj.sum().item():.6f}")
+
+# Generate input
+set_seed(INPUT_SEED)
+x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
+
+# Benchmark the model with varied inputs to prevent caching artifacts
+tokens = BATCH_SIZE * SEQ_LEN
+with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="yamoe_results.json", vary_inputs=True) as bench:
+    output, stats = bench(model, x)
+    print(f"\nOutput sum: {output[0].sum().item():.6f}")
+
+ +
+
+
+
+
+
Loading weights from: /home/runner/work/kernels-uvnotes/kernels-uvnotes/moe_benchmarks/megablocks_yamoe/.uvnote/cache/57bbe537b6c3412d45373a8967728666b60b8687c5d1f5d0decc3ba51923edde +Loaded shared weights from artifacts +Router weight sum: 12.588735 +Gate/up sum: 1026.601807 +Down sum: 206.729279 + +=== Yamoe Implementation === +
+
+
▶ UV Install Logs
+ +
+
Traceback (most recent call last): + File "/home/runner/work/kernels-uvnotes/kernels-uvnotes/moe_benchmarks/megablocks_yamoe/.uvnote/cells/yamoe_run.py", line 115, in <module> + router_weight.to(device), + ^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/runner/work/_temp/setup-uv-cache/environments-v2/yamoe-run-07f6c9b004377cec/lib/python3.11/site-packages/torch/cuda/__init__.py", line 412, in _lazy_init + torch._C._cuda_init() +RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx
+
+
+ +

Binned Implementation

+

This section runs the binned implementation that manually handles token gathering/scattering.

+

GPT-OSS Implementation

+

This section runs the GPT-OSS MoE implementation with manual expert loop handling.

+

GPT-OSS Implementation (Training Mode)

+

This section runs the GPT-OSS MoE implementation with training mode enabled to force the expert loop path.

+

MegaBlocks Implementation

+

This section runs the MegaBlocks MoE implementation with optimized kernels from the Hugging Face hub.

+

Performance Visualization

+

This section reads all benchmark results and creates a comprehensive performance comparison chart.

+
+ + + \ No newline at end of file