Upload folder using huggingface_hub
Browse files- megablocks/megablocks_only.html +0 -0
- megablocks_yamoe/artifacts/binned_run/binned_results.json +9 -9
- megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json +9 -9
- megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json +9 -9
- megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json +9 -9
- megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc +0 -0
- megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc +0 -0
- megablocks_yamoe/cells/megablocks_run.py +1 -1
- megablocks_yamoe/megablocks_yamoe.html +73 -79
- megablocks_yamoe/torch_profile.html +222 -222
megablocks/megablocks_only.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
megablocks_yamoe/artifacts/binned_run/binned_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms":
|
| 14 |
-
"max_ms":
|
| 15 |
-
"std_ms": 1.
|
| 16 |
-
"p50_ms": 36.
|
| 17 |
-
"p95_ms": 38.
|
| 18 |
-
"p99_ms":
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 36.21514258000616,
|
| 13 |
+
"min_ms": 33.172280000030696,
|
| 14 |
+
"max_ms": 38.75413800005845,
|
| 15 |
+
"std_ms": 1.401058294284512,
|
| 16 |
+
"p50_ms": 36.36444199997868,
|
| 17 |
+
"p95_ms": 38.060839599990004,
|
| 18 |
+
"p99_ms": 38.46422802999541,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2761.275888368544,
|
| 21 |
+
"throughput_variance": 108.05444381816277
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms":
|
| 14 |
-
"max_ms":
|
| 15 |
-
"std_ms": 3.
|
| 16 |
-
"p50_ms":
|
| 17 |
-
"p95_ms":
|
| 18 |
-
"p99_ms":
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 45.94982444000152,
|
| 13 |
+
"min_ms": 40.76497799997014,
|
| 14 |
+
"max_ms": 52.299967999942965,
|
| 15 |
+
"std_ms": 3.623045351544196,
|
| 16 |
+
"p50_ms": 45.46925300002158,
|
| 17 |
+
"p95_ms": 51.35251775002985,
|
| 18 |
+
"p99_ms": 52.12179027996967,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2176.286878540176,
|
| 21 |
+
"throughput_variance": 169.79505096491204
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms": 38.
|
| 14 |
-
"max_ms": 49.
|
| 15 |
-
"std_ms": 2.
|
| 16 |
-
"p50_ms":
|
| 17 |
-
"p95_ms": 48.
|
| 18 |
-
"p99_ms":
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 46.09780513999567,
|
| 13 |
+
"min_ms": 38.8389360000474,
|
| 14 |
+
"max_ms": 49.40391599996019,
|
| 15 |
+
"std_ms": 2.4686999934552376,
|
| 16 |
+
"p50_ms": 47.23983950003685,
|
| 17 |
+
"p95_ms": 48.725092950002136,
|
| 18 |
+
"p99_ms": 49.16830440000467,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2169.300679203864,
|
| 21 |
+
"throughput_variance": 122.29861537972276
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 4.
|
| 13 |
-
"min_ms": 4.
|
| 14 |
-
"max_ms": 4.
|
| 15 |
-
"std_ms": 0.
|
| 16 |
-
"p50_ms": 4.
|
| 17 |
-
"p95_ms": 4.
|
| 18 |
-
"p99_ms": 4.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 4.247618279998733,
|
| 13 |
+
"min_ms": 4.12893800000802,
|
| 14 |
+
"max_ms": 4.265831999987313,
|
| 15 |
+
"std_ms": 0.020712896658640616,
|
| 16 |
+
"p50_ms": 4.251555999985612,
|
| 17 |
+
"p95_ms": 4.263803499975438,
|
| 18 |
+
"p99_ms": 4.2652827100027935,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 23542.605151428495,
|
| 21 |
+
"throughput_variance": 117.11531020813602
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc
CHANGED
|
Binary files a/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc and b/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc differ
|
|
|
megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc
CHANGED
|
Binary files a/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc and b/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc differ
|
|
|
megablocks_yamoe/cells/megablocks_run.py
CHANGED
|
@@ -56,7 +56,7 @@ def build_megablocks_model(device: torch.device):
|
|
| 56 |
# Attach loaded expert weights to the experts container
|
| 57 |
e = model.experts
|
| 58 |
e.alpha = 1.702
|
| 59 |
-
e.capacity_factor =
|
| 60 |
e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device))
|
| 61 |
e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device))
|
| 62 |
e.down_proj = torch.nn.Parameter(down_proj.clone().to(device))
|
|
|
|
| 56 |
# Attach loaded expert weights to the experts container
|
| 57 |
e = model.experts
|
| 58 |
e.alpha = 1.702
|
| 59 |
+
e.capacity_factor = 32
|
| 60 |
e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device))
|
| 61 |
e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device))
|
| 62 |
e.down_proj = torch.nn.Parameter(down_proj.clone().to(device))
|
megablocks_yamoe/megablocks_yamoe.html
CHANGED
|
@@ -3722,7 +3722,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3722 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3724 |
</span> |
|
| 3725 |
-
Cell: nv | 0.
|
| 3726 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3727 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3728 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3745,7 +3745,7 @@ Cell: nv | 0.53s
|
|
| 3745 |
</div>
|
| 3746 |
</div>
|
| 3747 |
<div id="output-nv" class="cell-output">
|
| 3748 |
-
<div class="cell-stdout">Wed Sep 24
|
| 3749 |
+-----------------------------------------------------------------------------------------+
|
| 3750 |
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3751 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3754,19 +3754,19 @@ Cell: nv | 0.53s
|
|
| 3754 |
| | | MIG M. |
|
| 3755 |
|=========================================+========================+======================|
|
| 3756 |
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3757 |
-
| 0%
|
| 3758 |
| | | N/A |
|
| 3759 |
+-----------------------------------------+------------------------+----------------------+
|
| 3760 |
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3761 |
-
| 0% 37C P0
|
| 3762 |
| | | N/A |
|
| 3763 |
+-----------------------------------------+------------------------+----------------------+
|
| 3764 |
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3765 |
-
| 0%
|
| 3766 |
| | | N/A |
|
| 3767 |
+-----------------------------------------+------------------------+----------------------+
|
| 3768 |
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3769 |
-
| 0%
|
| 3770 |
| | | N/A |
|
| 3771 |
+-----------------------------------------+------------------------+----------------------+
|
| 3772 |
|
|
@@ -3792,7 +3792,7 @@ Cell: nv | 0.53s
|
|
| 3792 |
<span onclick="toggleOutput('setup2')" style="cursor: pointer;">▼ output</span>
|
| 3793 |
<span id="uv-indicator-setup2" onclick="toggleUvLogsFromHeader('setup2')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3794 |
</span> |
|
| 3795 |
-
Cell: setup2 |
|
| 3796 |
| <button class="run-btn" onclick="runCell('setup2')">▶ run</button>
|
| 3797 |
<button class="copy-btn" onclick="copyCell('setup2')">Copy</button>
|
| 3798 |
<a href="cells/setup2.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4050,7 +4050,7 @@ Reasoning: low
|
|
| 4050 |
What is Tensor Parallelism?
|
| 4051 |
|
| 4052 |
<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it's used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it's also called model parallelism. Provide explanation of how it works in practice: e.g., for a linear layer, weight matrix W of shape (out_features, in_features). In tensor parallelism, split W along out_features dimension across GPUs. Each GPU computes partial output. Then gather outputs. Provide details on how to handle bias, etc. Provide mention of "tensor model parallelism" vs "tensor parallelism" synonyms. Provide mention of "tensor parallelism" in Megatron-LM: splitting weight matrices across GPUs. Provide mention of "tensor parallelism" in DeepSpeed: "ZeRO-Offload" etc. Provide mention
|
| 4053 |
-
Generation took 31.
|
| 4054 |
</div>
|
| 4055 |
<div class="uv-install-logs" id="uv-logs-setup2">
|
| 4056 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
|
@@ -4059,31 +4059,31 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
|
|
| 4059 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 4060 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4061 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
| 4062 |
-
Downloading
|
|
|
|
| 4063 |
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4064 |
-
Downloading
|
| 4065 |
-
Downloading pillow (6.3MiB)
|
| 4066 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4067 |
Downloading networkx (1.9MiB)
|
| 4068 |
-
Downloading
|
| 4069 |
-
Downloading tokenizers (3.1MiB)
|
| 4070 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4071 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4072 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4073 |
-
Downloading jedi (1.5MiB)
|
| 4074 |
-
Downloading numpy (15.9MiB)
|
| 4075 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4076 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4077 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4078 |
Downloading triton (148.4MiB)
|
| 4079 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4080 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4081 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4082 |
-
Downloading
|
| 4083 |
-
Downloading matplotlib (8.3MiB)
|
| 4084 |
-
Downloading fonttools (4.7MiB)
|
| 4085 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4086 |
Downloading kiwisolver (1.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4087 |
Downloading torch (846.8MiB)
|
| 4088 |
Downloading nvidia-cufile-cu12
|
| 4089 |
Downloading kiwisolver
|
|
@@ -4105,38 +4105,36 @@ Downloading torch (846.8MiB)
|
|
| 4105 |
Downloading triton
|
| 4106 |
Downloading nvidia-cufft-cu12
|
| 4107 |
Downloading nvidia-cusolver-cu12
|
| 4108 |
-
Downloading nvidia-cusparse-cu12
|
| 4109 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 4110 |
Downloading nvidia-nccl-cu12
|
| 4111 |
Downloading nvidia-cublas-cu12
|
| 4112 |
Downloading nvidia-cudnn-cu12
|
| 4113 |
Downloading torch
|
| 4114 |
-
Installed 69 packages in
|
| 4115 |
</div>
|
| 4116 |
</div>
|
| 4117 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4118 |
-
Fetching 3 files: 33%|███▎ | 1/3 [00:06<00:12, 6.
|
| 4119 |
-
Fetching 3 files: 67%|██████▋ | 2/3 [00:07<00:03, 3.
|
| 4120 |
-
Fetching 3 files: 100%|██████████| 3/3 [00:07<00:00, 2.
|
| 4121 |
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
|
| 4122 |
|
| 4123 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4124 |
-
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.
|
| 4125 |
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.25s/it]
|
| 4126 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
|
| 4127 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it]
|
| 4128 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4129 |
|
| 4130 |
Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 4131 |
-
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:
|
| 4132 |
-
Fetching 66 files: 14%|█▎ | 9/66 [00:00<00:
|
| 4133 |
-
Fetching 66 files: 26%|██▌ | 17/66 [00:
|
| 4134 |
-
Fetching 66 files:
|
| 4135 |
-
Fetching 66 files:
|
| 4136 |
-
Fetching 66 files:
|
| 4137 |
-
|
| 4138 |
-
Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 37.62it/s]
|
| 4139 |
-
/tmp/uvnote-run-e6cle3et/home/.cache/uv/environments-v2/setup2-adf2810b697d7b08/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 4140 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4141 |
warnings.warn(
|
| 4142 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
@@ -4163,7 +4161,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
|
|
| 4163 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4164 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4165 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4166 |
-
/tmp/uvnote-run-
|
| 4167 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4168 |
warnings.warn(
|
| 4169 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
@@ -4200,7 +4198,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
|
|
| 4200 |
<span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
|
| 4201 |
<span id="uv-indicator-setup" onclick="toggleUvLogsFromHeader('setup')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4202 |
</span> |
|
| 4203 |
-
Cell: setup |
|
| 4204 |
| <button class="run-btn" onclick="runCell('setup')">▶ run</button>
|
| 4205 |
<button class="copy-btn" onclick="copyCell('setup')">Copy</button>
|
| 4206 |
<a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4459,12 +4457,8 @@ Reasoning: low
|
|
| 4459 |
|
| 4460 |
What is Tensor Parallelism?
|
| 4461 |
|
| 4462 |
-
<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical
|
| 4463 |
-
|
| 4464 |
-
| **Aspect** | **What it is** | **Why it matters** |
|
| 4465 |
-
|------------|----------------|--------------------|
|
| 4466 |
-
| **Definition** | Splitting a *single* weight tensor (e.g., a large matrix in a transformer layer) across multiple devices so that each device holds only a *portion* of the tensor. | Allows training of models that are
|
| 4467 |
-
Generation took 26.28 seconds
|
| 4468 |
</div>
|
| 4469 |
<div class="uv-install-logs" id="uv-logs-setup">
|
| 4470 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
|
@@ -4473,37 +4467,37 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
|
|
| 4473 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 4474 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4475 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
| 4476 |
-
Downloading tokenizers (3.1MiB)
|
| 4477 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4478 |
-
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4479 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4480 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4481 |
-
Downloading pygments (1.2MiB)
|
| 4482 |
-
Downloading sympy (6.0MiB)
|
| 4483 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
|
|
|
| 4484 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4485 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4486 |
-
Downloading
|
|
|
|
|
|
|
| 4487 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4488 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4489 |
Downloading kiwisolver (1.4MiB)
|
| 4490 |
-
Downloading
|
|
|
|
|
|
|
| 4491 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
| 4492 |
Downloading fonttools (4.7MiB)
|
| 4493 |
-
Downloading hf-xet (3.0MiB)
|
| 4494 |
-
Downloading numpy (15.9MiB)
|
| 4495 |
-
Downloading matplotlib (8.3MiB)
|
| 4496 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4497 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4498 |
-
Downloading jedi (1.5MiB)
|
| 4499 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4500 |
Downloading triton (148.4MiB)
|
| 4501 |
Downloading torch (846.8MiB)
|
| 4502 |
Downloading nvidia-cufile-cu12
|
| 4503 |
Downloading kiwisolver
|
| 4504 |
Downloading pygments
|
| 4505 |
-
Downloading tokenizers
|
| 4506 |
Downloading hf-xet
|
|
|
|
| 4507 |
Downloading networkx
|
| 4508 |
Downloading fonttools
|
| 4509 |
Downloading pillow
|
|
@@ -4519,33 +4513,33 @@ Downloading torch (846.8MiB)
|
|
| 4519 |
Downloading triton
|
| 4520 |
Downloading nvidia-cufft-cu12
|
| 4521 |
Downloading nvidia-cusolver-cu12
|
| 4522 |
-
Downloading nvidia-cusparse-cu12
|
| 4523 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 4524 |
Downloading nvidia-nccl-cu12
|
| 4525 |
Downloading nvidia-cublas-cu12
|
| 4526 |
Downloading nvidia-cudnn-cu12
|
| 4527 |
Downloading torch
|
| 4528 |
-
Installed 69 packages in
|
| 4529 |
</div>
|
| 4530 |
</div>
|
| 4531 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4532 |
-
Fetching 3 files: 33%|███▎ | 1/3 [00:07<00:14, 7.
|
| 4533 |
-
Fetching 3 files: 67%|██████▋ | 2/3 [00:08<00:03, 3.
|
| 4534 |
-
Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00, 2.
|
| 4535 |
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
|
| 4536 |
|
| 4537 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4538 |
-
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.
|
| 4539 |
-
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.
|
| 4540 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
|
| 4541 |
-
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.
|
| 4542 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4543 |
|
| 4544 |
Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 4545 |
-
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:
|
| 4546 |
-
Fetching 6 files:
|
| 4547 |
-
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00,
|
| 4548 |
-
/tmp/uvnote-run-
|
| 4549 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4550 |
warnings.warn(
|
| 4551 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
|
@@ -4572,7 +4566,7 @@ INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for laye
|
|
| 4572 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4573 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4574 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4575 |
-
/tmp/uvnote-run-
|
| 4576 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4577 |
warnings.warn(
|
| 4578 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
|
|
|
| 3722 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3724 |
</span> |
|
| 3725 |
+
Cell: nv | 0.55s
|
| 3726 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3727 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3728 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3745 |
</div>
|
| 3746 |
</div>
|
| 3747 |
<div id="output-nv" class="cell-output">
|
| 3748 |
+
<div class="cell-stdout">Wed Sep 24 22:04:34 2025
|
| 3749 |
+-----------------------------------------------------------------------------------------+
|
| 3750 |
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3751 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3754 |
| | | MIG M. |
|
| 3755 |
|=========================================+========================+======================|
|
| 3756 |
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3757 |
+
| 0% 36C P0 45W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3758 |
| | | N/A |
|
| 3759 |
+-----------------------------------------+------------------------+----------------------+
|
| 3760 |
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3761 |
+
| 0% 37C P0 47W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3762 |
| | | N/A |
|
| 3763 |
+-----------------------------------------+------------------------+----------------------+
|
| 3764 |
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3765 |
+
| 0% 35C P0 47W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3766 |
| | | N/A |
|
| 3767 |
+-----------------------------------------+------------------------+----------------------+
|
| 3768 |
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3769 |
+
| 0% 37C P0 44W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3770 |
| | | N/A |
|
| 3771 |
+-----------------------------------------+------------------------+----------------------+
|
| 3772 |
|
|
|
|
| 3792 |
<span onclick="toggleOutput('setup2')" style="cursor: pointer;">▼ output</span>
|
| 3793 |
<span id="uv-indicator-setup2" onclick="toggleUvLogsFromHeader('setup2')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3794 |
</span> |
|
| 3795 |
+
Cell: setup2 | 114.03s
|
| 3796 |
| <button class="run-btn" onclick="runCell('setup2')">▶ run</button>
|
| 3797 |
<button class="copy-btn" onclick="copyCell('setup2')">Copy</button>
|
| 3798 |
<a href="cells/setup2.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4050 |
What is Tensor Parallelism?
|
| 4051 |
|
| 4052 |
<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it's used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it's also called model parallelism. Provide explanation of how it works in practice: e.g., for a linear layer, weight matrix W of shape (out_features, in_features). In tensor parallelism, split W along out_features dimension across GPUs. Each GPU computes partial output. Then gather outputs. Provide details on how to handle bias, etc. Provide mention of "tensor model parallelism" vs "tensor parallelism" synonyms. Provide mention of "tensor parallelism" in Megatron-LM: splitting weight matrices across GPUs. Provide mention of "tensor parallelism" in DeepSpeed: "ZeRO-Offload" etc. Provide mention
|
| 4053 |
+
Generation took 31.36 seconds
|
| 4054 |
</div>
|
| 4055 |
<div class="uv-install-logs" id="uv-logs-setup2">
|
| 4056 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
|
|
|
| 4059 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 4060 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4061 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
| 4062 |
+
Downloading jedi (1.5MiB)
|
| 4063 |
+
Downloading pygments (1.2MiB)
|
| 4064 |
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4065 |
+
Downloading matplotlib (8.3MiB)
|
|
|
|
|
|
|
| 4066 |
Downloading networkx (1.9MiB)
|
| 4067 |
+
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
| 4068 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4069 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4070 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4071 |
+
Downloading hf-xet (3.0MiB)
|
| 4072 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4073 |
+
Downloading fonttools (4.7MiB)
|
| 4074 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
| 4075 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
| 4076 |
Downloading triton (148.4MiB)
|
|
|
|
|
|
|
| 4077 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4078 |
+
Downloading tokenizers (3.1MiB)
|
|
|
|
|
|
|
|
|
|
| 4079 |
Downloading kiwisolver (1.4MiB)
|
| 4080 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4081 |
+
Downloading pillow (6.3MiB)
|
| 4082 |
+
Downloading numpy (15.9MiB)
|
| 4083 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4084 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4085 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4086 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4087 |
Downloading torch (846.8MiB)
|
| 4088 |
Downloading nvidia-cufile-cu12
|
| 4089 |
Downloading kiwisolver
|
|
|
|
| 4105 |
Downloading triton
|
| 4106 |
Downloading nvidia-cufft-cu12
|
| 4107 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4108 |
Downloading nvidia-cusparselt-cu12
|
| 4109 |
+
Downloading nvidia-cusparse-cu12
|
| 4110 |
Downloading nvidia-nccl-cu12
|
| 4111 |
Downloading nvidia-cublas-cu12
|
| 4112 |
Downloading nvidia-cudnn-cu12
|
| 4113 |
Downloading torch
|
| 4114 |
+
Installed 69 packages in 509ms
|
| 4115 |
</div>
|
| 4116 |
</div>
|
| 4117 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4118 |
+
Fetching 3 files: 33%|███▎ | 1/3 [00:06<00:12, 6.49s/it]
|
| 4119 |
+
Fetching 3 files: 67%|██████▋ | 2/3 [00:07<00:03, 3.44s/it]
|
| 4120 |
+
Fetching 3 files: 100%|██████████| 3/3 [00:07<00:00, 2.60s/it]
|
| 4121 |
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
|
| 4122 |
|
| 4123 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4124 |
+
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.35s/it]
|
| 4125 |
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.25s/it]
|
| 4126 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
|
| 4127 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it]
|
| 4128 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4129 |
|
| 4130 |
Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 4131 |
+
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:10, 6.31it/s]
|
| 4132 |
+
Fetching 66 files: 14%|█▎ | 9/66 [00:00<00:02, 26.39it/s]
|
| 4133 |
+
Fetching 66 files: 26%|██▌ | 17/66 [00:01<00:03, 12.42it/s]
|
| 4134 |
+
Fetching 66 files: 74%|███████▍ | 49/66 [00:01<00:00, 45.00it/s]
|
| 4135 |
+
Fetching 66 files: 91%|█████████ | 60/66 [00:01<00:00, 45.67it/s]
|
| 4136 |
+
Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 34.31it/s]
|
| 4137 |
+
/tmp/uvnote-run-_uergc47/home/.cache/uv/environments-v2/setup2-adf2810b697d7b08/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
|
|
|
|
|
|
| 4138 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4139 |
warnings.warn(
|
| 4140 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
|
|
| 4161 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4162 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4163 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4164 |
+
/tmp/uvnote-run-_uergc47/home/.cache/uv/environments-v2/setup2-adf2810b697d7b08/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 4165 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4166 |
warnings.warn(
|
| 4167 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
|
|
| 4198 |
<span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
|
| 4199 |
<span id="uv-indicator-setup" onclick="toggleUvLogsFromHeader('setup')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4200 |
</span> |
|
| 4201 |
+
Cell: setup | 109.23s
|
| 4202 |
| <button class="run-btn" onclick="runCell('setup')">▶ run</button>
|
| 4203 |
<button class="copy-btn" onclick="copyCell('setup')">Copy</button>
|
| 4204 |
<a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4457 |
|
| 4458 |
What is Tensor Parallelism?
|
| 4459 |
|
| 4460 |
+
<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical use cases, differences from data parallelism, pipeline parallelism, model parallelism. Provide example: splitting a fully connected layer's weight matrix across GPUs. Provide mention of frameworks: Megatron-LM, DeepSpeed, etc. Provide explanation of how forward/backward passes are computed. Provide mention of communication overhead, scaling, etc. Provide mention of "tensor parallelism" as part of "model parallelism" but specifically splitting tensors. Provide mention of "tensor parallelism" in context of transformer layers: splitting attention heads, feed-forward layers. Provide mention of "tensor parallelism" in context of "DeepSpeed ZeRO Stage 3" or "Megatron-LM's tensor parallelism". Provide mention of "tensor parallelism" as "model parallelism across the weight matrices" and "tensor parallelism" vs "pipeline parallelism". Provide mention of "tensor parallelism" as "splitting the weight matrix across GPUs, each GPU holds a slice of the matrix, and the input is broadcasted,
|
| 4461 |
+
Generation took 26.26 seconds
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4462 |
</div>
|
| 4463 |
<div class="uv-install-logs" id="uv-logs-setup">
|
| 4464 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
|
|
|
| 4467 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 4468 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4469 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4470 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4471 |
+
Downloading pillow (6.3MiB)
|
| 4472 |
+
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4473 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4474 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4475 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4476 |
+
Downloading numpy (15.9MiB)
|
| 4477 |
+
Downloading hf-xet (3.0MiB)
|
| 4478 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4479 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4480 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4481 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4482 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4483 |
+
Downloading pygments (1.2MiB)
|
| 4484 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4485 |
+
Downloading jedi (1.5MiB)
|
| 4486 |
+
Downloading sympy (6.0MiB)
|
| 4487 |
Downloading kiwisolver (1.4MiB)
|
| 4488 |
+
Downloading matplotlib (8.3MiB)
|
| 4489 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4490 |
+
Downloading networkx (1.9MiB)
|
| 4491 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4492 |
+
Downloading tokenizers (3.1MiB)
|
| 4493 |
Downloading fonttools (4.7MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4494 |
Downloading triton (148.4MiB)
|
| 4495 |
Downloading torch (846.8MiB)
|
| 4496 |
Downloading nvidia-cufile-cu12
|
| 4497 |
Downloading kiwisolver
|
| 4498 |
Downloading pygments
|
|
|
|
| 4499 |
Downloading hf-xet
|
| 4500 |
+
Downloading tokenizers
|
| 4501 |
Downloading networkx
|
| 4502 |
Downloading fonttools
|
| 4503 |
Downloading pillow
|
|
|
|
| 4513 |
Downloading triton
|
| 4514 |
Downloading nvidia-cufft-cu12
|
| 4515 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4516 |
Downloading nvidia-cusparselt-cu12
|
| 4517 |
+
Downloading nvidia-cusparse-cu12
|
| 4518 |
Downloading nvidia-nccl-cu12
|
| 4519 |
Downloading nvidia-cublas-cu12
|
| 4520 |
Downloading nvidia-cudnn-cu12
|
| 4521 |
Downloading torch
|
| 4522 |
+
Installed 69 packages in 464ms
|
| 4523 |
</div>
|
| 4524 |
</div>
|
| 4525 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4526 |
+
Fetching 3 files: 33%|███▎ | 1/3 [00:07<00:14, 7.38s/it]
|
| 4527 |
+
Fetching 3 files: 67%|██████▋ | 2/3 [00:08<00:03, 3.64s/it]
|
| 4528 |
+
Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00, 2.80s/it]
|
| 4529 |
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
|
| 4530 |
|
| 4531 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4532 |
+
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.34s/it]
|
| 4533 |
+
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.25s/it]
|
| 4534 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
|
| 4535 |
+
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it]
|
| 4536 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4537 |
|
| 4538 |
Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 4539 |
+
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:00, 5.44it/s]
|
| 4540 |
+
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 6.96it/s]
|
| 4541 |
+
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 13.54it/s]
|
| 4542 |
+
/tmp/uvnote-run-jc1wbhvj/home/.cache/uv/environments-v2/setup-1400c3ff0fc01263/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 4543 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4544 |
warnings.warn(
|
| 4545 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
|
|
|
| 4566 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4567 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4568 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4569 |
+
/tmp/uvnote-run-jc1wbhvj/home/.cache/uv/environments-v2/setup-1400c3ff0fc01263/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 4570 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4571 |
warnings.warn(
|
| 4572 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
megablocks_yamoe/torch_profile.html
CHANGED
|
@@ -3720,7 +3720,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3720 |
<span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
|
| 3721 |
<span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3722 |
</span> |
|
| 3723 |
-
Cell: utils | deps: torch, numpy |
|
| 3724 |
| <button class="run-btn" onclick="runCell('utils')">▶ run</button>
|
| 3725 |
<button class="copy-btn" onclick="copyCell('utils')">Copy</button>
|
| 3726 |
<a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3794,24 +3794,24 @@ Cell: utils | deps: torch, numpy | 34.25s
|
|
| 3794 |
<div class="uv-install-logs" id="uv-logs-utils">
|
| 3795 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3796 |
<div class="uv-logs-content" style="display: none;">
|
| 3797 |
-
Downloading
|
| 3798 |
Downloading setuptools (1.1MiB)
|
| 3799 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3800 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3801 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3802 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3803 |
Downloading numpy (16.2MiB)
|
| 3804 |
-
Downloading
|
| 3805 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3806 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3807 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 3808 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3809 |
-
Downloading
|
| 3810 |
-
Downloading nvidia-
|
|
|
|
| 3811 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3812 |
-
Downloading triton (148.3MiB)
|
| 3813 |
Downloading torch (846.9MiB)
|
| 3814 |
-
Downloading
|
| 3815 |
Downloading nvidia-cufile-cu12
|
| 3816 |
Downloading setuptools
|
| 3817 |
Downloading networkx
|
|
@@ -3824,13 +3824,13 @@ Downloading networkx (1.9MiB)
|
|
| 3824 |
Downloading triton
|
| 3825 |
Downloading nvidia-cufft-cu12
|
| 3826 |
Downloading nvidia-cusolver-cu12
|
| 3827 |
-
Downloading nvidia-cusparselt-cu12
|
| 3828 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 3829 |
Downloading nvidia-nccl-cu12
|
| 3830 |
Downloading nvidia-cublas-cu12
|
| 3831 |
Downloading nvidia-cudnn-cu12
|
| 3832 |
Downloading torch
|
| 3833 |
-
Installed 26 packages in
|
| 3834 |
</div>
|
| 3835 |
</div>
|
| 3836 |
</div>
|
|
@@ -3843,7 +3843,7 @@ Installed 26 packages in 446ms
|
|
| 3843 |
<span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: bench_utils | deps: torch, numpy |
|
| 3847 |
| <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
|
| 3849 |
<a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4331,24 +4331,24 @@ Cell: bench_utils | deps: torch, numpy | 35.45s
|
|
| 4331 |
<div class="uv-install-logs" id="uv-logs-bench_utils">
|
| 4332 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4333 |
<div class="uv-logs-content" style="display: none;">
|
| 4334 |
-
Downloading numpy (16.2MiB)
|
| 4335 |
-
Downloading torch (846.9MiB)
|
| 4336 |
-
Downloading triton (148.3MiB)
|
| 4337 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4338 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4339 |
Downloading setuptools (1.1MiB)
|
| 4340 |
-
Downloading nvidia-
|
| 4341 |
-
Downloading nvidia-
|
| 4342 |
-
Downloading nvidia-
|
| 4343 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4344 |
-
Downloading nvidia-
|
| 4345 |
Downloading sympy (6.0MiB)
|
| 4346 |
-
Downloading networkx (1.9MiB)
|
| 4347 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4348 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4349 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4350 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4351 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4352 |
Downloading nvidia-cufile-cu12
|
| 4353 |
Downloading setuptools
|
| 4354 |
Downloading networkx
|
|
@@ -4367,7 +4367,7 @@ Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
| 4367 |
Downloading nvidia-cublas-cu12
|
| 4368 |
Downloading nvidia-cudnn-cu12
|
| 4369 |
Downloading torch
|
| 4370 |
-
Installed 26 packages in
|
| 4371 |
</div>
|
| 4372 |
</div>
|
| 4373 |
</div>
|
|
@@ -4381,7 +4381,7 @@ Installed 26 packages in 445ms
|
|
| 4381 |
<span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
|
| 4382 |
<span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4383 |
</span> |
|
| 4384 |
-
Cell: config | deps: torch, numpy | 34.
|
| 4385 |
| <button class="run-btn" onclick="runCell('config')">▶ run</button>
|
| 4386 |
<button class="copy-btn" onclick="copyCell('config')">Copy</button>
|
| 4387 |
<a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4441,24 +4441,24 @@ Cell: config | deps: torch, numpy | 34.31s
|
|
| 4441 |
<div class="uv-install-logs" id="uv-logs-config">
|
| 4442 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4443 |
<div class="uv-logs-content" style="display: none;">
|
| 4444 |
-
Downloading
|
| 4445 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4446 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4447 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4448 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4449 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4450 |
-
Downloading torch (846.9MiB)
|
| 4451 |
-
Downloading networkx (1.9MiB)
|
| 4452 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
|
|
|
| 4453 |
Downloading setuptools (1.1MiB)
|
| 4454 |
-
Downloading
|
| 4455 |
-
Downloading nvidia-
|
|
|
|
|
|
|
|
|
|
| 4456 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
| 4457 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4458 |
-
Downloading nvidia-
|
| 4459 |
-
Downloading
|
| 4460 |
-
Downloading triton (148.3MiB)
|
| 4461 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4462 |
Downloading nvidia-cufile-cu12
|
| 4463 |
Downloading setuptools
|
| 4464 |
Downloading networkx
|
|
@@ -4471,13 +4471,13 @@ Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
| 4471 |
Downloading triton
|
| 4472 |
Downloading nvidia-cufft-cu12
|
| 4473 |
Downloading nvidia-cusolver-cu12
|
| 4474 |
-
Downloading nvidia-cusparse-cu12
|
| 4475 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 4476 |
Downloading nvidia-nccl-cu12
|
| 4477 |
Downloading nvidia-cublas-cu12
|
| 4478 |
Downloading nvidia-cudnn-cu12
|
| 4479 |
Downloading torch
|
| 4480 |
-
Installed 26 packages in
|
| 4481 |
</div>
|
| 4482 |
</div>
|
| 4483 |
</div>
|
|
@@ -4490,7 +4490,7 @@ Installed 26 packages in 450ms
|
|
| 4490 |
<span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
|
| 4491 |
<span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4492 |
</span> |
|
| 4493 |
-
Cell: save_data | deps: torch, numpy |
|
| 4494 |
| <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
|
| 4495 |
<button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
|
| 4496 |
<a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4585,24 +4585,24 @@ Down sum: 206.729263
|
|
| 4585 |
<div class="uv-install-logs" id="uv-logs-save_data">
|
| 4586 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4587 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
| 4588 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4589 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4590 |
-
Downloading numpy (16.2MiB)
|
| 4591 |
Downloading setuptools (1.1MiB)
|
| 4592 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4593 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4594 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4595 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4596 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4597 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4598 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4599 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4600 |
-
Downloading nvidia-
|
| 4601 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4602 |
Downloading sympy (6.0MiB)
|
| 4603 |
Downloading torch (846.9MiB)
|
| 4604 |
-
Downloading
|
| 4605 |
-
Downloading
|
|
|
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading setuptools
|
| 4608 |
Downloading networkx
|
|
@@ -4621,16 +4621,16 @@ Downloading triton (148.3MiB)
|
|
| 4621 |
Downloading nvidia-cublas-cu12
|
| 4622 |
Downloading nvidia-cudnn-cu12
|
| 4623 |
Downloading torch
|
| 4624 |
-
Installed 26 packages in
|
| 4625 |
</div>
|
| 4626 |
</div>
|
| 4627 |
<div class="cell-artifacts">
|
| 4628 |
<h4>Artifacts:</h4>
|
| 4629 |
-
<a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
|
| 4630 |
<a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
|
| 4631 |
<a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
|
| 4632 |
<a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
|
| 4633 |
<a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
|
|
|
|
| 4634 |
<a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
|
| 4635 |
</div>
|
| 4636 |
</div>
|
|
@@ -4645,7 +4645,7 @@ Installed 26 packages in 446ms
|
|
| 4645 |
<span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
|
| 4646 |
<span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4647 |
</span> |
|
| 4648 |
-
Cell: yamoe_run | deps: torch, kernels, numpy |
|
| 4649 |
| <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
|
| 4650 |
<button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
|
| 4651 |
<a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4938,10 +4938,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 4938 |
|
| 4939 |
Warming up (10 iterations)...
|
| 4940 |
Benchmarking (50 iterations)...
|
| 4941 |
-
Progress: 20% complete (avg: 4.
|
| 4942 |
-
Progress: 40% complete (avg: 4.
|
| 4943 |
-
Progress: 60% complete (avg: 4.
|
| 4944 |
-
Progress: 80% complete (avg: 4.
|
| 4945 |
|
| 4946 |
Output tensors:
|
| 4947 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
@@ -4951,19 +4951,19 @@ Output tensors:
|
|
| 4951 |
Iterations: 50
|
| 4952 |
|
| 4953 |
Latency Statistics:
|
| 4954 |
-
Average: 4.
|
| 4955 |
-
Min: 4.
|
| 4956 |
-
Max: 4.
|
| 4957 |
-
Std Dev: 0.
|
| 4958 |
|
| 4959 |
Percentiles:
|
| 4960 |
P50 (median): 4.252 ms
|
| 4961 |
-
P95: 4.
|
| 4962 |
-
P99: 4.
|
| 4963 |
|
| 4964 |
Throughput:
|
| 4965 |
-
Tokens/sec:
|
| 4966 |
-
Std Dev:
|
| 4967 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 4968 |
|
| 4969 |
Saved benchmark results to yamoe_results.json
|
|
@@ -4973,25 +4973,25 @@ Output sum: 3.971905
|
|
| 4973 |
<div class="uv-install-logs" id="uv-logs-yamoe_run">
|
| 4974 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4975 |
<div class="uv-logs-content" style="display: none;">
|
| 4976 |
-
Downloading hf-xet (3.0MiB)
|
| 4977 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4978 |
Downloading networkx (1.9MiB)
|
| 4979 |
-
Downloading
|
| 4980 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4981 |
Downloading setuptools (1.1MiB)
|
| 4982 |
-
Downloading nvidia-
|
|
|
|
| 4983 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
|
|
|
| 4984 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4985 |
-
Downloading
|
|
|
|
|
|
|
| 4986 |
Downloading torch (846.9MiB)
|
| 4987 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4988 |
-
Downloading sympy (6.0MiB)
|
| 4989 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4990 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4991 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4992 |
-
Downloading
|
| 4993 |
-
Downloading nvidia-
|
| 4994 |
-
Downloading
|
| 4995 |
Downloading nvidia-cufile-cu12
|
| 4996 |
Downloading hf-xet
|
| 4997 |
Downloading setuptools
|
|
@@ -5011,13 +5011,14 @@ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
| 5011 |
Downloading nvidia-cublas-cu12
|
| 5012 |
Downloading nvidia-cudnn-cu12
|
| 5013 |
Downloading torch
|
| 5014 |
-
Installed 37 packages in
|
| 5015 |
</div>
|
| 5016 |
</div>
|
| 5017 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 5018 |
-
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:
|
| 5019 |
-
Fetching 6 files:
|
| 5020 |
-
Fetching 6 files:
|
|
|
|
| 5021 |
<div class="cell-artifacts">
|
| 5022 |
<h4>Artifacts:</h4>
|
| 5023 |
<a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
|
|
@@ -5034,7 +5035,7 @@ Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 8.2
|
|
| 5034 |
<span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
|
| 5035 |
<span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5036 |
</span> |
|
| 5037 |
-
Cell: binned_run | deps: torch, numpy |
|
| 5038 |
| <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
|
| 5039 |
<button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
|
| 5040 |
<a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -5448,10 +5449,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 5448 |
|
| 5449 |
Warming up (10 iterations)...
|
| 5450 |
Benchmarking (50 iterations)...
|
| 5451 |
-
Progress: 20% complete (avg: 37.
|
| 5452 |
-
Progress: 40% complete (avg: 37.
|
| 5453 |
-
Progress: 60% complete (avg:
|
| 5454 |
-
Progress: 80% complete (avg: 36.
|
| 5455 |
|
| 5456 |
Output tensors:
|
| 5457 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
@@ -5461,19 +5462,19 @@ Output tensors:
|
|
| 5461 |
Iterations: 50
|
| 5462 |
|
| 5463 |
Latency Statistics:
|
| 5464 |
-
Average:
|
| 5465 |
-
Min:
|
| 5466 |
-
Max:
|
| 5467 |
-
Std Dev: 1.
|
| 5468 |
|
| 5469 |
Percentiles:
|
| 5470 |
-
P50 (median): 36.
|
| 5471 |
-
P95: 38.
|
| 5472 |
-
P99:
|
| 5473 |
|
| 5474 |
Throughput:
|
| 5475 |
-
Tokens/sec:
|
| 5476 |
-
Std Dev:
|
| 5477 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5478 |
|
| 5479 |
Saved benchmark results to binned_results.json
|
|
@@ -5483,24 +5484,24 @@ Output sum: 3.971905
|
|
| 5483 |
<div class="uv-install-logs" id="uv-logs-binned_run">
|
| 5484 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5485 |
<div class="uv-logs-content" style="display: none;">
|
| 5486 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5487 |
-
Downloading sympy (6.0MiB)
|
| 5488 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5489 |
-
Downloading setuptools (1.1MiB)
|
| 5490 |
-
Downloading numpy (16.2MiB)
|
| 5491 |
Downloading networkx (1.9MiB)
|
| 5492 |
-
Downloading
|
|
|
|
| 5493 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5494 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5495 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5496 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5497 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5498 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5499 |
-
Downloading nvidia-
|
| 5500 |
-
Downloading nvidia-
|
| 5501 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5502 |
-
Downloading
|
|
|
|
|
|
|
| 5503 |
Downloading triton (148.3MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5504 |
Downloading nvidia-cufile-cu12
|
| 5505 |
Downloading setuptools
|
| 5506 |
Downloading networkx
|
|
@@ -5513,13 +5514,13 @@ Downloading triton (148.3MiB)
|
|
| 5513 |
Downloading triton
|
| 5514 |
Downloading nvidia-cufft-cu12
|
| 5515 |
Downloading nvidia-cusolver-cu12
|
| 5516 |
-
Downloading nvidia-cusparselt-cu12
|
| 5517 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 5518 |
Downloading nvidia-nccl-cu12
|
| 5519 |
Downloading nvidia-cublas-cu12
|
| 5520 |
Downloading nvidia-cudnn-cu12
|
| 5521 |
Downloading torch
|
| 5522 |
-
Installed 26 packages in
|
| 5523 |
</div>
|
| 5524 |
</div>
|
| 5525 |
<div class="cell-artifacts">
|
|
@@ -5538,7 +5539,7 @@ Installed 26 packages in 446ms
|
|
| 5538 |
<span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
|
| 5539 |
<span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5540 |
</span> |
|
| 5541 |
-
Cell: gptoss_run | deps: torch, numpy |
|
| 5542 |
| <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
|
| 5543 |
<button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
|
| 5544 |
<a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -5856,10 +5857,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 5856 |
|
| 5857 |
Warming up (10 iterations)...
|
| 5858 |
Benchmarking (50 iterations)...
|
| 5859 |
-
Progress: 20% complete (avg:
|
| 5860 |
-
Progress: 40% complete (avg:
|
| 5861 |
-
Progress: 60% complete (avg:
|
| 5862 |
-
Progress: 80% complete (avg:
|
| 5863 |
|
| 5864 |
Output tensors:
|
| 5865 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
@@ -5869,19 +5870,19 @@ Output tensors:
|
|
| 5869 |
Iterations: 50
|
| 5870 |
|
| 5871 |
Latency Statistics:
|
| 5872 |
-
Average:
|
| 5873 |
-
Min:
|
| 5874 |
-
Max:
|
| 5875 |
-
Std Dev: 3.
|
| 5876 |
|
| 5877 |
Percentiles:
|
| 5878 |
-
P50 (median):
|
| 5879 |
-
P95:
|
| 5880 |
-
P99:
|
| 5881 |
|
| 5882 |
Throughput:
|
| 5883 |
-
Tokens/sec:
|
| 5884 |
-
Std Dev:
|
| 5885 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5886 |
|
| 5887 |
Saved benchmark results to gptoss_results.json
|
|
@@ -5891,23 +5892,23 @@ Output sum: 11.532237
|
|
| 5891 |
<div class="uv-install-logs" id="uv-logs-gptoss_run">
|
| 5892 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5893 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
| 5894 |
Downloading setuptools (1.1MiB)
|
| 5895 |
-
Downloading nvidia-
|
| 5896 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5897 |
-
Downloading numpy (16.2MiB)
|
| 5898 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5899 |
-
Downloading sympy (6.0MiB)
|
| 5900 |
-
Downloading torch (846.9MiB)
|
| 5901 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 5902 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5903 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5904 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5905 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5906 |
-
Downloading networkx (1.9MiB)
|
| 5907 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
| 5908 |
Downloading triton (148.3MiB)
|
| 5909 |
-
Downloading nvidia-
|
| 5910 |
-
Downloading nvidia-
|
|
|
|
|
|
|
| 5911 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5912 |
Downloading nvidia-cufile-cu12
|
| 5913 |
Downloading setuptools
|
|
@@ -5921,13 +5922,13 @@ Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
| 5921 |
Downloading triton
|
| 5922 |
Downloading nvidia-cufft-cu12
|
| 5923 |
Downloading nvidia-cusolver-cu12
|
| 5924 |
-
Downloading nvidia-cusparselt-cu12
|
| 5925 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 5926 |
Downloading nvidia-nccl-cu12
|
| 5927 |
Downloading nvidia-cublas-cu12
|
| 5928 |
Downloading nvidia-cudnn-cu12
|
| 5929 |
Downloading torch
|
| 5930 |
-
Installed 26 packages in
|
| 5931 |
</div>
|
| 5932 |
</div>
|
| 5933 |
<div class="cell-artifacts">
|
|
@@ -5946,7 +5947,7 @@ Installed 26 packages in 442ms
|
|
| 5946 |
<span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
|
| 5947 |
<span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5948 |
</span> |
|
| 5949 |
-
Cell: gptoss_training_run | deps: torch, numpy |
|
| 5950 |
| <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
|
| 5951 |
<button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
|
| 5952 |
<a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6247,10 +6248,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 6247 |
|
| 6248 |
Warming up (10 iterations)...
|
| 6249 |
Benchmarking (50 iterations)...
|
| 6250 |
-
Progress: 20% complete (avg: 48.
|
| 6251 |
-
Progress: 40% complete (avg:
|
| 6252 |
-
Progress: 60% complete (avg: 47.
|
| 6253 |
-
Progress: 80% complete (avg:
|
| 6254 |
|
| 6255 |
Output tensors:
|
| 6256 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
@@ -6260,19 +6261,19 @@ Output tensors:
|
|
| 6260 |
Iterations: 50
|
| 6261 |
|
| 6262 |
Latency Statistics:
|
| 6263 |
-
Average:
|
| 6264 |
-
Min: 38.
|
| 6265 |
-
Max: 49.
|
| 6266 |
-
Std Dev: 2.
|
| 6267 |
|
| 6268 |
Percentiles:
|
| 6269 |
-
P50 (median):
|
| 6270 |
-
P95: 48.
|
| 6271 |
-
P99:
|
| 6272 |
|
| 6273 |
Throughput:
|
| 6274 |
-
Tokens/sec:
|
| 6275 |
-
Std Dev:
|
| 6276 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6277 |
|
| 6278 |
Saved benchmark results to gptoss_training_results.json
|
|
@@ -6282,24 +6283,24 @@ Output sum: 11.532237
|
|
| 6282 |
<div class="uv-install-logs" id="uv-logs-gptoss_training_run">
|
| 6283 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6284 |
<div class="uv-logs-content" style="display: none;">
|
| 6285 |
-
Downloading nvidia-
|
| 6286 |
-
Downloading networkx (1.9MiB)
|
| 6287 |
Downloading setuptools (1.1MiB)
|
| 6288 |
-
Downloading
|
|
|
|
| 6289 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6290 |
-
Downloading nvidia-
|
|
|
|
|
|
|
| 6291 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6292 |
-
Downloading
|
|
|
|
|
|
|
| 6293 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6294 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6295 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6296 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6297 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
|
|
|
| 6298 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6299 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6300 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6301 |
Downloading triton (148.3MiB)
|
| 6302 |
-
Downloading torch (846.9MiB)
|
| 6303 |
Downloading nvidia-cufile-cu12
|
| 6304 |
Downloading setuptools
|
| 6305 |
Downloading networkx
|
|
@@ -6318,7 +6319,7 @@ Downloading torch (846.9MiB)
|
|
| 6318 |
Downloading nvidia-cublas-cu12
|
| 6319 |
Downloading nvidia-cudnn-cu12
|
| 6320 |
Downloading torch
|
| 6321 |
-
Installed 26 packages in
|
| 6322 |
</div>
|
| 6323 |
</div>
|
| 6324 |
<div class="cell-artifacts">
|
|
@@ -6337,7 +6338,7 @@ Installed 26 packages in 448ms
|
|
| 6337 |
<span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
|
| 6338 |
<span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6339 |
</span> |
|
| 6340 |
-
Cell: megablocks_run | deps: torch, numpy, kernels |
|
| 6341 |
| <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
|
| 6342 |
<button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
|
| 6343 |
<a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6492,7 +6493,7 @@ Cell: megablocks_run | deps: torch, numpy, kernels | 41.38s | FAILED
|
|
| 6492 |
<span class="c1"># Attach loaded expert weights to the experts container</span>
|
| 6493 |
<span class="n">e</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">experts</span>
|
| 6494 |
<span class="n">e</span><span class="o">.</span><span class="n">alpha</span> <span class="o">=</span> <span class="mf">1.702</span>
|
| 6495 |
-
<span class="n">e</span><span class="o">.</span><span class="n">capacity_factor</span> <span class="o">=</span> <span class="mi">
|
| 6496 |
<span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
|
| 6497 |
<span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj_bias</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj_bias</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
|
| 6498 |
<span class="n">e</span><span class="o">.</span><span class="n">down_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">down_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
|
|
@@ -6569,25 +6570,25 @@ Warming up (10 iterations)...
|
|
| 6569 |
<div class="uv-install-logs" id="uv-logs-megablocks_run">
|
| 6570 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6571 |
<div class="uv-logs-content" style="display: none;">
|
| 6572 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6573 |
-
Downloading setuptools (1.1MiB)
|
| 6574 |
Downloading numpy (16.2MiB)
|
| 6575 |
-
Downloading networkx (1.9MiB)
|
| 6576 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6577 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6578 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6579 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6580 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6581 |
-
Downloading torch (846.9MiB)
|
| 6582 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6583 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6584 |
Downloading hf-xet (3.0MiB)
|
| 6585 |
-
Downloading
|
|
|
|
| 6586 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6587 |
-
Downloading nvidia-
|
| 6588 |
-
Downloading triton (148.3MiB)
|
| 6589 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6590 |
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6591 |
Downloading nvidia-cufile-cu12
|
| 6592 |
Downloading hf-xet
|
| 6593 |
Downloading setuptools
|
|
@@ -6601,26 +6602,25 @@ Downloading sympy (6.0MiB)
|
|
| 6601 |
Downloading triton
|
| 6602 |
Downloading nvidia-cufft-cu12
|
| 6603 |
Downloading nvidia-cusolver-cu12
|
| 6604 |
-
Downloading nvidia-cusparselt-cu12
|
| 6605 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 6606 |
Downloading nvidia-nccl-cu12
|
| 6607 |
Downloading nvidia-cublas-cu12
|
| 6608 |
Downloading nvidia-cudnn-cu12
|
| 6609 |
Downloading torch
|
| 6610 |
-
Installed 37 packages in
|
| 6611 |
</div>
|
| 6612 |
</div>
|
| 6613 |
<div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 6614 |
-
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:
|
| 6615 |
-
Fetching 66 files:
|
| 6616 |
-
Fetching 66 files:
|
| 6617 |
-
Fetching 66 files:
|
| 6618 |
-
Fetching 66 files:
|
| 6619 |
-
Fetching 66 files:
|
| 6620 |
-
Fetching 66 files:
|
| 6621 |
-
Fetching 66 files: 100%|██████████| 66/66 [00:
|
| 6622 |
-
|
| 6623 |
-
/tmp/tmpq5pei8xr/cuda_utils.c:5:10: fatal error: Python.h: No such file or directory
|
| 6624 |
5 | #include <Python.h>
|
| 6625 |
| ^~~~~~~~~~
|
| 6626 |
compilation terminated.
|
|
@@ -6637,87 +6637,87 @@ Traceback (most recent call last):
|
|
| 6637 |
File "/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/bench_utils.py", line 177, in <lambda>
|
| 6638 |
call = lambda x: fn(x, *args[1:], **kwargs)
|
| 6639 |
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6640 |
-
File "/tmp/uvnote-run-
|
| 6641 |
return self._call_impl(*args, **kwargs)
|
| 6642 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6643 |
-
File "/tmp/uvnote-run-
|
| 6644 |
return forward_call(*args, **kwargs)
|
| 6645 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6646 |
File "/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/megablocks_run.py", line 81, in forward
|
| 6647 |
output, dummy_routing_weights = self.model(hidden_states)
|
| 6648 |
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6649 |
-
File "/tmp/uvnote-run-
|
| 6650 |
return self._call_impl(*args, **kwargs)
|
| 6651 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6652 |
-
File "/tmp/uvnote-run-
|
| 6653 |
return forward_call(*args, **kwargs)
|
| 6654 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6655 |
-
File "/tmp/uvnote-run-
|
| 6656 |
output, expert_weights_out, *_ = moe_forward(
|
| 6657 |
^^^^^^^^^^^^
|
| 6658 |
-
File "/tmp/uvnote-run-
|
| 6659 |
x, tokens_per_expert = forward_fn(**forward_args)
|
| 6660 |
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6661 |
-
File "/tmp/uvnote-run-
|
| 6662 |
x = permute_and_compute(
|
| 6663 |
^^^^^^^^^^^^^^^^^^^^
|
| 6664 |
-
File "/tmp/uvnote-run-
|
| 6665 |
x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
|
| 6666 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6667 |
-
File "/tmp/uvnote-run-
|
| 6668 |
return super().apply(*args, **kwargs) # type: ignore[misc]
|
| 6669 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6670 |
-
File "/tmp/uvnote-run-
|
| 6671 |
return fwd(*args, **kwargs)
|
| 6672 |
^^^^^^^^^^^^^^^^^^^^
|
| 6673 |
-
File "/tmp/uvnote-run-
|
| 6674 |
return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
|
| 6675 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6676 |
-
File "/tmp/uvnote-run-
|
| 6677 |
_binned_copy[(num_experts, expert_capacity)](
|
| 6678 |
-
File "/tmp/uvnote-run-
|
| 6679 |
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
|
| 6680 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6681 |
-
File "/tmp/uvnote-run-
|
| 6682 |
benchmark()
|
| 6683 |
-
File "/tmp/uvnote-run-
|
| 6684 |
timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
|
| 6685 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6686 |
-
File "/tmp/uvnote-run-
|
| 6687 |
timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
|
| 6688 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6689 |
-
File "/tmp/uvnote-run-
|
| 6690 |
return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
|
| 6691 |
^^^^^^^^^^^^^
|
| 6692 |
File "/usr/lib/python3.11/functools.py", line 1001, in __get__
|
| 6693 |
val = self.func(instance)
|
| 6694 |
^^^^^^^^^^^^^^^^^^^
|
| 6695 |
-
File "/tmp/uvnote-run-
|
| 6696 |
return driver.active.get_benchmarker()
|
| 6697 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6698 |
-
File "/tmp/uvnote-run-
|
| 6699 |
return getattr(self._initialize_obj(), name)
|
| 6700 |
^^^^^^^^^^^^^^^^^^^^^^
|
| 6701 |
-
File "/tmp/uvnote-run-
|
| 6702 |
self._obj = self._init_fn()
|
| 6703 |
^^^^^^^^^^^^^^^
|
| 6704 |
-
File "/tmp/uvnote-run-
|
| 6705 |
return active_drivers[0]()
|
| 6706 |
^^^^^^^^^^^^^^^^^^^
|
| 6707 |
-
File "/tmp/uvnote-run-
|
| 6708 |
self.utils = CudaUtils() # TODO: make static
|
| 6709 |
^^^^^^^^^^^
|
| 6710 |
-
File "/tmp/uvnote-run-
|
| 6711 |
mod = compile_module_from_src(
|
| 6712 |
^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6713 |
-
File "/tmp/uvnote-run-
|
| 6714 |
so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [])
|
| 6715 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6716 |
-
File "/tmp/uvnote-run-
|
| 6717 |
subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL)
|
| 6718 |
File "/usr/lib/python3.11/subprocess.py", line 413, in check_call
|
| 6719 |
raise CalledProcessError(retcode, cmd)
|
| 6720 |
-
subprocess.CalledProcessError: Command '['/usr/bin/gcc', '/tmp/
|
| 6721 |
</div>
|
| 6722 |
</div>
|
| 6723 |
|
|
|
|
| 3720 |
<span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
|
| 3721 |
<span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3722 |
</span> |
|
| 3723 |
+
Cell: utils | deps: torch, numpy | 35.29s
|
| 3724 |
| <button class="run-btn" onclick="runCell('utils')">▶ run</button>
|
| 3725 |
<button class="copy-btn" onclick="copyCell('utils')">Copy</button>
|
| 3726 |
<a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3794 |
<div class="uv-install-logs" id="uv-logs-utils">
|
| 3795 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3796 |
<div class="uv-logs-content" style="display: none;">
|
| 3797 |
+
Downloading networkx (1.9MiB)
|
| 3798 |
Downloading setuptools (1.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3799 |
Downloading numpy (16.2MiB)
|
| 3800 |
+
Downloading sympy (6.0MiB)
|
|
|
|
| 3801 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3802 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3803 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3804 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3805 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3806 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3807 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3808 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3809 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3810 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3811 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3812 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
| 3813 |
Downloading torch (846.9MiB)
|
| 3814 |
+
Downloading triton (148.3MiB)
|
| 3815 |
Downloading nvidia-cufile-cu12
|
| 3816 |
Downloading setuptools
|
| 3817 |
Downloading networkx
|
|
|
|
| 3824 |
Downloading triton
|
| 3825 |
Downloading nvidia-cufft-cu12
|
| 3826 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 3827 |
Downloading nvidia-cusparse-cu12
|
| 3828 |
+
Downloading nvidia-cusparselt-cu12
|
| 3829 |
Downloading nvidia-nccl-cu12
|
| 3830 |
Downloading nvidia-cublas-cu12
|
| 3831 |
Downloading nvidia-cudnn-cu12
|
| 3832 |
Downloading torch
|
| 3833 |
+
Installed 26 packages in 455ms
|
| 3834 |
</div>
|
| 3835 |
</div>
|
| 3836 |
</div>
|
|
|
|
| 3843 |
<span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
+
Cell: bench_utils | deps: torch, numpy | 34.44s
|
| 3847 |
| <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
|
| 3849 |
<a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4331 |
<div class="uv-install-logs" id="uv-logs-bench_utils">
|
| 4332 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4333 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4334 |
Downloading setuptools (1.1MiB)
|
| 4335 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4336 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4337 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4338 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4339 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4340 |
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
|
|
|
| 4341 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4342 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4343 |
+
Downloading triton (148.3MiB)
|
| 4344 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4345 |
+
Downloading torch (846.9MiB)
|
| 4346 |
+
Downloading networkx (1.9MiB)
|
| 4347 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4348 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4349 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4350 |
+
Downloading numpy (16.2MiB)
|
| 4351 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4352 |
Downloading nvidia-cufile-cu12
|
| 4353 |
Downloading setuptools
|
| 4354 |
Downloading networkx
|
|
|
|
| 4367 |
Downloading nvidia-cublas-cu12
|
| 4368 |
Downloading nvidia-cudnn-cu12
|
| 4369 |
Downloading torch
|
| 4370 |
+
Installed 26 packages in 447ms
|
| 4371 |
</div>
|
| 4372 |
</div>
|
| 4373 |
</div>
|
|
|
|
| 4381 |
<span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
|
| 4382 |
<span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4383 |
</span> |
|
| 4384 |
+
Cell: config | deps: torch, numpy | 34.69s
|
| 4385 |
| <button class="run-btn" onclick="runCell('config')">▶ run</button>
|
| 4386 |
<button class="copy-btn" onclick="copyCell('config')">Copy</button>
|
| 4387 |
<a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4441 |
<div class="uv-install-logs" id="uv-logs-config">
|
| 4442 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4443 |
<div class="uv-logs-content" style="display: none;">
|
| 4444 |
+
Downloading numpy (16.2MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4445 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4446 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4447 |
+
Downloading torch (846.9MiB)
|
| 4448 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4449 |
Downloading setuptools (1.1MiB)
|
| 4450 |
+
Downloading triton (148.3MiB)
|
| 4451 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4452 |
+
Downloading networkx (1.9MiB)
|
| 4453 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4454 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4455 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4456 |
+
Downloading sympy (6.0MiB)
|
| 4457 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4458 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4459 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4460 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4461 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
|
|
|
| 4462 |
Downloading nvidia-cufile-cu12
|
| 4463 |
Downloading setuptools
|
| 4464 |
Downloading networkx
|
|
|
|
| 4471 |
Downloading triton
|
| 4472 |
Downloading nvidia-cufft-cu12
|
| 4473 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4474 |
Downloading nvidia-cusparselt-cu12
|
| 4475 |
+
Downloading nvidia-cusparse-cu12
|
| 4476 |
Downloading nvidia-nccl-cu12
|
| 4477 |
Downloading nvidia-cublas-cu12
|
| 4478 |
Downloading nvidia-cudnn-cu12
|
| 4479 |
Downloading torch
|
| 4480 |
+
Installed 26 packages in 526ms
|
| 4481 |
</div>
|
| 4482 |
</div>
|
| 4483 |
</div>
|
|
|
|
| 4490 |
<span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
|
| 4491 |
<span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4492 |
</span> |
|
| 4493 |
+
Cell: save_data | deps: torch, numpy | 40.40s
|
| 4494 |
| <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
|
| 4495 |
<button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
|
| 4496 |
<a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4585 |
<div class="uv-install-logs" id="uv-logs-save_data">
|
| 4586 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4587 |
<div class="uv-logs-content" style="display: none;">
|
| 4588 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4589 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4590 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4591 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
|
|
|
| 4592 |
Downloading setuptools (1.1MiB)
|
|
|
|
| 4593 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4594 |
+
Downloading numpy (16.2MiB)
|
| 4595 |
+
Downloading triton (148.3MiB)
|
| 4596 |
+
Downloading networkx (1.9MiB)
|
| 4597 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4598 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4599 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4600 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
| 4601 |
Downloading sympy (6.0MiB)
|
| 4602 |
Downloading torch (846.9MiB)
|
| 4603 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4604 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4605 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading setuptools
|
| 4608 |
Downloading networkx
|
|
|
|
| 4621 |
Downloading nvidia-cublas-cu12
|
| 4622 |
Downloading nvidia-cudnn-cu12
|
| 4623 |
Downloading torch
|
| 4624 |
+
Installed 26 packages in 563ms
|
| 4625 |
</div>
|
| 4626 |
</div>
|
| 4627 |
<div class="cell-artifacts">
|
| 4628 |
<h4>Artifacts:</h4>
|
|
|
|
| 4629 |
<a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
|
| 4630 |
<a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
|
| 4631 |
<a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
|
| 4632 |
<a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
|
| 4633 |
+
<a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
|
| 4634 |
<a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
|
| 4635 |
</div>
|
| 4636 |
</div>
|
|
|
|
| 4645 |
<span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
|
| 4646 |
<span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4647 |
</span> |
|
| 4648 |
+
Cell: yamoe_run | deps: torch, kernels, numpy | 38.77s
|
| 4649 |
| <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
|
| 4650 |
<button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
|
| 4651 |
<a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4938 |
|
| 4939 |
Warming up (10 iterations)...
|
| 4940 |
Benchmarking (50 iterations)...
|
| 4941 |
+
Progress: 20% complete (avg: 4.248 ms)
|
| 4942 |
+
Progress: 40% complete (avg: 4.246 ms)
|
| 4943 |
+
Progress: 60% complete (avg: 4.247 ms)
|
| 4944 |
+
Progress: 80% complete (avg: 4.247 ms)
|
| 4945 |
|
| 4946 |
Output tensors:
|
| 4947 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
|
|
| 4951 |
Iterations: 50
|
| 4952 |
|
| 4953 |
Latency Statistics:
|
| 4954 |
+
Average: 4.248 ms
|
| 4955 |
+
Min: 4.129 ms
|
| 4956 |
+
Max: 4.266 ms
|
| 4957 |
+
Std Dev: 0.021 ms
|
| 4958 |
|
| 4959 |
Percentiles:
|
| 4960 |
P50 (median): 4.252 ms
|
| 4961 |
+
P95: 4.264 ms
|
| 4962 |
+
P99: 4.265 ms
|
| 4963 |
|
| 4964 |
Throughput:
|
| 4965 |
+
Tokens/sec: 23542.6
|
| 4966 |
+
Std Dev: 117.1
|
| 4967 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 4968 |
|
| 4969 |
Saved benchmark results to yamoe_results.json
|
|
|
|
| 4973 |
<div class="uv-install-logs" id="uv-logs-yamoe_run">
|
| 4974 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4975 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
| 4976 |
Downloading networkx (1.9MiB)
|
| 4977 |
+
Downloading sympy (6.0MiB)
|
|
|
|
| 4978 |
Downloading setuptools (1.1MiB)
|
| 4979 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4980 |
+
Downloading hf-xet (3.0MiB)
|
| 4981 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4982 |
+
Downloading triton (148.3MiB)
|
| 4983 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4984 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4985 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4986 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4987 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4988 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4989 |
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
| 4990 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
| 4991 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4992 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4993 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4994 |
+
Downloading numpy (16.2MiB)
|
| 4995 |
Downloading nvidia-cufile-cu12
|
| 4996 |
Downloading hf-xet
|
| 4997 |
Downloading setuptools
|
|
|
|
| 5011 |
Downloading nvidia-cublas-cu12
|
| 5012 |
Downloading nvidia-cudnn-cu12
|
| 5013 |
Downloading torch
|
| 5014 |
+
Installed 37 packages in 449ms
|
| 5015 |
</div>
|
| 5016 |
</div>
|
| 5017 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 5018 |
+
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:00, 5.90it/s]
|
| 5019 |
+
Fetching 6 files: 33%|███▎ | 2/6 [00:00<00:00, 7.70it/s]
|
| 5020 |
+
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 4.70it/s]
|
| 5021 |
+
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 10.28it/s]</div>
|
| 5022 |
<div class="cell-artifacts">
|
| 5023 |
<h4>Artifacts:</h4>
|
| 5024 |
<a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
|
|
|
|
| 5035 |
<span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
|
| 5036 |
<span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5037 |
</span> |
|
| 5038 |
+
Cell: binned_run | deps: torch, numpy | 38.76s
|
| 5039 |
| <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
|
| 5040 |
<button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
|
| 5041 |
<a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 5449 |
|
| 5450 |
Warming up (10 iterations)...
|
| 5451 |
Benchmarking (50 iterations)...
|
| 5452 |
+
Progress: 20% complete (avg: 37.794 ms)
|
| 5453 |
+
Progress: 40% complete (avg: 37.656 ms)
|
| 5454 |
+
Progress: 60% complete (avg: 37.188 ms)
|
| 5455 |
+
Progress: 80% complete (avg: 36.704 ms)
|
| 5456 |
|
| 5457 |
Output tensors:
|
| 5458 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
|
|
| 5462 |
Iterations: 50
|
| 5463 |
|
| 5464 |
Latency Statistics:
|
| 5465 |
+
Average: 36.215 ms
|
| 5466 |
+
Min: 33.172 ms
|
| 5467 |
+
Max: 38.754 ms
|
| 5468 |
+
Std Dev: 1.401 ms
|
| 5469 |
|
| 5470 |
Percentiles:
|
| 5471 |
+
P50 (median): 36.364 ms
|
| 5472 |
+
P95: 38.061 ms
|
| 5473 |
+
P99: 38.464 ms
|
| 5474 |
|
| 5475 |
Throughput:
|
| 5476 |
+
Tokens/sec: 2761.3
|
| 5477 |
+
Std Dev: 108.1
|
| 5478 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5479 |
|
| 5480 |
Saved benchmark results to binned_results.json
|
|
|
|
| 5484 |
<div class="uv-install-logs" id="uv-logs-binned_run">
|
| 5485 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5486 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5487 |
Downloading networkx (1.9MiB)
|
| 5488 |
+
Downloading numpy (16.2MiB)
|
| 5489 |
+
Downloading setuptools (1.1MiB)
|
| 5490 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5491 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5492 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5493 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5494 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5495 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5496 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5497 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5498 |
Downloading triton (148.3MiB)
|
| 5499 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5500 |
+
Downloading torch (846.9MiB)
|
| 5501 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5502 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5503 |
+
Downloading sympy (6.0MiB)
|
| 5504 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5505 |
Downloading nvidia-cufile-cu12
|
| 5506 |
Downloading setuptools
|
| 5507 |
Downloading networkx
|
|
|
|
| 5514 |
Downloading triton
|
| 5515 |
Downloading nvidia-cufft-cu12
|
| 5516 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 5517 |
Downloading nvidia-cusparse-cu12
|
| 5518 |
+
Downloading nvidia-cusparselt-cu12
|
| 5519 |
Downloading nvidia-nccl-cu12
|
| 5520 |
Downloading nvidia-cublas-cu12
|
| 5521 |
Downloading nvidia-cudnn-cu12
|
| 5522 |
Downloading torch
|
| 5523 |
+
Installed 26 packages in 455ms
|
| 5524 |
</div>
|
| 5525 |
</div>
|
| 5526 |
<div class="cell-artifacts">
|
|
|
|
| 5539 |
<span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
|
| 5540 |
<span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5541 |
</span> |
|
| 5542 |
+
Cell: gptoss_run | deps: torch, numpy | 39.76s
|
| 5543 |
| <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
|
| 5544 |
<button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
|
| 5545 |
<a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 5857 |
|
| 5858 |
Warming up (10 iterations)...
|
| 5859 |
Benchmarking (50 iterations)...
|
| 5860 |
+
Progress: 20% complete (avg: 51.012 ms)
|
| 5861 |
+
Progress: 40% complete (avg: 49.954 ms)
|
| 5862 |
+
Progress: 60% complete (avg: 48.390 ms)
|
| 5863 |
+
Progress: 80% complete (avg: 46.993 ms)
|
| 5864 |
|
| 5865 |
Output tensors:
|
| 5866 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
|
|
| 5870 |
Iterations: 50
|
| 5871 |
|
| 5872 |
Latency Statistics:
|
| 5873 |
+
Average: 45.950 ms
|
| 5874 |
+
Min: 40.765 ms
|
| 5875 |
+
Max: 52.300 ms
|
| 5876 |
+
Std Dev: 3.623 ms
|
| 5877 |
|
| 5878 |
Percentiles:
|
| 5879 |
+
P50 (median): 45.469 ms
|
| 5880 |
+
P95: 51.353 ms
|
| 5881 |
+
P99: 52.122 ms
|
| 5882 |
|
| 5883 |
Throughput:
|
| 5884 |
+
Tokens/sec: 2176.3
|
| 5885 |
+
Std Dev: 169.8
|
| 5886 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5887 |
|
| 5888 |
Saved benchmark results to gptoss_results.json
|
|
|
|
| 5892 |
<div class="uv-install-logs" id="uv-logs-gptoss_run">
|
| 5893 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5894 |
<div class="uv-logs-content" style="display: none;">
|
| 5895 |
+
Downloading numpy (16.2MiB)
|
| 5896 |
+
Downloading networkx (1.9MiB)
|
| 5897 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5898 |
Downloading setuptools (1.1MiB)
|
| 5899 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5900 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
| 5901 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
| 5902 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5903 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5904 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5905 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5906 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5907 |
Downloading triton (148.3MiB)
|
| 5908 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5909 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5910 |
+
Downloading torch (846.9MiB)
|
| 5911 |
+
Downloading sympy (6.0MiB)
|
| 5912 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5913 |
Downloading nvidia-cufile-cu12
|
| 5914 |
Downloading setuptools
|
|
|
|
| 5922 |
Downloading triton
|
| 5923 |
Downloading nvidia-cufft-cu12
|
| 5924 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 5925 |
Downloading nvidia-cusparse-cu12
|
| 5926 |
+
Downloading nvidia-cusparselt-cu12
|
| 5927 |
Downloading nvidia-nccl-cu12
|
| 5928 |
Downloading nvidia-cublas-cu12
|
| 5929 |
Downloading nvidia-cudnn-cu12
|
| 5930 |
Downloading torch
|
| 5931 |
+
Installed 26 packages in 524ms
|
| 5932 |
</div>
|
| 5933 |
</div>
|
| 5934 |
<div class="cell-artifacts">
|
|
|
|
| 5947 |
<span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
|
| 5948 |
<span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5949 |
</span> |
|
| 5950 |
+
Cell: gptoss_training_run | deps: torch, numpy | 40.42s
|
| 5951 |
| <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
|
| 5952 |
<button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
|
| 5953 |
<a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6248 |
|
| 6249 |
Warming up (10 iterations)...
|
| 6250 |
Benchmarking (50 iterations)...
|
| 6251 |
+
Progress: 20% complete (avg: 48.387 ms)
|
| 6252 |
+
Progress: 40% complete (avg: 48.249 ms)
|
| 6253 |
+
Progress: 60% complete (avg: 47.887 ms)
|
| 6254 |
+
Progress: 80% complete (avg: 47.011 ms)
|
| 6255 |
|
| 6256 |
Output tensors:
|
| 6257 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
|
|
| 6261 |
Iterations: 50
|
| 6262 |
|
| 6263 |
Latency Statistics:
|
| 6264 |
+
Average: 46.098 ms
|
| 6265 |
+
Min: 38.839 ms
|
| 6266 |
+
Max: 49.404 ms
|
| 6267 |
+
Std Dev: 2.469 ms
|
| 6268 |
|
| 6269 |
Percentiles:
|
| 6270 |
+
P50 (median): 47.240 ms
|
| 6271 |
+
P95: 48.725 ms
|
| 6272 |
+
P99: 49.168 ms
|
| 6273 |
|
| 6274 |
Throughput:
|
| 6275 |
+
Tokens/sec: 2169.3
|
| 6276 |
+
Std Dev: 122.3
|
| 6277 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6278 |
|
| 6279 |
Saved benchmark results to gptoss_training_results.json
|
|
|
|
| 6283 |
<div class="uv-install-logs" id="uv-logs-gptoss_training_run">
|
| 6284 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6285 |
<div class="uv-logs-content" style="display: none;">
|
| 6286 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
| 6287 |
Downloading setuptools (1.1MiB)
|
| 6288 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6289 |
+
Downloading sympy (6.0MiB)
|
| 6290 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6291 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6292 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6293 |
+
Downloading torch (846.9MiB)
|
| 6294 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6295 |
+
Downloading networkx (1.9MiB)
|
| 6296 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6297 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6298 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
|
|
|
|
|
|
| 6299 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6300 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6301 |
+
Downloading numpy (16.2MiB)
|
| 6302 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
| 6303 |
Downloading triton (148.3MiB)
|
|
|
|
| 6304 |
Downloading nvidia-cufile-cu12
|
| 6305 |
Downloading setuptools
|
| 6306 |
Downloading networkx
|
|
|
|
| 6319 |
Downloading nvidia-cublas-cu12
|
| 6320 |
Downloading nvidia-cudnn-cu12
|
| 6321 |
Downloading torch
|
| 6322 |
+
Installed 26 packages in 451ms
|
| 6323 |
</div>
|
| 6324 |
</div>
|
| 6325 |
<div class="cell-artifacts">
|
|
|
|
| 6338 |
<span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
|
| 6339 |
<span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6340 |
</span> |
|
| 6341 |
+
Cell: megablocks_run | deps: torch, numpy, kernels | 40.19s | FAILED
|
| 6342 |
| <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
|
| 6343 |
<button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
|
| 6344 |
<a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6493 |
<span class="c1"># Attach loaded expert weights to the experts container</span>
|
| 6494 |
<span class="n">e</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">experts</span>
|
| 6495 |
<span class="n">e</span><span class="o">.</span><span class="n">alpha</span> <span class="o">=</span> <span class="mf">1.702</span>
|
| 6496 |
+
<span class="n">e</span><span class="o">.</span><span class="n">capacity_factor</span> <span class="o">=</span> <span class="mi">32</span>
|
| 6497 |
<span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
|
| 6498 |
<span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj_bias</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj_bias</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
|
| 6499 |
<span class="n">e</span><span class="o">.</span><span class="n">down_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">down_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
|
|
|
|
| 6570 |
<div class="uv-install-logs" id="uv-logs-megablocks_run">
|
| 6571 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6572 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
| 6573 |
Downloading numpy (16.2MiB)
|
|
|
|
|
|
|
| 6574 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
| 6575 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
|
|
|
| 6576 |
Downloading hf-xet (3.0MiB)
|
| 6577 |
+
Downloading networkx (1.9MiB)
|
| 6578 |
+
Downloading torch (846.9MiB)
|
| 6579 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6580 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
| 6581 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6582 |
+
Downloading triton (148.3MiB)
|
| 6583 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6584 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6585 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6586 |
Downloading sympy (6.0MiB)
|
| 6587 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6588 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6589 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6590 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6591 |
+
Downloading setuptools (1.1MiB)
|
| 6592 |
Downloading nvidia-cufile-cu12
|
| 6593 |
Downloading hf-xet
|
| 6594 |
Downloading setuptools
|
|
|
|
| 6602 |
Downloading triton
|
| 6603 |
Downloading nvidia-cufft-cu12
|
| 6604 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 6605 |
Downloading nvidia-cusparse-cu12
|
| 6606 |
+
Downloading nvidia-cusparselt-cu12
|
| 6607 |
Downloading nvidia-nccl-cu12
|
| 6608 |
Downloading nvidia-cublas-cu12
|
| 6609 |
Downloading nvidia-cudnn-cu12
|
| 6610 |
Downloading torch
|
| 6611 |
+
Installed 37 packages in 449ms
|
| 6612 |
</div>
|
| 6613 |
</div>
|
| 6614 |
<div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 6615 |
+
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:23, 2.74it/s]
|
| 6616 |
+
Fetching 66 files: 14%|█▎ | 9/66 [00:00<00:03, 17.38it/s]
|
| 6617 |
+
Fetching 66 files: 26%|██▌ | 17/66 [00:01<00:02, 17.85it/s]
|
| 6618 |
+
Fetching 66 files: 55%|█████▍ | 36/66 [00:01<00:00, 42.23it/s]
|
| 6619 |
+
Fetching 66 files: 65%|██████▌ | 43/66 [00:01<00:00, 38.03it/s]
|
| 6620 |
+
Fetching 66 files: 74%|███████▍ | 49/66 [00:01<00:00, 30.77it/s]
|
| 6621 |
+
Fetching 66 files: 97%|█████████▋| 64/66 [00:01<00:00, 48.18it/s]
|
| 6622 |
+
Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 34.40it/s]
|
| 6623 |
+
/tmp/tmptrubhjfl/cuda_utils.c:5:10: fatal error: Python.h: No such file or directory
|
|
|
|
| 6624 |
5 | #include <Python.h>
|
| 6625 |
| ^~~~~~~~~~
|
| 6626 |
compilation terminated.
|
|
|
|
| 6637 |
File "/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/bench_utils.py", line 177, in <lambda>
|
| 6638 |
call = lambda x: fn(x, *args[1:], **kwargs)
|
| 6639 |
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6640 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
|
| 6641 |
return self._call_impl(*args, **kwargs)
|
| 6642 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6643 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
|
| 6644 |
return forward_call(*args, **kwargs)
|
| 6645 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6646 |
File "/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/megablocks_run.py", line 81, in forward
|
| 6647 |
output, dummy_routing_weights = self.model(hidden_states)
|
| 6648 |
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6649 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
|
| 6650 |
return self._call_impl(*args, **kwargs)
|
| 6651 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6652 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
|
| 6653 |
return forward_call(*args, **kwargs)
|
| 6654 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6655 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py", line 896, in forward
|
| 6656 |
output, expert_weights_out, *_ = moe_forward(
|
| 6657 |
^^^^^^^^^^^^
|
| 6658 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py", line 730, in moe_forward
|
| 6659 |
x, tokens_per_expert = forward_fn(**forward_args)
|
| 6660 |
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6661 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py", line 457, in forward_once
|
| 6662 |
x = permute_and_compute(
|
| 6663 |
^^^^^^^^^^^^^^^^^^^^
|
| 6664 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py", line 401, in permute_and_compute
|
| 6665 |
x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
|
| 6666 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6667 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/autograd/function.py", line 576, in apply
|
| 6668 |
return super().apply(*args, **kwargs) # type: ignore[misc]
|
| 6669 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6670 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/stk_autocast.py", line 30, in decorate_fwd
|
| 6671 |
return fwd(*args, **kwargs)
|
| 6672 |
^^^^^^^^^^^^^^^^^^^^
|
| 6673 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/binned_gather.py", line 26, in forward
|
| 6674 |
return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
|
| 6675 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6676 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py", line 419, in binned_gather
|
| 6677 |
_binned_copy[(num_experts, expert_capacity)](
|
| 6678 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/jit.py", line 390, in <lambda>
|
| 6679 |
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
|
| 6680 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6681 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 239, in run
|
| 6682 |
benchmark()
|
| 6683 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 228, in benchmark
|
| 6684 |
timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
|
| 6685 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6686 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 228, in <dictcomp>
|
| 6687 |
timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
|
| 6688 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6689 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 160, in _bench
|
| 6690 |
return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
|
| 6691 |
^^^^^^^^^^^^^
|
| 6692 |
File "/usr/lib/python3.11/functools.py", line 1001, in __get__
|
| 6693 |
val = self.func(instance)
|
| 6694 |
^^^^^^^^^^^^^^^^^^^
|
| 6695 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 121, in do_bench
|
| 6696 |
return driver.active.get_benchmarker()
|
| 6697 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6698 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py", line 30, in __getattr__
|
| 6699 |
return getattr(self._initialize_obj(), name)
|
| 6700 |
^^^^^^^^^^^^^^^^^^^^^^
|
| 6701 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py", line 26, in _initialize_obj
|
| 6702 |
self._obj = self._init_fn()
|
| 6703 |
^^^^^^^^^^^^^^^
|
| 6704 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py", line 12, in _create_driver
|
| 6705 |
return active_drivers[0]()
|
| 6706 |
^^^^^^^^^^^^^^^^^^^
|
| 6707 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py", line 715, in __init__
|
| 6708 |
self.utils = CudaUtils() # TODO: make static
|
| 6709 |
^^^^^^^^^^^
|
| 6710 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py", line 62, in __init__
|
| 6711 |
mod = compile_module_from_src(
|
| 6712 |
^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6713 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py", line 88, in compile_module_from_src
|
| 6714 |
so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [])
|
| 6715 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6716 |
+
File "/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py", line 51, in _build
|
| 6717 |
subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL)
|
| 6718 |
File "/usr/lib/python3.11/subprocess.py", line 413, in check_call
|
| 6719 |
raise CalledProcessError(retcode, cmd)
|
| 6720 |
+
subprocess.CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmptrubhjfl/cuda_utils.c', '-O3', '-shared', '-fPIC', '-Wno-psabi', '-o', '/tmp/tmptrubhjfl/cuda_utils.cpython-311-x86_64-linux-gnu.so', '-lcuda', '-L/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/lib', '-L/usr/lib/x86_64-linux-gnu', '-I/tmp/uvnote-run-68wjowzh/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/include', '-I/tmp/tmptrubhjfl', '-I/usr/include/python3.11']' returned non-zero exit status 1.</div>
|
| 6721 |
</div>
|
| 6722 |
</div>
|
| 6723 |
|