# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-# "accelerate>=1.10.1",
-# "torch>=2.7.0",
-# "kernels==0.10.0",
-# "transformers@https://github.com/huggingface/transformers.git",
-# "ipdb>=0.13.13",
-# "matplotlib>=3.7.2",
-# "numpy>=1.24.3",
-# ]
-# ///
-
-importtorch
-fromtransformersimportGptOssForCausalLM,PreTrainedTokenizerFast,Mxfp4Config
-importtime
-importtorch.nnasnn
-fromkernelsimportregister_kernel_mapping,Mode,LayerRepository,replace_kernel_forward_from_hub
-importsys
-importtorch.profiler
-importgc
-importlogging
-fromtransformers.models.gpt_oss.modeling_gpt_ossimportGptOssRMSNorm
-
-# set to debug logging
-logging.basicConfig(level=logging.INFO)
-
-defreset_peak_memory_stats():
-"""Clear CUDA cache and reset memory allocation counters."""
- torch.cuda.empty_cache()
- iftorch.cuda.is_available():
- torch.cuda.reset_peak_memory_stats()
- gc.collect()
-
-defget_memory_stats():
-"""Get current and peak CUDA memory usage."""
- ifnottorch.cuda.is_available():
- return{"allocated_gb":0,"peak_gb":0,"reserved_gb":0}
- return{
- "allocated_gb":torch.cuda.memory_allocated()/1e9,
- "peak_gb":torch.cuda.max_memory_allocated()/1e9,
- "reserved_gb":torch.cuda.memory_reserved()/1e9,
- }
-
-defoverride_kernel_layer_name(cls_name:str,value)->bool:
-"""Helper to dynamically override the kernel_layer_name in a model class."""
- formodinsys.modules.values():
- ifmodisNone:
- continue
- obj=getattr(mod,cls_name,None)
- ifisinstance(obj,type)andissubclass(obj,nn.Module):
- setattr(obj,"kernel_layer_name",value)
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
- returnTrue
- returnFalse
-
-
-# Init the model the normal way
-model_id="openai/gpt-oss-20b"
-tokenizer=PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config=Mxfp4Config(dequantize=True)
-
-
-
-model=GptOssForCausalLM.from_pretrained(
- model_id,
- dtype="bfloat16",
- device_map="auto",
- use_kernels=False,
- quantization_config=quantization_config,
-).eval()
-
-messages=[
- {"role":"system","content":"What is Tensor Parallelism?"},
-]
-
-inputs=tokenizer.apply_chat_template(
- messages,
- add_generation_prompt=True,
- return_tensors="pt",
- return_dict=True,
- reasoning_effort="low",
-).to("cuda")
-
-max_tokens=256
-
-withtorch.inference_mode():
- start_time=time.perf_counter()
- generated=model.generate(
- **inputs,
- max_new_tokens=max_tokens,
- do_sample=False,
- temperature=None,
- )
- end_time=time.perf_counter()
-
-print(tokenizer.decode(generated[0],skip_special_tokens=False))
-print(f"Generation took {end_time-start_time:.2f} seconds")
-
-
-
-
-
-
-
-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
-Knowledge cutoff: 2024-06
-Current date: 2025-09-24
-
-Reasoning: low
-
-# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions
-
-What is Tensor Parallelism?
-
-<|end|><|start|>assistant<|channel|>analysis<|message|>Need to explain tensor parallelism.<|end|><|start|>assistant<|channel|>final<|message|>**Tensor Parallelism** is a technique used to distribute the computation of a single tensor (or a set of tensors) across multiple devices (GPUs, TPUs, or other accelerators) in order to speed up large‑scale deep‑learning training and inference. It is one of several parallelism strategies—alongside data parallelism, pipeline parallelism, and model parallelism—designed to scale models that exceed the memory or compute capacity of a single device.
-
----
-
-## 1. Why Tensor Parallelism?
-
-- **Memory constraints**: Modern transformer models (e.g., GPT‑4, PaLM, LLaMA‑70B) can have billions of parameters. A single GPU may not hold the entire model or its intermediate activations.
-- **Compute bottlenecks**: Even if memory were sufficient, the sheer number of floating‑point operations can make training prohibitively slow on a single device.
-- **Scalability**: Tensor parallelism allows you to add more devices to reduce per‑device memory usage and accelerate matrix operations.
-
----
-
-## 2. Core Idea
-
-In tensor parallelism, a *tensor* (typically a weight matrix or an activation tensor
-Generation took 25.73 seconds
-
Next, we'll attempt to run a forward and backward pass without any custom kernels. This will likely run out of memory since the default implementation is not optimized for memory usage.
@@ -4116,7 +3727,7 @@ Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00
▼ output▶ uv-logs
|
-Cell: forward_and_backward_no_kernel | 99.86s | FAILED
+Cell: forward_and_backward_no_kernel | 99.38s | FAILED
|
Raw
@@ -4549,7 +4160,7 @@ Post-forward memory: {'allocated_gb': 9.487933952, 'peak_gb'
Loss: 1.9761
Running backward pass...
Pre-backward memory: {'allocated_gb': 9.405890048, 'peak_gb': 9.514059776, 'reserved_gb': 17.177772032}
-OOM during forward/backward pass: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2 has a total capacity of 22.30 GiB of which 118.69 MiB is free. Process 34932 has 22.18 GiB memory in use. Of the allocated memory 21.52 GiB is allocated by PyTorch, and 357.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+OOM during forward/backward pass: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2 has a total capacity of 22.30 GiB of which 118.69 MiB is free. Process 25557 has 22.18 GiB memory in use. Of the allocated memory 21.52 GiB is allocated by PyTorch, and 357.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Try reducing max_tokens or max_seq_len
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
-Fetching 3 files: 33%|███▎ | 1/3 [00:07<00:14, 7.40s/it]
-Fetching 3 files: 67%|██████▋ | 2/3 [00:08<00:03, 3.77s/it]
-Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00, 2.88s/it]
+Fetching 3 files: 33%|███▎ | 1/3 [00:07<00:15, 7.84s/it]
+Fetching 3 files: 67%|██████▋ | 2/3 [00:08<00:03, 3.40s/it]
+Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00, 2.71s/it]
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.34s/it]
@@ -4629,14 +4240,14 @@ Traceback (most recent call last):
File "/repo/moe_benchmarks/megablocks/.uvnote/cells/forward_and_backward_no_kernel.py", line 154, in <module>
loss.backward()
~~~~~~~~~~~~~^^
- File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/_tensor.py", line 647, in backward
+ File "/tmp/uvnote-run-yr7p57do/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/_tensor.py", line 647, in backward
torch.autograd.backward(
~~~~~~~~~~~~~~~~~~~~~~~^
self, gradient, retain_graph, create_graph, inputs=inputs
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
- File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/__init__.py", line 354, in backward
+ File "/tmp/uvnote-run-yr7p57do/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/__init__.py", line 354, in backward
_engine_run_backward(
~~~~~~~~~~~~~~~~~~~~^
tensors,
@@ -4646,19 +4257,19 @@ Traceback (most recent call last):
^^^^^^^^^^^^^^^^^^^^^
)
^
- File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
+ File "/tmp/uvnote-run-yr7p57do/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
t_outputs, *args, **kwargs
^^^^^^^^^^^^^^^^^^^^^^^^^^
) # Calls into the C++ engine to run the backward pass
^
- File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/function.py", line 311, in apply
+ File "/tmp/uvnote-run-yr7p57do/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/function.py", line 311, in apply
return user_fn(self, *args)
- File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/utils/checkpoint.py", line 319, in backward
+ File "/tmp/uvnote-run-yr7p57do/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/utils/checkpoint.py", line 319, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/__init__.py", line 354, in backward
+ File "/tmp/uvnote-run-yr7p57do/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/__init__.py", line 354, in backward
_engine_run_backward(
~~~~~~~~~~~~~~~~~~~~^
tensors,
@@ -4668,14 +4279,14 @@ Traceback (most recent call last):
^^^^^^^^^^^^^^^^^^^^^
)
^
- File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
+ File "/tmp/uvnote-run-yr7p57do/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
t_outputs, *args, **kwargs
^^^^^^^^^^^^^^^^^^^^^^^^^^
) # Calls into the C++ engine to run the backward pass
^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2 has a total capacity of 22.30 GiB of which 118.69 MiB is free. Process 34932 has 22.18 GiB memory in use. Of the allocated memory 21.52 GiB is allocated by PyTorch, and 357.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2 has a total capacity of 22.30 GiB of which 118.69 MiB is free. Process 25557 has 22.18 GiB memory in use. Of the allocated memory 21.52 GiB is allocated by PyTorch, and 357.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
@@ -4683,1033 +4294,8 @@ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2
Next we can run with Megablocks kernels enabled.
Forward
First, we run a forward pass with Megablocks kernels.
# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-# "accelerate>=1.10.1",
-# "torch>=2.7.0",
-# "kernels==0.10.0",
-# "transformers@https://github.com/huggingface/transformers.git",
-# "ipdb>=0.13.13",
-# "matplotlib>=3.7.2",
-# "numpy>=1.24.3",
-# ]
-# ///
-
-importtorch
-fromtransformersimportGptOssForCausalLM,PreTrainedTokenizerFast,Mxfp4Config
-importtime
-importtorch.nnasnn
-fromkernelsimportregister_kernel_mapping,Mode,LayerRepository,replace_kernel_forward_from_hub
-importsys
-importtorch.profiler
-importgc
-importlogging
-fromtransformers.models.gpt_oss.modeling_gpt_ossimportGptOssRMSNorm
-
-
-replace_kernel_forward_from_hub(GptOssRMSNorm,None)
-
-# set to debug logging
-logging.basicConfig(level=logging.INFO)
-
-defreset_peak_memory_stats():
-"""Clear CUDA cache and reset memory allocation counters."""
- torch.cuda.empty_cache()
- iftorch.cuda.is_available():
- torch.cuda.reset_peak_memory_stats()
- gc.collect()
-
-defget_memory_stats():
-"""Get current and peak CUDA memory usage."""
- ifnottorch.cuda.is_available():
- return{"allocated_gb":0,"peak_gb":0,"reserved_gb":0}
- return{
- "allocated_gb":torch.cuda.memory_allocated()/1e9,
- "peak_gb":torch.cuda.max_memory_allocated()/1e9,
- "reserved_gb":torch.cuda.memory_reserved()/1e9,
- }
-
-defoverride_kernel_layer_name(cls_name:str,value)->bool:
-"""Helper to dynamically override the kernel_layer_name in a model class."""
- formodinsys.modules.values():
- ifmodisNone:
- continue
- obj=getattr(mod,cls_name,None)
- ifisinstance(obj,type)andissubclass(obj,nn.Module):
- setattr(obj,"kernel_layer_name",value)
- print(f"Overrode {cls_name}.kernel_layer_name to {value}")
- returnTrue
- returnFalse
-
-
-# Init the model the normal way
-model_id="openai/gpt-oss-20b"
-tokenizer=PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config=Mxfp4Config(dequantize=True)
-
-
-
-model=GptOssForCausalLM.from_pretrained(
- model_id,
- dtype="bfloat16",
- device_map="auto",
- use_kernels=True,
- quantization_config=quantization_config,
-).eval()
-
-messages=[
- {"role":"system","content":"What is Tensor Parallelism?"},
-]
-
-inputs=tokenizer.apply_chat_template(
- messages,
- add_generation_prompt=True,
- return_tensors="pt",
- return_dict=True,
- reasoning_effort="low",
-).to("cuda")
-
-max_tokens=256
-
-withtorch.inference_mode():
- start_time=time.perf_counter()
- generated=model.generate(
- **inputs,
- max_new_tokens=max_tokens,
- do_sample=False,
- temperature=None,
- )
- end_time=time.perf_counter()
-
-print(tokenizer.decode(generated[0],skip_special_tokens=False))
-print(f"Generation took {end_time-start_time:.2f} seconds")
-
-
-
-
-
-
-
-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
-Knowledge cutoff: 2024-06
-Current date: 2025-09-24
-
-Reasoning: low
-
-# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions
-
-What is Tensor Parallelism?
-
-<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it's used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it's also called model parallelism. Provide explanation of how it works in practice: e.g., for a linear layer, weight matrix W of shape (out_features, in_features). In tensor parallelism, split W along out_features dimension across GPUs. Each GPU computes partial output. Then gather outputs. Provide details on how to handle bias, etc. Provide mention of "tensor model parallelism" vs "tensor parallelism" synonyms. Provide mention of "tensor parallelism" in Megatron-LM: splitting weight matrices across GPUs. Provide mention of "tensor parallelism" in DeepSpeed: "ZeRO-Offload" etc. Provide mention
-Generation took 31.31 seconds
-
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
-Fetching 3 files: 33%|███▎ | 1/3 [00:07<00:14, 7.39s/it]
-Fetching 3 files: 67%|██████▋ | 2/3 [00:08<00:03, 3.78s/it]
-Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00, 2.88s/it]
-You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
-
-Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
-Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.34s/it]
-Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.25s/it]
-Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
-Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it]
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-
-Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
-Fetching 66 files: 2%|▏ | 1/66 [00:00<00:10, 6.01it/s]
-Fetching 66 files: 14%|█▎ | 9/66 [00:00<00:01, 31.85it/s]
-Fetching 66 files: 20%|█▉ | 13/66 [00:00<00:02, 24.06it/s]
-Fetching 66 files: 26%|██▌ | 17/66 [00:01<00:03, 12.48it/s]
-Fetching 66 files: 74%|███████▍ | 49/66 [00:01<00:00, 53.80it/s]
-Fetching 66 files: 91%|█████████ | 60/66 [00:01<00:00, 57.68it/s]
-Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 40.58it/s]
-/tmp/uvnote-run-_tyh_wp6/home/.cache/uv/environments-v2/forward-only-504a4941eac030a5/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
-No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
- warnings.warn(
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-/tmp/uvnote-run-_tyh_wp6/home/.cache/uv/environments-v2/forward-only-504a4941eac030a5/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
-No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
- warnings.warn(
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-
-
-
Forward and Backward
Next, we run a forward and backward pass with Megablocks kernels enabled. This should be more memory efficient and allow us to complete the backward pass without running out of memory.
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
-Fetching 3 files: 33%|███▎ | 1/3 [00:07<00:15, 7.79s/it]
-Fetching 3 files: 67%|██████▋ | 2/3 [00:09<00:04, 4.50s/it]
-Fetching 3 files: 100%|██████████| 3/3 [00:09<00:00, 3.33s/it]
-You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
-
-Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
-Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.36s/it]
-Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.25s/it]
-Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
-Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it]
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-
-Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
-Fetching 66 files: 2%|▏ | 1/66 [00:00<00:13, 4.68it/s]
-Fetching 66 files: 14%|█▎ | 9/66 [00:00<00:02, 26.64it/s]
-Fetching 66 files: 21%|██ | 14/66 [00:00<00:01, 33.33it/s]
-Fetching 66 files: 27%|██▋ | 18/66 [00:00<00:02, 17.59it/s]
-Fetching 66 files: 53%|█████▎ | 35/66 [00:01<00:00, 43.25it/s]
-Fetching 66 files: 64%|██████▎ | 42/66 [00:01<00:00, 43.72it/s]
-Fetching 66 files: 74%|███████▍ | 49/66 [00:01<00:00, 40.60it/s]
-Fetching 66 files: 85%|████████▍ | 56/66 [00:01<00:00, 42.33it/s]
-Fetching 66 files: 95%|█████████▌| 63/66 [00:01<00:00, 38.03it/s]
-Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 36.73it/s]
-/tmp/uvnote-run-n1rg0p87/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
-No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
- warnings.warn(
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-/tmp/uvnote-run-n1rg0p87/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
-No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
- warnings.warn(
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-/tmp/uvnote-run-n1rg0p87/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
-No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
- warnings.warn(
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-/tmp/uvnote-run-n1rg0p87/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
-No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
- warnings.warn(
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
-INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`