drbh committed · Commit dcfa38d · 1 Parent(s): 8176cbe

feat: bump build for torch compile
Changed files:
- build/torch27-cxx11-cu118-x86_64-linux/megablocks/{_megablocks_3bdb4b8_dirty.abi3.so → _megablocks_8176cbe_dirty.abi3.so} +1 -1
- build/torch27-cxx11-cu118-x86_64-linux/megablocks/_ops.py +3 -3
- build/torch27-cxx11-cu118-x86_64-linux/megablocks/layers.py +258 -34
- build/torch27-cxx11-cu126-x86_64-linux/megablocks/{_megablocks_3bdb4b8_dirty.abi3.so → _megablocks_8176cbe_dirty.abi3.so} +1 -1
- build/torch27-cxx11-cu126-x86_64-linux/megablocks/_ops.py +3 -3
- build/torch27-cxx11-cu126-x86_64-linux/megablocks/layers.py +258 -34
- build/torch27-cxx11-cu128-x86_64-linux/megablocks/{_megablocks_3bdb4b8_dirty.abi3.so → _megablocks_8176cbe_dirty.abi3.so} +1 -1
- build/torch27-cxx11-cu128-x86_64-linux/megablocks/_ops.py +3 -3
- build/torch27-cxx11-cu128-x86_64-linux/megablocks/layers.py +258 -34
- build/torch28-cxx11-cu126-x86_64-linux/megablocks/{_megablocks_3bdb4b8_dirty.abi3.so → _megablocks_8176cbe_dirty.abi3.so} +1 -1
- build/torch28-cxx11-cu126-x86_64-linux/megablocks/_ops.py +3 -3
- build/torch28-cxx11-cu126-x86_64-linux/megablocks/layers.py +258 -34
- build/torch28-cxx11-cu128-x86_64-linux/megablocks/{_megablocks_3bdb4b8_dirty.abi3.so → _megablocks_8176cbe_dirty.abi3.so} +1 -1
- build/torch28-cxx11-cu128-x86_64-linux/megablocks/_ops.py +3 -3
- build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py +258 -34
- build/torch28-cxx11-cu129-x86_64-linux/megablocks/{_megablocks_3bdb4b8_dirty.abi3.so → _megablocks_8176cbe_dirty.abi3.so} +1 -1
- build/torch28-cxx11-cu129-x86_64-linux/megablocks/_ops.py +3 -3
- build/torch28-cxx11-cu129-x86_64-linux/megablocks/layers.py +258 -34
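The functional change in this build bump is in each layers.py: every MegaBlocks op is wrapped so that a shape-only placeholder tensor is returned while torch.compile is tracing, instead of calling into the compiled extension. A minimal standalone sketch of that dispatch pattern follows; histogram_eager and routed_histogram are illustrative names, not part of this commit.

import torch

# Eager fallback standing in for a real compiled kernel (illustrative only).
def histogram_eager(x: torch.Tensor, max_val: int) -> torch.Tensor:
    return torch.bincount(x, minlength=max_val).to(torch.int32)

# The pattern layers.py installs for each op: while torch.compile is tracing,
# return an empty tensor with the right shape/dtype instead of hitting the
# extension, so the graph can be captured.
def histogram_with_meta(x: torch.Tensor, max_val: int) -> torch.Tensor:
    if torch.compiler.is_compiling():
        return torch.empty((max_val,), dtype=torch.int32, device=x.device)
    return histogram_eager(x, max_val)

@torch.compile
def routed_histogram(indices: torch.Tensor, num_experts: int) -> torch.Tensor:
    return histogram_with_meta(indices, num_experts)

if __name__ == "__main__":
    idx = torch.randint(0, 8, (32,))
    print(routed_histogram(idx, 8).shape)  # torch.Size([8])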
build/torch27-cxx11-cu118-x86_64-linux/megablocks/{_megablocks_3bdb4b8_dirty.abi3.so → _megablocks_8176cbe_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5ee50c722d5ff355fd4e91d557dffe3be9b674dd5901748dc19286aef37d6d60
 size 10513752
build/torch27-cxx11-cu118-x86_64-linux/megablocks/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_3bdb4b8_dirty
-ops = torch.ops._megablocks_3bdb4b8_dirty
+from . import _megablocks_8176cbe_dirty
+ops = torch.ops._megablocks_8176cbe_dirty

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_3bdb4b8_dirty::{op_name}"
+    return f"_megablocks_8176cbe_dirty::{op_name}"
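The regenerated _ops.py keeps the add_op_namespace_prefix helper, which yields qualified names such as "_megablocks_8176cbe_dirty::sort". One way that helper could be combined with the register_fake import added in layers.py below is sketched here; the import path and the (x, end_bit) -> two-tensor schema for sort are assumptions inferred from this diff, not a documented MegaBlocks API.

import torch
from torch.library import register_fake  # PyTorch >= 2.4

# Assumed import path for the helper shown in the diff above.
from megablocks._ops import add_op_namespace_prefix

# Hypothetical fake (meta) kernel: shapes/dtypes only, no real computation.
# Assumes the op's schema matches the two-tensor return implied by the sort
# wrapper in layers.py below.
@register_fake(add_op_namespace_prefix("sort"))
def _sort_fake(x: torch.Tensor, end_bit: int):
    return torch.empty_like(x), torch.empty_like(x)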
build/torch27-cxx11-cu118-x86_64-linux/megablocks/layers.py
CHANGED
@@ -1,11 +1,200 @@
 import torch
 import torch.distributed as dist

-from typing import Optional, Any
+from typing import Optional, Any, TYPE_CHECKING

 from . import _layers
 from . import ops

+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+    def register_fake(fn):
+        return lambda name: fn
+
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        try:
+            from torch.library import impl_abstract as register_fake
+        except ImportError:
+            # Fallback for older PyTorch versions
+            def register_fake(op_name):
+                def decorator(fn):
+                    return fn
+
+                return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+    """Install meta kernels for existing MegaBlocks operations"""
+
+    # Create wrapper functions that check for compilation and return meta tensors
+
+    # Patch ops.sort
+    if hasattr(ops, "sort"):
+        original_sort = ops.sort
+
+        def sort_with_meta(x, end_bit=None):
+            if torch.compiler.is_compiling():
+                print("Using meta kernel for sort")
+                # Meta implementation - return tensors with correct shape/dtype/device
+                return torch.empty_like(x), torch.empty_like(x)
+            # print("Using original sort kernel")
+            return original_sort(x, end_bit)
+
+        ops.sort = sort_with_meta
+
+    # Patch ops.histogram
+    if hasattr(ops, "histogram"):
+        original_histogram = ops.histogram
+
+        def histogram_with_meta(x, max_val):
+            if torch.compiler.is_compiling():
+                # Meta implementation
+                return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+            return original_histogram(x, max_val)
+
+        ops.histogram = histogram_with_meta
+
+    # Patch ops.inclusive_cumsum
+    if hasattr(ops, "inclusive_cumsum"):
+        original_inclusive_cumsum = ops.inclusive_cumsum
+
+        def inclusive_cumsum_with_meta(x, dim):
+            if torch.compiler.is_compiling():
+                # Meta implementation
+                return torch.empty_like(x)
+            return original_inclusive_cumsum(x, dim)
+
+        ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+    # Patch ops.binned_gather
+    if hasattr(ops, "binned_gather"):
+        original_binned_gather = ops.binned_gather
+
+        def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+            if torch.compiler.is_compiling():
+                # Meta implementation - output shape based on bin_size
+                if x.dim() >= 2:
+                    hidden_size = x.size(-1)
+                    return torch.empty(
+                        (bin_size, x.size(1), hidden_size),
+                        dtype=x.dtype,
+                        device=x.device,
+                    )
+                else:
+                    return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+            return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+        ops.binned_gather = binned_gather_with_meta
+
+    # Patch ops.binned_scatter
+    if hasattr(ops, "binned_scatter"):
+        original_binned_scatter = ops.binned_scatter
+
+        def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+            if torch.compiler.is_compiling():
+                # Meta implementation - typically reduces to 2D
+                if x.dim() >= 3:
+                    return torch.empty(
+                        (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+                    )
+                else:
+                    return torch.empty_like(x)
+            return original_binned_scatter(x, indices, weights, bins, top_k)
+
+        ops.binned_scatter = binned_scatter_with_meta
+
+    # Patch ops.gather
+    if hasattr(ops, "gather"):
+        original_gather = ops.gather
+
+        def gather_with_meta(x, indices, bin_ids, bins, top_k):
+            if torch.compiler.is_compiling():
+                # Meta implementation
+                if x.dim() >= 2:
+                    hidden_size = x.size(-1)
+                    return torch.empty(
+                        (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+                    )
+                else:
+                    return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+            return original_gather(x, indices, bin_ids, bins, top_k)
+
+        ops.gather = gather_with_meta
+
+    # Patch ops.scatter
+    if hasattr(ops, "scatter"):
+        original_scatter = ops.scatter
+
+        def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+            if torch.compiler.is_compiling():
+                # Meta implementation - restore sequence shape
+                seq_len = (
+                    indices.size(0) // top_k
+                    if indices.numel() > 0 and top_k > 0
+                    else x.size(0)
+                )
+                if x.dim() >= 2:
+                    return torch.empty(
+                        (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+                    )
+                else:
+                    return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+            return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+        ops.scatter = scatter_with_meta
+
+    # Patch ops.replicate
+    if hasattr(ops, "replicate"):
+        original_replicate = ops.replicate
+
+        def replicate_with_meta(x, bins, num_outputs):
+            if torch.compiler.is_compiling():
+                # Meta implementation
+                return torch.empty(
+                    (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+                )
+            return original_replicate(x, bins, num_outputs)
+
+        ops.replicate = replicate_with_meta
+
+    # Patch ops.repeat (if it's a regular function)
+    if hasattr(ops, "repeat"):
+        original_repeat = ops.repeat
+
+        def repeat_with_meta(x, repeats):
+            if torch.compiler.is_compiling():
+                # Meta implementation
+                if isinstance(repeats, (tuple, list)):
+                    new_shape = list(x.shape)
+                    for i, rep in enumerate(repeats):
+                        if i < len(new_shape):
+                            new_shape[i] *= rep
+                    return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+                else:
+                    new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+                    return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+            return original_repeat(x, repeats)
+
+        ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+    _install_meta_kernels()
+except Exception as e:
+    # If meta kernel installation fails, continue without them
+    # torch.compile may not work but the library will still function
+    import warnings
+
+    warnings.warn(
+        f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+    )
+

 # Set the expert model parallel attributes on a tensor
 def set_expert_model_parallel_attributes(
@@ -80,6 +269,7 @@ def compute_top_k(scores: torch.Tensor, moe_top_k: int):
 def route_tokens(
     x: torch.Tensor,
     router_weight: torch.Tensor,
+    router_bias: torch.Tensor,
     moe_top_k: int,
     moe_num_experts: int,
     moe_jitter_eps: float = None,
@@ -91,7 +281,7 @@ def route_tokens(
         x = apply_jitter(x, moe_jitter_eps)

     x_flat = x.view(-1, x.shape[-1])
-    logits = torch.nn.functional.linear(x_flat, router_weight)
+    logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
     expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
     expert_weights = expert_weights.softmax(dim=-1)
     if moe_normalize_expert_weights is not None:
@@ -129,6 +319,7 @@ def mlp_forward(
     w2_bias: torch.Tensor,
     gradient_scale: Optional[float] = None,
     alpha: float = 1.702,
+    limit: float = 7.0,
 ):
     # Scale weights
     w1 = scale_grad(w1, gradient_scale)
@@ -144,13 +335,13 @@ def mlp_forward(

     # Forward pass
     gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
-    gate, up = gate_up
-
+    gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+    gate = gate.clamp(min=None, max=limit)
+    up = up.clamp(min=-limit, max=limit)
     glu = gate * torch.sigmoid(gate * alpha)
-
-
-    return
-
+    next_states = torch.bmm(((up + 1) * glu), w2)
+    next_states += w2_bias[..., None, :]
+    return next_states

 # Shared expert MLP forward pass
 def shared_mlp_forward(
@@ -184,13 +375,13 @@ def shared_mlp_forward(

     # Up projection
     x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
-
+
     # Activation
     x = activation_fn(x)
-
+
     # Down projection
     x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
-
+
     return x


@@ -657,6 +848,7 @@ def parallel_forward_once(
 def moe_forward(
     x: torch.Tensor,
     router_weight: torch.Tensor,
+    router_bias: Optional[torch.Tensor],
     moe_top_k: int,
     moe_num_experts: int,
     moe_jitter_eps: float = None,
@@ -682,6 +874,7 @@ def moe_forward(
     logits, expert_weights, expert_indices = route_tokens(
         x,
         router_weight,
+        router_bias,
         moe_top_k,
         moe_num_experts,
         moe_jitter_eps,
@@ -743,6 +936,7 @@ def moe_forward(
 def moe_forward_with_shared_expert(
     x: torch.Tensor,
     router_weight: torch.Tensor,
+    router_bias: Optional[torch.Tensor],
     moe_top_k: int,
     moe_num_experts: int,
     moe_jitter_eps: float = None,
@@ -775,6 +969,7 @@ def moe_forward_with_shared_expert(
     expert_out, expert_weights, router_scores = moe_forward(
         x=x,
         router_weight=router_weight,
+        router_bias=router_bias,
         moe_top_k=moe_top_k,
         moe_num_experts=moe_num_experts,
         moe_jitter_eps=moe_jitter_eps,
@@ -795,7 +990,7 @@ def moe_forward_with_shared_expert(
         hidden_size=hidden_size,
         mlp_impl=mlp_impl,
     )
-
+
     # If shared expert weights provided, compute shared expert output
     if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
         shared_expert_out = shared_mlp_forward(
@@ -807,7 +1002,7 @@ def moe_forward_with_shared_expert(
             activation_fn=shared_activation_fn,
             gradient_scale=gradient_scale,
         )
-
+
         # Combine expert outputs
         combined_out = combine_expert_shared_outputs(
             shared_expert_out=shared_expert_out,
@@ -815,9 +1010,9 @@ def moe_forward_with_shared_expert(
             shared_expert_weighted_sum=shared_expert_weighted_sum,
             moe_top_k=moe_top_k,
         )
-
+
         return combined_out, expert_weights, router_scores
-
+
     # Return regular MoE output if no shared expert
     return expert_out, expert_weights, router_scores

@@ -833,7 +1028,7 @@ def create_shared_expert_weights(

     if output_layer_init_method is None:
         output_layer_init_method = init_method
-
+
     # Create weight tensors
     up_proj_weight = torch.empty(
         shared_expert_hidden_size,
@@ -847,14 +1042,15 @@ def create_shared_expert_weights(
         device=device,
         dtype=dtype,
     )
-
+
     # Initialize weights
     init_method(up_proj_weight)
     output_layer_init_method(down_proj_weight)
-
+
     # No bias by default
     return up_proj_weight, down_proj_weight, None, None

+
 # HACK: Extract device_mesh from pre-hook closure - required for transformers integration
 # This exists because device_mesh is trapped in hook closures with no model attribute
 # Fragile - breaks if hook structure changes or Python internals change
@@ -863,14 +1059,21 @@ def get_device_mesh(model):
     # Extract device_mesh from child's unused pre_hook closure
     try:
         # Find the pre-hook that contains 'device_mesh' in its closure
-        hook = next(
+        hook = next(
+            h
+            for h in model.experts._forward_pre_hooks.values()
+            if "device_mesh" in h.__code__.co_freevars
+        )
         # Extract the device_mesh from the closure
-        return hook.__closure__[
+        return hook.__closure__[
+            hook.__code__.co_freevars.index("device_mesh")
+        ].cell_contents
     except Exception:
         return None


 class MegaBlocksMoeMLP(torch.nn.Module):
+    can_torch_compile: bool = True

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         moe_top_k = getattr(self.router, "top_k", 4)
@@ -879,7 +1082,9 @@ class MegaBlocksMoeMLP(torch.nn.Module):
         alpha = getattr(self.experts, "alpha", 1.0)
         moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
         moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
-        moe_normalize_expert_weights = getattr(
+        moe_normalize_expert_weights = getattr(
+            self.experts, "normalize_expert_weights", None
+        )
         uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)

         expert_parallel_group = getattr(self, "expert_parallel_group", None)
@@ -887,15 +1092,21 @@ class MegaBlocksMoeMLP(torch.nn.Module):
         device_mesh = get_device_mesh(self)
         expert_parallel_group = device_mesh.get_group() if device_mesh else None

-        has_parallel =
+        has_parallel = (
+            expert_parallel_group is not None
+            and dist.is_initialized()
+            and dist.get_world_size(expert_parallel_group) > 1
+        )
         forward_fn = parallel_forward_once if has_parallel else forward_once
-
-        sort_end_bit = max(
+
+        sort_end_bit = max(
+            int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+        )
         mlp_impl = getattr(self, "mlp_impl", "grouped")
-
         output, expert_weights_out, *_ = moe_forward(
             x=x,
             router_weight=self.router.weight,
+            router_bias=self.router.bias,
             moe_top_k=moe_top_k,
             moe_num_experts=moe_num_experts,
             moe_jitter_eps=moe_jitter_eps,
@@ -919,8 +1130,12 @@ class MegaBlocksMoeMLP(torch.nn.Module):
         return output, expert_weights_out


+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
 class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
-
+
     def __init__(self):
         super().__init__()
         # Shared expert weights will be set by the user
@@ -930,7 +1145,7 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
         self.shared_down_proj_bias = None
         self.shared_expert_weighted_sum = False
         self.shared_activation_fn = None
-
+
     def set_shared_expert_weights(
         self,
         up_proj_weight: torch.Tensor,
@@ -946,7 +1161,7 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
         self.shared_down_proj_bias = down_proj_bias
         self.shared_expert_weighted_sum = weighted_sum
         self.shared_activation_fn = activation_fn
-
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         moe_top_k = getattr(self.router, "top_k", 4)
         moe_num_experts = getattr(self.experts, "num_experts", 128)
@@ -954,7 +1169,9 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
         alpha = getattr(self.experts, "alpha", 1.0)
         moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
         moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
-        moe_normalize_expert_weights = getattr(
+        moe_normalize_expert_weights = getattr(
+            self.experts, "normalize_expert_weights", None
+        )
         uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)

         expert_parallel_group = getattr(self, "expert_parallel_group", None)
@@ -962,15 +1179,22 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
         device_mesh = get_device_mesh(self)
         expert_parallel_group = device_mesh.get_group() if device_mesh else None

-        has_parallel =
+        has_parallel = (
+            expert_parallel_group is not None
+            and dist.is_initialized()
+            and dist.get_world_size(expert_parallel_group) > 1
+        )
         forward_fn = parallel_forward_once if has_parallel else forward_once
-
-        sort_end_bit = max(
+
+        sort_end_bit = max(
+            int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+        )
         mlp_impl = getattr(self, "mlp_impl", "grouped")
-
+
         output, expert_weights_out, *_ = moe_forward_with_shared_expert(
             x=x,
             router_weight=self.router.weight,
+            router_bias=self.router.bias,
             moe_top_k=moe_top_k,
             moe_num_experts=moe_num_experts,
             moe_jitter_eps=moe_jitter_eps,
@@ -998,4 +1222,4 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
             shared_expert_weighted_sum=self.shared_expert_weighted_sum,
             shared_activation_fn=self.shared_activation_fn,
         )
-        return output, expert_weights_out
+        return output, expert_weights_out
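For reference, the reworked mlp_forward above splits the fused gate/up projection by interleaved columns, clamps both halves, and applies a sigmoid-scaled GLU before the down projection. A small numeric sketch with made-up shapes (1 expert, 4 tokens, hidden size 8) and the alpha/limit defaults from the diff:

import torch

alpha, limit = 1.702, 7.0
x = torch.randn(1, 4, 8)        # (experts, tokens, hidden)
w1 = torch.randn(1, 8, 16)      # produces interleaved gate/up pairs
w1_bias = torch.randn(1, 16)
w2 = torch.randn(1, 8, 8)       # maps ffn/2 back to hidden
w2_bias = torch.randn(1, 8)

gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
gate, up = gate_up[..., ::2], gate_up[..., 1::2]   # de-interleave pairs
gate = gate.clamp(min=None, max=limit)
up = up.clamp(min=-limit, max=limit)
glu = gate * torch.sigmoid(gate * alpha)
out = torch.bmm((up + 1) * glu, w2) + w2_bias[..., None, :]
print(out.shape)                 # torch.Size([1, 4, 8])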
build/torch27-cxx11-cu126-x86_64-linux/megablocks/{_megablocks_3bdb4b8_dirty.abi3.so → _megablocks_8176cbe_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:eff5def5d00d090fb74d18f9d5101d8f41d71adc70fd478407380f0813a8ba44
 size 11927016
build/torch27-cxx11-cu126-x86_64-linux/megablocks/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_3bdb4b8_dirty
-ops = torch.ops._megablocks_3bdb4b8_dirty
+from . import _megablocks_8176cbe_dirty
+ops = torch.ops._megablocks_8176cbe_dirty

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_3bdb4b8_dirty::{op_name}"
+    return f"_megablocks_8176cbe_dirty::{op_name}"
build/torch27-cxx11-cu126-x86_64-linux/megablocks/layers.py
CHANGED
|
@@ -1,11 +1,200 @@
|
|
| 1 |
import torch
|
| 2 |
import torch.distributed as dist
|
| 3 |
|
| 4 |
-
from typing import Optional, Any
|
| 5 |
|
| 6 |
from . import _layers
|
| 7 |
from . import ops
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# Set the expert model parallel attributes on a tensor
|
| 11 |
def set_expert_model_parallel_attributes(
|
|
@@ -80,6 +269,7 @@ def compute_top_k(scores: torch.Tensor, moe_top_k: int):
|
|
| 80 |
def route_tokens(
|
| 81 |
x: torch.Tensor,
|
| 82 |
router_weight: torch.Tensor,
|
|
|
|
| 83 |
moe_top_k: int,
|
| 84 |
moe_num_experts: int,
|
| 85 |
moe_jitter_eps: float = None,
|
|
@@ -91,7 +281,7 @@ def route_tokens(
|
|
| 91 |
x = apply_jitter(x, moe_jitter_eps)
|
| 92 |
|
| 93 |
x_flat = x.view(-1, x.shape[-1])
|
| 94 |
-
logits = torch.nn.functional.linear(x_flat, router_weight)
|
| 95 |
expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
|
| 96 |
expert_weights = expert_weights.softmax(dim=-1)
|
| 97 |
if moe_normalize_expert_weights is not None:
|
|
@@ -129,6 +319,7 @@ def mlp_forward(
|
|
| 129 |
w2_bias: torch.Tensor,
|
| 130 |
gradient_scale: Optional[float] = None,
|
| 131 |
alpha: float = 1.702,
|
|
|
|
| 132 |
):
|
| 133 |
# Scale weights
|
| 134 |
w1 = scale_grad(w1, gradient_scale)
|
|
@@ -144,13 +335,13 @@ def mlp_forward(
|
|
| 144 |
|
| 145 |
# Forward pass
|
| 146 |
gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
|
| 147 |
-
gate, up = gate_up
|
| 148 |
-
|
|
|
|
| 149 |
glu = gate * torch.sigmoid(gate * alpha)
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
return
|
| 153 |
-
|
| 154 |
|
| 155 |
# Shared expert MLP forward pass
|
| 156 |
def shared_mlp_forward(
|
|
@@ -184,13 +375,13 @@ def shared_mlp_forward(
|
|
| 184 |
|
| 185 |
# Up projection
|
| 186 |
x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
|
| 187 |
-
|
| 188 |
# Activation
|
| 189 |
x = activation_fn(x)
|
| 190 |
-
|
| 191 |
# Down projection
|
| 192 |
x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
|
| 193 |
-
|
| 194 |
return x
|
| 195 |
|
| 196 |
|
|
@@ -657,6 +848,7 @@ def parallel_forward_once(
|
|
| 657 |
def moe_forward(
|
| 658 |
x: torch.Tensor,
|
| 659 |
router_weight: torch.Tensor,
|
|
|
|
| 660 |
moe_top_k: int,
|
| 661 |
moe_num_experts: int,
|
| 662 |
moe_jitter_eps: float = None,
|
|
@@ -682,6 +874,7 @@ def moe_forward(
|
|
| 682 |
logits, expert_weights, expert_indices = route_tokens(
|
| 683 |
x,
|
| 684 |
router_weight,
|
|
|
|
| 685 |
moe_top_k,
|
| 686 |
moe_num_experts,
|
| 687 |
moe_jitter_eps,
|
|
@@ -743,6 +936,7 @@ def moe_forward(
|
|
| 743 |
def moe_forward_with_shared_expert(
|
| 744 |
x: torch.Tensor,
|
| 745 |
router_weight: torch.Tensor,
|
|
|
|
| 746 |
moe_top_k: int,
|
| 747 |
moe_num_experts: int,
|
| 748 |
moe_jitter_eps: float = None,
|
|
@@ -775,6 +969,7 @@ def moe_forward_with_shared_expert(
|
|
| 775 |
expert_out, expert_weights, router_scores = moe_forward(
|
| 776 |
x=x,
|
| 777 |
router_weight=router_weight,
|
|
|
|
| 778 |
moe_top_k=moe_top_k,
|
| 779 |
moe_num_experts=moe_num_experts,
|
| 780 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -795,7 +990,7 @@ def moe_forward_with_shared_expert(
|
|
| 795 |
hidden_size=hidden_size,
|
| 796 |
mlp_impl=mlp_impl,
|
| 797 |
)
|
| 798 |
-
|
| 799 |
# If shared expert weights provided, compute shared expert output
|
| 800 |
if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
|
| 801 |
shared_expert_out = shared_mlp_forward(
|
|
@@ -807,7 +1002,7 @@ def moe_forward_with_shared_expert(
|
|
| 807 |
activation_fn=shared_activation_fn,
|
| 808 |
gradient_scale=gradient_scale,
|
| 809 |
)
|
| 810 |
-
|
| 811 |
# Combine expert outputs
|
| 812 |
combined_out = combine_expert_shared_outputs(
|
| 813 |
shared_expert_out=shared_expert_out,
|
|
@@ -815,9 +1010,9 @@ def moe_forward_with_shared_expert(
|
|
| 815 |
shared_expert_weighted_sum=shared_expert_weighted_sum,
|
| 816 |
moe_top_k=moe_top_k,
|
| 817 |
)
|
| 818 |
-
|
| 819 |
return combined_out, expert_weights, router_scores
|
| 820 |
-
|
| 821 |
# Return regular MoE output if no shared expert
|
| 822 |
return expert_out, expert_weights, router_scores
|
| 823 |
|
|
@@ -833,7 +1028,7 @@ def create_shared_expert_weights(
|
|
| 833 |
|
| 834 |
if output_layer_init_method is None:
|
| 835 |
output_layer_init_method = init_method
|
| 836 |
-
|
| 837 |
# Create weight tensors
|
| 838 |
up_proj_weight = torch.empty(
|
| 839 |
shared_expert_hidden_size,
|
|
@@ -847,14 +1042,15 @@ def create_shared_expert_weights(
|
|
| 847 |
device=device,
|
| 848 |
dtype=dtype,
|
| 849 |
)
|
| 850 |
-
|
| 851 |
# Initialize weights
|
| 852 |
init_method(up_proj_weight)
|
| 853 |
output_layer_init_method(down_proj_weight)
|
| 854 |
-
|
| 855 |
# No bias by default
|
| 856 |
return up_proj_weight, down_proj_weight, None, None
|
| 857 |
|
|
|
|
| 858 |
# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
|
| 859 |
# This exists because device_mesh is trapped in hook closures with no model attribute
|
| 860 |
# Fragile - breaks if hook structure changes or Python internals change
|
|
@@ -863,14 +1059,21 @@ def get_device_mesh(model):
|
|
| 863 |
# Extract device_mesh from child's unused pre_hook closure
|
| 864 |
try:
|
| 865 |
# Find the pre-hook that contains 'device_mesh' in its closure
|
| 866 |
-
hook = next(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 867 |
# Extract the device_mesh from the closure
|
| 868 |
-
return hook.__closure__[
|
|
|
|
|
|
|
| 869 |
except Exception:
|
| 870 |
return None
|
| 871 |
|
| 872 |
|
| 873 |
class MegaBlocksMoeMLP(torch.nn.Module):
|
|
|
|
| 874 |
|
| 875 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 876 |
moe_top_k = getattr(self.router, "top_k", 4)
|
|
@@ -879,7 +1082,9 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 879 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 880 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 881 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 882 |
-
moe_normalize_expert_weights = getattr(
|
|
|
|
|
|
|
| 883 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 884 |
|
| 885 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
@@ -887,15 +1092,21 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 887 |
device_mesh = get_device_mesh(self)
|
| 888 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 889 |
|
| 890 |
-
has_parallel =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 891 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 892 |
-
|
| 893 |
-
sort_end_bit = max(
|
|
|
|
|
|
|
| 894 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
| 895 |
-
|
| 896 |
output, expert_weights_out, *_ = moe_forward(
|
| 897 |
x=x,
|
| 898 |
router_weight=self.router.weight,
|
|
|
|
| 899 |
moe_top_k=moe_top_k,
|
| 900 |
moe_num_experts=moe_num_experts,
|
| 901 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -919,8 +1130,12 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 919 |
return output, expert_weights_out
|
| 920 |
|
| 921 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 922 |
class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
| 923 |
-
|
| 924 |
def __init__(self):
|
| 925 |
super().__init__()
|
| 926 |
# Shared expert weights will be set by the user
|
|
@@ -930,7 +1145,7 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 930 |
self.shared_down_proj_bias = None
|
| 931 |
self.shared_expert_weighted_sum = False
|
| 932 |
self.shared_activation_fn = None
|
| 933 |
-
|
| 934 |
def set_shared_expert_weights(
|
| 935 |
self,
|
| 936 |
up_proj_weight: torch.Tensor,
|
|
@@ -946,7 +1161,7 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 946 |
self.shared_down_proj_bias = down_proj_bias
|
| 947 |
self.shared_expert_weighted_sum = weighted_sum
|
| 948 |
self.shared_activation_fn = activation_fn
|
| 949 |
-
|
| 950 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 951 |
moe_top_k = getattr(self.router, "top_k", 4)
|
| 952 |
moe_num_experts = getattr(self.experts, "num_experts", 128)
|
|
@@ -954,7 +1169,9 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 954 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 955 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 956 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 957 |
-
moe_normalize_expert_weights = getattr(
|
|
|
|
|
|
|
| 958 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 959 |
|
| 960 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
@@ -962,15 +1179,22 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 962 |
device_mesh = get_device_mesh(self)
|
| 963 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 964 |
|
| 965 |
-
has_parallel =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 967 |
-
|
| 968 |
-
sort_end_bit = max(
|
|
|
|
|
|
|
| 969 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
| 970 |
-
|
| 971 |
output, expert_weights_out, *_ = moe_forward_with_shared_expert(
|
| 972 |
x=x,
|
| 973 |
router_weight=self.router.weight,
|
|
|
|
| 974 |
moe_top_k=moe_top_k,
|
| 975 |
moe_num_experts=moe_num_experts,
|
| 976 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -998,4 +1222,4 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 998 |
shared_expert_weighted_sum=self.shared_expert_weighted_sum,
|
| 999 |
shared_activation_fn=self.shared_activation_fn,
|
| 1000 |
)
|
| 1001 |
-
return output, expert_weights_out
|
|
|
|
| 1 |
import torch
|
| 2 |
import torch.distributed as dist
|
| 3 |
|
| 4 |
+
from typing import Optional, Any, TYPE_CHECKING
|
| 5 |
|
| 6 |
from . import _layers
|
| 7 |
from . import ops
|
| 8 |
|
| 9 |
+
# Conditional import for meta kernel registration
|
| 10 |
+
if TYPE_CHECKING:
|
| 11 |
+
|
| 12 |
+
def register_fake(fn):
|
| 13 |
+
return lambda name: fn
|
| 14 |
+
|
| 15 |
+
else:
|
| 16 |
+
try:
|
| 17 |
+
from torch.library import register_fake
|
| 18 |
+
except ImportError:
|
| 19 |
+
try:
|
| 20 |
+
from torch.library import impl_abstract as register_fake
|
| 21 |
+
except ImportError:
|
| 22 |
+
# Fallback for older PyTorch versions
|
| 23 |
+
def register_fake(op_name):
|
| 24 |
+
def decorator(fn):
|
| 25 |
+
return fn
|
| 26 |
+
|
| 27 |
+
return decorator
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Meta kernel implementations for torch.compile compatibility
|
| 31 |
+
def _install_meta_kernels():
|
| 32 |
+
"""Install meta kernels for existing MegaBlocks operations"""
|
| 33 |
+
|
| 34 |
+
# Create wrapper functions that check for compilation and return meta tensors
|
| 35 |
+
|
| 36 |
+
# Patch ops.sort
|
| 37 |
+
if hasattr(ops, "sort"):
|
| 38 |
+
original_sort = ops.sort
|
| 39 |
+
|
| 40 |
+
def sort_with_meta(x, end_bit=None):
|
| 41 |
+
if torch.compiler.is_compiling():
|
| 42 |
+
print("Using meta kernel for sort")
|
| 43 |
+
# Meta implementation - return tensors with correct shape/dtype/device
|
| 44 |
+
return torch.empty_like(x), torch.empty_like(x)
|
| 45 |
+
# print("Using original sort kernel")
|
| 46 |
+
return original_sort(x, end_bit)
|
| 47 |
+
|
| 48 |
+
ops.sort = sort_with_meta
|
| 49 |
+
|
| 50 |
+
# Patch ops.histogram
|
| 51 |
+
if hasattr(ops, "histogram"):
|
| 52 |
+
original_histogram = ops.histogram
|
| 53 |
+
|
| 54 |
+
def histogram_with_meta(x, max_val):
|
| 55 |
+
if torch.compiler.is_compiling():
|
| 56 |
+
# Meta implementation
|
| 57 |
+
return torch.empty((max_val,), dtype=torch.int32, device=x.device)
|
| 58 |
+
return original_histogram(x, max_val)
|
| 59 |
+
|
| 60 |
+
ops.histogram = histogram_with_meta
|
| 61 |
+
|
| 62 |
+
# Patch ops.inclusive_cumsum
|
| 63 |
+
if hasattr(ops, "inclusive_cumsum"):
|
| 64 |
+
original_inclusive_cumsum = ops.inclusive_cumsum
|
| 65 |
+
|
| 66 |
+
def inclusive_cumsum_with_meta(x, dim):
|
| 67 |
+
if torch.compiler.is_compiling():
|
| 68 |
+
# Meta implementation
|
| 69 |
+
return torch.empty_like(x)
|
| 70 |
+
return original_inclusive_cumsum(x, dim)
|
| 71 |
+
|
| 72 |
+
ops.inclusive_cumsum = inclusive_cumsum_with_meta
|
| 73 |
+
|
| 74 |
+
# Patch ops.binned_gather
|
| 75 |
+
if hasattr(ops, "binned_gather"):
|
| 76 |
+
original_binned_gather = ops.binned_gather
|
| 77 |
+
|
| 78 |
+
def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
|
| 79 |
+
if torch.compiler.is_compiling():
|
| 80 |
+
# Meta implementation - output shape based on bin_size
|
| 81 |
+
if x.dim() >= 2:
|
| 82 |
+
hidden_size = x.size(-1)
|
| 83 |
+
return torch.empty(
|
| 84 |
+
(bin_size, x.size(1), hidden_size),
|
| 85 |
+
dtype=x.dtype,
|
| 86 |
+
device=x.device,
|
| 87 |
+
)
|
| 88 |
+
else:
|
| 89 |
+
return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
|
| 90 |
+
return original_binned_gather(x, indices, bins, bin_size, top_k)
|
| 91 |
+
|
| 92 |
+
ops.binned_gather = binned_gather_with_meta
|
| 93 |
+
|
| 94 |
+
# Patch ops.binned_scatter
|
| 95 |
+
if hasattr(ops, "binned_scatter"):
|
| 96 |
+
original_binned_scatter = ops.binned_scatter
|
| 97 |
+
|
| 98 |
+
def binned_scatter_with_meta(x, indices, weights, bins, top_k):
|
| 99 |
+
if torch.compiler.is_compiling():
|
| 100 |
+
# Meta implementation - typically reduces to 2D
|
| 101 |
+
if x.dim() >= 3:
|
| 102 |
+
return torch.empty(
|
| 103 |
+
(x.size(1), x.size(2)), dtype=x.dtype, device=x.device
|
| 104 |
+
)
|
| 105 |
+
else:
|
| 106 |
+
return torch.empty_like(x)
|
| 107 |
+
return original_binned_scatter(x, indices, weights, bins, top_k)
|
| 108 |
+
|
| 109 |
+
ops.binned_scatter = binned_scatter_with_meta
|
| 110 |
+
|
| 111 |
+
# Patch ops.gather
|
| 112 |
+
if hasattr(ops, "gather"):
|
| 113 |
+
original_gather = ops.gather
|
| 114 |
+
|
| 115 |
+
def gather_with_meta(x, indices, bin_ids, bins, top_k):
|
| 116 |
+
if torch.compiler.is_compiling():
|
| 117 |
+
# Meta implementation
|
| 118 |
+
if x.dim() >= 2:
|
| 119 |
+
hidden_size = x.size(-1)
|
| 120 |
+
return torch.empty(
|
| 121 |
+
(indices.numel(), hidden_size), dtype=x.dtype, device=x.device
|
| 122 |
+
)
|
| 123 |
+
else:
|
| 124 |
+
return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
|
| 125 |
+
return original_gather(x, indices, bin_ids, bins, top_k)
|
| 126 |
+
|
| 127 |
+
ops.gather = gather_with_meta
|
| 128 |
+
|
| 129 |
+
# Patch ops.scatter
|
| 130 |
+
if hasattr(ops, "scatter"):
|
| 131 |
+
original_scatter = ops.scatter
|
| 132 |
+
|
| 133 |
+
def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
|
| 134 |
+
if torch.compiler.is_compiling():
|
| 135 |
+
# Meta implementation - restore sequence shape
|
| 136 |
+
seq_len = (
|
| 137 |
+
indices.size(0) // top_k
|
| 138 |
+
if indices.numel() > 0 and top_k > 0
|
| 139 |
+
else x.size(0)
|
| 140 |
+
)
|
| 141 |
+
if x.dim() >= 2:
|
| 142 |
+
return torch.empty(
|
| 143 |
+
(seq_len, x.size(-1)), dtype=x.dtype, device=x.device
|
| 144 |
+
)
|
| 145 |
+
else:
|
| 146 |
+
return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
|
| 147 |
+
return original_scatter(x, indices, bin_ids, weights, bins, top_k)
|
| 148 |
+
|
| 149 |
+
ops.scatter = scatter_with_meta
|
| 150 |
+
|
| 151 |
+
# Patch ops.replicate
|
| 152 |
+
if hasattr(ops, "replicate"):
|
| 153 |
+
original_replicate = ops.replicate
|
| 154 |
+
|
| 155 |
+
def replicate_with_meta(x, bins, num_outputs):
|
| 156 |
+
if torch.compiler.is_compiling():
|
| 157 |
+
# Meta implementation
|
| 158 |
+
return torch.empty(
|
| 159 |
+
(x.shape[0], num_outputs), dtype=x.dtype, device=x.device
|
| 160 |
+
)
|
| 161 |
+
return original_replicate(x, bins, num_outputs)
|
| 162 |
+
|
| 163 |
+
ops.replicate = replicate_with_meta
|
| 164 |
+
|
| 165 |
+
# Patch ops.repeat (if it's a regular function)
|
| 166 |
+
if hasattr(ops, "repeat"):
|
| 167 |
+
original_repeat = ops.repeat
|
| 168 |
+
|
| 169 |
+
def repeat_with_meta(x, repeats):
|
| 170 |
+
if torch.compiler.is_compiling():
|
| 171 |
+
# Meta implementation
|
| 172 |
+
if isinstance(repeats, (tuple, list)):
|
| 173 |
+
new_shape = list(x.shape)
|
| 174 |
+
for i, rep in enumerate(repeats):
|
| 175 |
+
if i < len(new_shape):
|
| 176 |
+
new_shape[i] *= rep
|
| 177 |
+
return torch.empty(new_shape, dtype=x.dtype, device=x.device)
|
| 178 |
+
else:
|
| 179 |
+
new_shape = [x.size(0) * repeats] + list(x.shape[1:])
|
| 180 |
+
return torch.empty(new_shape, dtype=x.dtype, device=x.device)
|
| 181 |
+
return original_repeat(x, repeats)
|
| 182 |
+
|
| 183 |
+
ops.repeat = repeat_with_meta
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# Install meta kernels on import
|
| 187 |
+
try:
|
| 188 |
+
_install_meta_kernels()
|
| 189 |
+
except Exception as e:
|
| 190 |
+
# If meta kernel installation fails, continue without them
|
| 191 |
+
# torch.compile may not work but the library will still function
|
| 192 |
+
import warnings
|
| 193 |
+
|
| 194 |
+
warnings.warn(
|
| 195 |
+
f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
|
| 199 |
# Set the expert model parallel attributes on a tensor
|
| 200 |
def set_expert_model_parallel_attributes(
|
|
|
|
| 269 |
def route_tokens(
|
| 270 |
x: torch.Tensor,
|
| 271 |
router_weight: torch.Tensor,
|
| 272 |
+
router_bias: torch.Tensor,
|
| 273 |
moe_top_k: int,
|
| 274 |
moe_num_experts: int,
|
| 275 |
moe_jitter_eps: float = None,
|
|
|
|
| 281 |
x = apply_jitter(x, moe_jitter_eps)
|
| 282 |
|
| 283 |
x_flat = x.view(-1, x.shape[-1])
|
| 284 |
+
logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
|
| 285 |
expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
|
| 286 |
expert_weights = expert_weights.softmax(dim=-1)
|
| 287 |
if moe_normalize_expert_weights is not None:
|
|
|
|
| 319 |
w2_bias: torch.Tensor,
|
| 320 |
gradient_scale: Optional[float] = None,
|
| 321 |
alpha: float = 1.702,
|
| 322 |
+
limit: float = 7.0,
|
| 323 |
):
|
| 324 |
# Scale weights
|
| 325 |
w1 = scale_grad(w1, gradient_scale)
|
|
|
|
| 335 |
|
| 336 |
# Forward pass
|
| 337 |
gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
|
| 338 |
+
gate, up = gate_up[..., ::2], gate_up[..., 1::2]
|
| 339 |
+
gate = gate.clamp(min=None, max=limit)
|
| 340 |
+
up = up.clamp(min=-limit, max=limit)
|
| 341 |
glu = gate * torch.sigmoid(gate * alpha)
|
| 342 |
+
next_states = torch.bmm(((up + 1) * glu), w2)
|
| 343 |
+
next_states += w2_bias[..., None, :]
|
| 344 |
+
return next_states
|
|
|
|
| 345 |
|
| 346 |
# Shared expert MLP forward pass
|
| 347 |
def shared_mlp_forward(
|
|
|
|
| 375 |
|
| 376 |
# Up projection
|
| 377 |
x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
|
| 378 |
+
|
| 379 |
# Activation
|
| 380 |
x = activation_fn(x)
|
| 381 |
+
|
| 382 |
# Down projection
|
| 383 |
x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
|
| 384 |
+
|
| 385 |
return x
|
| 386 |
|
| 387 |
|
|
|
|
| 848 |
def moe_forward(
|
| 849 |
x: torch.Tensor,
|
| 850 |
router_weight: torch.Tensor,
|
| 851 |
+
router_bias: Optional[torch.Tensor],
|
| 852 |
moe_top_k: int,
|
| 853 |
moe_num_experts: int,
|
| 854 |
moe_jitter_eps: float = None,
|
|
|
|
| 874 |
logits, expert_weights, expert_indices = route_tokens(
|
| 875 |
x,
|
| 876 |
router_weight,
|
| 877 |
+
router_bias,
|
| 878 |
moe_top_k,
|
| 879 |
moe_num_experts,
|
| 880 |
moe_jitter_eps,
|
|
|
|
| 936 |
def moe_forward_with_shared_expert(
|
| 937 |
x: torch.Tensor,
|
| 938 |
router_weight: torch.Tensor,
|
| 939 |
+
router_bias: Optional[torch.Tensor],
|
| 940 |
moe_top_k: int,
|
| 941 |
moe_num_experts: int,
|
| 942 |
moe_jitter_eps: float = None,
|
|
|
|
| 969 |
expert_out, expert_weights, router_scores = moe_forward(
|
| 970 |
x=x,
|
| 971 |
router_weight=router_weight,
|
| 972 |
+
router_bias=router_bias,
|
| 973 |
moe_top_k=moe_top_k,
|
| 974 |
moe_num_experts=moe_num_experts,
|
| 975 |
moe_jitter_eps=moe_jitter_eps,
|
|
|
|
| 990 |
hidden_size=hidden_size,
|
| 991 |
mlp_impl=mlp_impl,
|
| 992 |
)
|
| 993 |
+
|
| 994 |
# If shared expert weights provided, compute shared expert output
|
| 995 |
if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
|
| 996 |
shared_expert_out = shared_mlp_forward(
|
|
|
|
| 1002 |
activation_fn=shared_activation_fn,
|
| 1003 |
gradient_scale=gradient_scale,
|
| 1004 |
)
|
| 1005 |
+
|
| 1006 |
# Combine expert outputs
|
| 1007 |
combined_out = combine_expert_shared_outputs(
|
| 1008 |
shared_expert_out=shared_expert_out,
|
|
|
|
| 1010 |
shared_expert_weighted_sum=shared_expert_weighted_sum,
|
| 1011 |
moe_top_k=moe_top_k,
|
| 1012 |
)
|
| 1013 |
+
|
| 1014 |
return combined_out, expert_weights, router_scores
|
| 1015 |
+
|
| 1016 |
1016       # Return regular MoE output if no shared expert
1017       return expert_out, expert_weights, router_scores
1018
...
1028
1029       if output_layer_init_method is None:
1030           output_layer_init_method = init_method
1031 +
1032       # Create weight tensors
1033       up_proj_weight = torch.empty(
1034           shared_expert_hidden_size,
...
1042           device=device,
1043           dtype=dtype,
1044       )
1045 +
1046       # Initialize weights
1047       init_method(up_proj_weight)
1048       output_layer_init_method(down_proj_weight)
1049 +
1050       # No bias by default
1051       return up_proj_weight, down_proj_weight, None, None
1052
1053 +
1054   # HACK: Extract device_mesh from pre-hook closure - required for transformers integration
1055   # This exists because device_mesh is trapped in hook closures with no model attribute
1056   # Fragile - breaks if hook structure changes or Python internals change
...
1059       # Extract device_mesh from child's unused pre_hook closure
1060       try:
1061           # Find the pre-hook that contains 'device_mesh' in its closure
1062 +         hook = next(
1063 +             h
1064 +             for h in model.experts._forward_pre_hooks.values()
1065 +             if "device_mesh" in h.__code__.co_freevars
1066 +         )
1067           # Extract the device_mesh from the closure
1068 +         return hook.__closure__[
1069 +             hook.__code__.co_freevars.index("device_mesh")
1070 +         ].cell_contents
1071       except Exception:
1072           return None
1073
1074
1075   class MegaBlocksMoeMLP(torch.nn.Module):
1076 +     can_torch_compile: bool = True
1077
1078       def forward(self, x: torch.Tensor) -> torch.Tensor:
1079           moe_top_k = getattr(self.router, "top_k", 4)
...
1082           alpha = getattr(self.experts, "alpha", 1.0)
1083           moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
1084           moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
1085 +         moe_normalize_expert_weights = getattr(
1086 +             self.experts, "normalize_expert_weights", None
1087 +         )
1088           uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
1089
1090           expert_parallel_group = getattr(self, "expert_parallel_group", None)
...
1092           device_mesh = get_device_mesh(self)
1093           expert_parallel_group = device_mesh.get_group() if device_mesh else None
1094
1095 +         has_parallel = (
1096 +             expert_parallel_group is not None
1097 +             and dist.is_initialized()
1098 +             and dist.get_world_size(expert_parallel_group) > 1
1099 +         )
1100           forward_fn = parallel_forward_once if has_parallel else forward_once
1101 +
1102 +         sort_end_bit = max(
1103 +             int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
1104 +         )
1105           mlp_impl = getattr(self, "mlp_impl", "grouped")
...
1106           output, expert_weights_out, *_ = moe_forward(
1107               x=x,
1108               router_weight=self.router.weight,
1109 +             router_bias=self.router.bias,
1110               moe_top_k=moe_top_k,
1111               moe_num_experts=moe_num_experts,
1112               moe_jitter_eps=moe_jitter_eps,
...
1130           return output, expert_weights_out
1131
1132
1133 + # Export main classes
1134 + __all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
1135 +
1136 +
1137   class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
1138 +
1139       def __init__(self):
1140           super().__init__()
1141           # Shared expert weights will be set by the user
...
1145           self.shared_down_proj_bias = None
1146           self.shared_expert_weighted_sum = False
1147           self.shared_activation_fn = None
1148 +
1149       def set_shared_expert_weights(
1150           self,
1151           up_proj_weight: torch.Tensor,
...
1161           self.shared_down_proj_bias = down_proj_bias
1162           self.shared_expert_weighted_sum = weighted_sum
1163           self.shared_activation_fn = activation_fn
1164 +
1165       def forward(self, x: torch.Tensor) -> torch.Tensor:
1166           moe_top_k = getattr(self.router, "top_k", 4)
1167           moe_num_experts = getattr(self.experts, "num_experts", 128)
...
1169           alpha = getattr(self.experts, "alpha", 1.0)
1170           moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
1171           moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
1172 +         moe_normalize_expert_weights = getattr(
1173 +             self.experts, "normalize_expert_weights", None
1174 +         )
1175           uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
1176
1177           expert_parallel_group = getattr(self, "expert_parallel_group", None)
...
1179           device_mesh = get_device_mesh(self)
1180           expert_parallel_group = device_mesh.get_group() if device_mesh else None
1181
1182 +         has_parallel = (
1183 +             expert_parallel_group is not None
1184 +             and dist.is_initialized()
1185 +             and dist.get_world_size(expert_parallel_group) > 1
1186 +         )
1187           forward_fn = parallel_forward_once if has_parallel else forward_once
1188 +
1189 +         sort_end_bit = max(
1190 +             int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
1191 +         )
1192           mlp_impl = getattr(self, "mlp_impl", "grouped")
1193 +
1194           output, expert_weights_out, *_ = moe_forward_with_shared_expert(
1195               x=x,
1196               router_weight=self.router.weight,
1197 +             router_bias=self.router.bias,
1198               moe_top_k=moe_top_k,
1199               moe_num_experts=moe_num_experts,
1200               moe_jitter_eps=moe_jitter_eps,
...
1222               shared_expert_weighted_sum=self.shared_expert_weighted_sum,
1223               shared_activation_fn=self.shared_activation_fn,
1224           )
1225 +         return output, expert_weights_out
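The can_torch_compile flag and the router_bias plumbing above are the user-visible side of the torch.compile support this build carries. A minimal sketch of how an integration layer might consume that flag follows; the helper name and the idea of gating on the attribute are assumptions for illustration, not part of this commit:

import torch

def maybe_compile(module: torch.nn.Module) -> torch.nn.Module:
    # Hypothetical helper: compile only modules that opt in via the
    # can_torch_compile class attribute added above; otherwise stay eager.
    if getattr(module, "can_torch_compile", False):
        return torch.compile(module)
    return module

# e.g. mlp = maybe_compile(mlp) after the framework has attached .router and .experts.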
build/torch27-cxx11-cu128-x86_64-linux/megablocks/{_megablocks_3bdb4b8_dirty.abi3.so → _megablocks_8176cbe_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 1   version https://git-lfs.github.com/spec/v1
 2 - oid sha256:
 3   size 17884448
 1   version https://git-lfs.github.com/spec/v1
 2 + oid sha256:2f22a56b5e69d365a2f077c9713f4eae7d325e201fee3cf705af6663d9cb854a
 3   size 17884448
build/torch27-cxx11-cu128-x86_64-linux/megablocks/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 1   import torch
 2 - from . import
 3 - ops = torch.ops.
 4
 5   def add_op_namespace_prefix(op_name: str):
 6       """
 7       Prefix op by namespace.
 8       """
 9 -     return f"
 1   import torch
 2 + from . import _megablocks_8176cbe_dirty
 3 + ops = torch.ops._megablocks_8176cbe_dirty
 4
 5   def add_op_namespace_prefix(op_name: str):
 6       """
 7       Prefix op by namespace.
 8       """
 9 +     return f"_megablocks_8176cbe_dirty::{op_name}"
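_ops.py itself only repoints the Python-side bindings at the rebuilt extension. A small sketch of how the helper is meant to be used; the op name "sort" is illustrative here, and which operators the compiled namespace actually exposes is defined by the C++/CUDA build, not by this file:

from megablocks._ops import ops, add_op_namespace_prefix

# Fully qualified schema name, e.g. "_megablocks_8176cbe_dirty::sort";
# useful when registering meta/fake kernels with torch.library.
qualified_name = add_op_namespace_prefix("sort")

# Direct access to the bound torch.ops namespace (assuming the extension
# registers an op named "sort"):
sort_op = getattr(ops, "sort", None)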
build/torch27-cxx11-cu128-x86_64-linux/megablocks/layers.py
CHANGED
@@ -1,11 +1,200 @@
 1   import torch
 2   import torch.distributed as dist
 3
 4 - from typing import Optional, Any
 5
 6   from . import _layers
 7   from . import ops
 8
| 9 |
|
| 10 |
# Set the expert model parallel attributes on a tensor
|
| 11 |
def set_expert_model_parallel_attributes(
|
|
@@ -80,6 +269,7 @@ def compute_top_k(scores: torch.Tensor, moe_top_k: int):
|
|
| 80 |
def route_tokens(
|
| 81 |
x: torch.Tensor,
|
| 82 |
router_weight: torch.Tensor,
|
|
|
|
| 83 |
moe_top_k: int,
|
| 84 |
moe_num_experts: int,
|
| 85 |
moe_jitter_eps: float = None,
|
|
@@ -91,7 +281,7 @@ def route_tokens(
|
|
| 91 |
x = apply_jitter(x, moe_jitter_eps)
|
| 92 |
|
| 93 |
x_flat = x.view(-1, x.shape[-1])
|
| 94 |
-
logits = torch.nn.functional.linear(x_flat, router_weight)
|
| 95 |
expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
|
| 96 |
expert_weights = expert_weights.softmax(dim=-1)
|
| 97 |
if moe_normalize_expert_weights is not None:
|
|
@@ -129,6 +319,7 @@ def mlp_forward(
|
|
| 129 |
w2_bias: torch.Tensor,
|
| 130 |
gradient_scale: Optional[float] = None,
|
| 131 |
alpha: float = 1.702,
|
|
|
|
| 132 |
):
|
| 133 |
# Scale weights
|
| 134 |
w1 = scale_grad(w1, gradient_scale)
|
|
@@ -144,13 +335,13 @@ def mlp_forward(
|
|
| 144 |
|
| 145 |
# Forward pass
|
| 146 |
gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
|
| 147 |
-
gate, up = gate_up
|
| 148 |
-
|
|
|
|
| 149 |
glu = gate * torch.sigmoid(gate * alpha)
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
return
|
| 153 |
-
|
| 154 |
|
| 155 |
# Shared expert MLP forward pass
|
| 156 |
def shared_mlp_forward(
|
|
@@ -184,13 +375,13 @@ def shared_mlp_forward(
|
|
| 184 |
|
| 185 |
# Up projection
|
| 186 |
x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
|
| 187 |
-
|
| 188 |
# Activation
|
| 189 |
x = activation_fn(x)
|
| 190 |
-
|
| 191 |
# Down projection
|
| 192 |
x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
|
| 193 |
-
|
| 194 |
return x
|
| 195 |
|
| 196 |
|
|
@@ -657,6 +848,7 @@ def parallel_forward_once(
|
|
| 657 |
def moe_forward(
|
| 658 |
x: torch.Tensor,
|
| 659 |
router_weight: torch.Tensor,
|
|
|
|
| 660 |
moe_top_k: int,
|
| 661 |
moe_num_experts: int,
|
| 662 |
moe_jitter_eps: float = None,
|
|
@@ -682,6 +874,7 @@ def moe_forward(
|
|
| 682 |
logits, expert_weights, expert_indices = route_tokens(
|
| 683 |
x,
|
| 684 |
router_weight,
|
|
|
|
| 685 |
moe_top_k,
|
| 686 |
moe_num_experts,
|
| 687 |
moe_jitter_eps,
|
|
@@ -743,6 +936,7 @@ def moe_forward(
|
|
| 743 |
def moe_forward_with_shared_expert(
|
| 744 |
x: torch.Tensor,
|
| 745 |
router_weight: torch.Tensor,
|
|
|
|
| 746 |
moe_top_k: int,
|
| 747 |
moe_num_experts: int,
|
| 748 |
moe_jitter_eps: float = None,
|
|
@@ -775,6 +969,7 @@ def moe_forward_with_shared_expert(
|
|
| 775 |
expert_out, expert_weights, router_scores = moe_forward(
|
| 776 |
x=x,
|
| 777 |
router_weight=router_weight,
|
|
|
|
| 778 |
moe_top_k=moe_top_k,
|
| 779 |
moe_num_experts=moe_num_experts,
|
| 780 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -795,7 +990,7 @@ def moe_forward_with_shared_expert(
|
|
| 795 |
hidden_size=hidden_size,
|
| 796 |
mlp_impl=mlp_impl,
|
| 797 |
)
|
| 798 |
-
|
| 799 |
# If shared expert weights provided, compute shared expert output
|
| 800 |
if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
|
| 801 |
shared_expert_out = shared_mlp_forward(
|
|
@@ -807,7 +1002,7 @@ def moe_forward_with_shared_expert(
|
|
| 807 |
activation_fn=shared_activation_fn,
|
| 808 |
gradient_scale=gradient_scale,
|
| 809 |
)
|
| 810 |
-
|
| 811 |
# Combine expert outputs
|
| 812 |
combined_out = combine_expert_shared_outputs(
|
| 813 |
shared_expert_out=shared_expert_out,
|
|
@@ -815,9 +1010,9 @@ def moe_forward_with_shared_expert(
|
|
| 815 |
shared_expert_weighted_sum=shared_expert_weighted_sum,
|
| 816 |
moe_top_k=moe_top_k,
|
| 817 |
)
|
| 818 |
-
|
| 819 |
return combined_out, expert_weights, router_scores
|
| 820 |
-
|
| 821 |
# Return regular MoE output if no shared expert
|
| 822 |
return expert_out, expert_weights, router_scores
|
| 823 |
|
|
@@ -833,7 +1028,7 @@ def create_shared_expert_weights(
|
|
| 833 |
|
| 834 |
if output_layer_init_method is None:
|
| 835 |
output_layer_init_method = init_method
|
| 836 |
-
|
| 837 |
# Create weight tensors
|
| 838 |
up_proj_weight = torch.empty(
|
| 839 |
shared_expert_hidden_size,
|
|
@@ -847,14 +1042,15 @@ def create_shared_expert_weights(
|
|
| 847 |
device=device,
|
| 848 |
dtype=dtype,
|
| 849 |
)
|
| 850 |
-
|
| 851 |
# Initialize weights
|
| 852 |
init_method(up_proj_weight)
|
| 853 |
output_layer_init_method(down_proj_weight)
|
| 854 |
-
|
| 855 |
# No bias by default
|
| 856 |
return up_proj_weight, down_proj_weight, None, None
|
| 857 |
|
|
|
|
| 858 |
# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
|
| 859 |
# This exists because device_mesh is trapped in hook closures with no model attribute
|
| 860 |
# Fragile - breaks if hook structure changes or Python internals change
|
|
@@ -863,14 +1059,21 @@ def get_device_mesh(model):
|
|
| 863 |
# Extract device_mesh from child's unused pre_hook closure
|
| 864 |
try:
|
| 865 |
# Find the pre-hook that contains 'device_mesh' in its closure
|
| 866 |
-
hook = next(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 867 |
# Extract the device_mesh from the closure
|
| 868 |
-
return hook.__closure__[
|
|
|
|
|
|
|
| 869 |
except Exception:
|
| 870 |
return None
|
| 871 |
|
| 872 |
|
| 873 |
class MegaBlocksMoeMLP(torch.nn.Module):
|
|
|
|
| 874 |
|
| 875 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 876 |
moe_top_k = getattr(self.router, "top_k", 4)
|
|
@@ -879,7 +1082,9 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 879 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 880 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 881 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 882 |
-
moe_normalize_expert_weights = getattr(
|
|
|
|
|
|
|
| 883 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 884 |
|
| 885 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
@@ -887,15 +1092,21 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 887 |
device_mesh = get_device_mesh(self)
|
| 888 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 889 |
|
| 890 |
-
has_parallel =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 891 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 892 |
-
|
| 893 |
-
sort_end_bit = max(
|
|
|
|
|
|
|
| 894 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
| 895 |
-
|
| 896 |
output, expert_weights_out, *_ = moe_forward(
|
| 897 |
x=x,
|
| 898 |
router_weight=self.router.weight,
|
|
|
|
| 899 |
moe_top_k=moe_top_k,
|
| 900 |
moe_num_experts=moe_num_experts,
|
| 901 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -919,8 +1130,12 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 919 |
return output, expert_weights_out
|
| 920 |
|
| 921 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 922 |
class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
| 923 |
-
|
| 924 |
def __init__(self):
|
| 925 |
super().__init__()
|
| 926 |
# Shared expert weights will be set by the user
|
|
@@ -930,7 +1145,7 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 930 |
self.shared_down_proj_bias = None
|
| 931 |
self.shared_expert_weighted_sum = False
|
| 932 |
self.shared_activation_fn = None
|
| 933 |
-
|
| 934 |
def set_shared_expert_weights(
|
| 935 |
self,
|
| 936 |
up_proj_weight: torch.Tensor,
|
|
@@ -946,7 +1161,7 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 946 |
self.shared_down_proj_bias = down_proj_bias
|
| 947 |
self.shared_expert_weighted_sum = weighted_sum
|
| 948 |
self.shared_activation_fn = activation_fn
|
| 949 |
-
|
| 950 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 951 |
moe_top_k = getattr(self.router, "top_k", 4)
|
| 952 |
moe_num_experts = getattr(self.experts, "num_experts", 128)
|
|
@@ -954,7 +1169,9 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 954 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 955 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 956 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 957 |
-
moe_normalize_expert_weights = getattr(
|
|
|
|
|
|
|
| 958 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 959 |
|
| 960 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
@@ -962,15 +1179,22 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 962 |
device_mesh = get_device_mesh(self)
|
| 963 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 964 |
|
| 965 |
-
has_parallel =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 967 |
-
|
| 968 |
-
sort_end_bit = max(
|
|
|
|
|
|
|
| 969 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
| 970 |
-
|
| 971 |
output, expert_weights_out, *_ = moe_forward_with_shared_expert(
|
| 972 |
x=x,
|
| 973 |
router_weight=self.router.weight,
|
|
|
|
| 974 |
moe_top_k=moe_top_k,
|
| 975 |
moe_num_experts=moe_num_experts,
|
| 976 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -998,4 +1222,4 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 998 |
shared_expert_weighted_sum=self.shared_expert_weighted_sum,
|
| 999 |
shared_activation_fn=self.shared_activation_fn,
|
| 1000 |
)
|
| 1001 |
-
return output, expert_weights_out
|
|
|
|
| 1 |
import torch
|
| 2 |
import torch.distributed as dist
|
| 3 |
|
| 4 |
+
from typing import Optional, Any, TYPE_CHECKING
|
| 5 |
|
| 6 |
from . import _layers
|
| 7 |
from . import ops
|
| 8 |
|
| 9 |
+
# Conditional import for meta kernel registration
|
| 10 |
+
if TYPE_CHECKING:
|
| 11 |
+
|
| 12 |
+
def register_fake(fn):
|
| 13 |
+
return lambda name: fn
|
| 14 |
+
|
| 15 |
+
else:
|
| 16 |
+
try:
|
| 17 |
+
from torch.library import register_fake
|
| 18 |
+
except ImportError:
|
| 19 |
+
try:
|
| 20 |
+
from torch.library import impl_abstract as register_fake
|
| 21 |
+
except ImportError:
|
| 22 |
+
# Fallback for older PyTorch versions
|
| 23 |
+
def register_fake(op_name):
|
| 24 |
+
def decorator(fn):
|
| 25 |
+
return fn
|
| 26 |
+
|
| 27 |
+
return decorator
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Meta kernel implementations for torch.compile compatibility
|
| 31 |
+
def _install_meta_kernels():
|
| 32 |
+
"""Install meta kernels for existing MegaBlocks operations"""
|
| 33 |
+
|
| 34 |
+
# Create wrapper functions that check for compilation and return meta tensors
|
| 35 |
+
|
| 36 |
+
# Patch ops.sort
|
| 37 |
+
if hasattr(ops, "sort"):
|
| 38 |
+
original_sort = ops.sort
|
| 39 |
+
|
| 40 |
+
def sort_with_meta(x, end_bit=None):
|
| 41 |
+
if torch.compiler.is_compiling():
|
| 42 |
+
print("Using meta kernel for sort")
|
| 43 |
+
# Meta implementation - return tensors with correct shape/dtype/device
|
| 44 |
+
return torch.empty_like(x), torch.empty_like(x)
|
| 45 |
+
# print("Using original sort kernel")
|
| 46 |
+
return original_sort(x, end_bit)
|
| 47 |
+
|
| 48 |
+
ops.sort = sort_with_meta
|
| 49 |
+
|
| 50 |
+
# Patch ops.histogram
|
| 51 |
+
if hasattr(ops, "histogram"):
|
| 52 |
+
original_histogram = ops.histogram
|
| 53 |
+
|
| 54 |
+
def histogram_with_meta(x, max_val):
|
| 55 |
+
if torch.compiler.is_compiling():
|
| 56 |
+
# Meta implementation
|
| 57 |
+
return torch.empty((max_val,), dtype=torch.int32, device=x.device)
|
| 58 |
+
return original_histogram(x, max_val)
|
| 59 |
+
|
| 60 |
+
ops.histogram = histogram_with_meta
|
| 61 |
+
|
| 62 |
+
# Patch ops.inclusive_cumsum
|
| 63 |
+
if hasattr(ops, "inclusive_cumsum"):
|
| 64 |
+
original_inclusive_cumsum = ops.inclusive_cumsum
|
| 65 |
+
|
| 66 |
+
def inclusive_cumsum_with_meta(x, dim):
|
| 67 |
+
if torch.compiler.is_compiling():
|
| 68 |
+
# Meta implementation
|
| 69 |
+
return torch.empty_like(x)
|
| 70 |
+
return original_inclusive_cumsum(x, dim)
|
| 71 |
+
|
| 72 |
+
ops.inclusive_cumsum = inclusive_cumsum_with_meta
|
| 73 |
+
|
| 74 |
+
# Patch ops.binned_gather
|
| 75 |
+
if hasattr(ops, "binned_gather"):
|
| 76 |
+
original_binned_gather = ops.binned_gather
|
| 77 |
+
|
| 78 |
+
def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
|
| 79 |
+
if torch.compiler.is_compiling():
|
| 80 |
+
# Meta implementation - output shape based on bin_size
|
| 81 |
+
if x.dim() >= 2:
|
| 82 |
+
hidden_size = x.size(-1)
|
| 83 |
+
return torch.empty(
|
| 84 |
+
(bin_size, x.size(1), hidden_size),
|
| 85 |
+
dtype=x.dtype,
|
| 86 |
+
device=x.device,
|
| 87 |
+
)
|
| 88 |
+
else:
|
| 89 |
+
return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
|
| 90 |
+
return original_binned_gather(x, indices, bins, bin_size, top_k)
|
| 91 |
+
|
| 92 |
+
ops.binned_gather = binned_gather_with_meta
|
| 93 |
+
|
| 94 |
+
# Patch ops.binned_scatter
|
| 95 |
+
if hasattr(ops, "binned_scatter"):
|
| 96 |
+
original_binned_scatter = ops.binned_scatter
|
| 97 |
+
|
| 98 |
+
def binned_scatter_with_meta(x, indices, weights, bins, top_k):
|
| 99 |
+
if torch.compiler.is_compiling():
|
| 100 |
+
# Meta implementation - typically reduces to 2D
|
| 101 |
+
if x.dim() >= 3:
|
| 102 |
+
return torch.empty(
|
| 103 |
+
(x.size(1), x.size(2)), dtype=x.dtype, device=x.device
|
| 104 |
+
)
|
| 105 |
+
else:
|
| 106 |
+
return torch.empty_like(x)
|
| 107 |
+
return original_binned_scatter(x, indices, weights, bins, top_k)
|
| 108 |
+
|
| 109 |
+
ops.binned_scatter = binned_scatter_with_meta
|
| 110 |
+
|
| 111 |
+
# Patch ops.gather
|
| 112 |
+
if hasattr(ops, "gather"):
|
| 113 |
+
original_gather = ops.gather
|
| 114 |
+
|
| 115 |
+
def gather_with_meta(x, indices, bin_ids, bins, top_k):
|
| 116 |
+
if torch.compiler.is_compiling():
|
| 117 |
+
# Meta implementation
|
| 118 |
+
if x.dim() >= 2:
|
| 119 |
+
hidden_size = x.size(-1)
|
| 120 |
+
return torch.empty(
|
| 121 |
+
(indices.numel(), hidden_size), dtype=x.dtype, device=x.device
|
| 122 |
+
)
|
| 123 |
+
else:
|
| 124 |
+
return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
|
| 125 |
+
return original_gather(x, indices, bin_ids, bins, top_k)
|
| 126 |
+
|
| 127 |
+
ops.gather = gather_with_meta
|
| 128 |
+
|
| 129 |
+
# Patch ops.scatter
|
| 130 |
+
if hasattr(ops, "scatter"):
|
| 131 |
+
original_scatter = ops.scatter
|
| 132 |
+
|
| 133 |
+
def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
|
| 134 |
+
if torch.compiler.is_compiling():
|
| 135 |
+
# Meta implementation - restore sequence shape
|
| 136 |
+
seq_len = (
|
| 137 |
+
indices.size(0) // top_k
|
| 138 |
+
if indices.numel() > 0 and top_k > 0
|
| 139 |
+
else x.size(0)
|
| 140 |
+
)
|
| 141 |
+
if x.dim() >= 2:
|
| 142 |
+
return torch.empty(
|
| 143 |
+
(seq_len, x.size(-1)), dtype=x.dtype, device=x.device
|
| 144 |
+
)
|
| 145 |
+
else:
|
| 146 |
+
return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
|
| 147 |
+
return original_scatter(x, indices, bin_ids, weights, bins, top_k)
|
| 148 |
+
|
| 149 |
+
ops.scatter = scatter_with_meta
|
| 150 |
+
|
| 151 |
+
# Patch ops.replicate
|
| 152 |
+
if hasattr(ops, "replicate"):
|
| 153 |
+
original_replicate = ops.replicate
|
| 154 |
+
|
| 155 |
+
def replicate_with_meta(x, bins, num_outputs):
|
| 156 |
+
if torch.compiler.is_compiling():
|
| 157 |
+
# Meta implementation
|
| 158 |
+
return torch.empty(
|
| 159 |
+
(x.shape[0], num_outputs), dtype=x.dtype, device=x.device
|
| 160 |
+
)
|
| 161 |
+
return original_replicate(x, bins, num_outputs)
|
| 162 |
+
|
| 163 |
+
ops.replicate = replicate_with_meta
|
| 164 |
+
|
| 165 |
+
# Patch ops.repeat (if it's a regular function)
|
| 166 |
+
if hasattr(ops, "repeat"):
|
| 167 |
+
original_repeat = ops.repeat
|
| 168 |
+
|
| 169 |
+
def repeat_with_meta(x, repeats):
|
| 170 |
+
if torch.compiler.is_compiling():
|
| 171 |
+
# Meta implementation
|
| 172 |
+
if isinstance(repeats, (tuple, list)):
|
| 173 |
+
new_shape = list(x.shape)
|
| 174 |
+
for i, rep in enumerate(repeats):
|
| 175 |
+
if i < len(new_shape):
|
| 176 |
+
new_shape[i] *= rep
|
| 177 |
+
return torch.empty(new_shape, dtype=x.dtype, device=x.device)
|
| 178 |
+
else:
|
| 179 |
+
new_shape = [x.size(0) * repeats] + list(x.shape[1:])
|
| 180 |
+
return torch.empty(new_shape, dtype=x.dtype, device=x.device)
|
| 181 |
+
return original_repeat(x, repeats)
|
| 182 |
+
|
| 183 |
+
ops.repeat = repeat_with_meta
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# Install meta kernels on import
|
| 187 |
+
try:
|
| 188 |
+
_install_meta_kernels()
|
| 189 |
+
except Exception as e:
|
| 190 |
+
# If meta kernel installation fails, continue without them
|
| 191 |
+
# torch.compile may not work but the library will still function
|
| 192 |
+
import warnings
|
| 193 |
+
|
| 194 |
+
warnings.warn(
|
| 195 |
+
f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
|
| 199 |
# Set the expert model parallel attributes on a tensor
|
| 200 |
def set_expert_model_parallel_attributes(
|
|
|
|
| 269 |
def route_tokens(
|
| 270 |
x: torch.Tensor,
|
| 271 |
router_weight: torch.Tensor,
|
| 272 |
+
router_bias: torch.Tensor,
|
| 273 |
moe_top_k: int,
|
| 274 |
moe_num_experts: int,
|
| 275 |
moe_jitter_eps: float = None,
|
|
|
|
| 281 |
x = apply_jitter(x, moe_jitter_eps)
|
| 282 |
|
| 283 |
x_flat = x.view(-1, x.shape[-1])
|
| 284 |
+
logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
|
| 285 |
expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
|
| 286 |
expert_weights = expert_weights.softmax(dim=-1)
|
| 287 |
if moe_normalize_expert_weights is not None:
|
|
|
|
| 319 |
w2_bias: torch.Tensor,
|
| 320 |
gradient_scale: Optional[float] = None,
|
| 321 |
alpha: float = 1.702,
|
| 322 |
+
limit: float = 7.0,
|
| 323 |
):
|
| 324 |
# Scale weights
|
| 325 |
w1 = scale_grad(w1, gradient_scale)
|
|
|
|
| 335 |
|
| 336 |
# Forward pass
|
| 337 |
gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
|
| 338 |
+
gate, up = gate_up[..., ::2], gate_up[..., 1::2]
|
| 339 |
+
gate = gate.clamp(min=None, max=limit)
|
| 340 |
+
up = up.clamp(min=-limit, max=limit)
|
| 341 |
glu = gate * torch.sigmoid(gate * alpha)
|
| 342 |
+
next_states = torch.bmm(((up + 1) * glu), w2)
|
| 343 |
+
next_states += w2_bias[..., None, :]
|
| 344 |
+
return next_states
|
|
|
|
| 345 |
|
| 346 |
# Shared expert MLP forward pass
|
| 347 |
def shared_mlp_forward(
|
|
|
|
| 375 |
|
| 376 |
# Up projection
|
| 377 |
x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
|
| 378 |
+
|
| 379 |
# Activation
|
| 380 |
x = activation_fn(x)
|
| 381 |
+
|
| 382 |
# Down projection
|
| 383 |
x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
|
| 384 |
+
|
| 385 |
return x
|
| 386 |
|
| 387 |
|
|
|
|
| 848 |
def moe_forward(
|
| 849 |
x: torch.Tensor,
|
| 850 |
router_weight: torch.Tensor,
|
| 851 |
+
router_bias: Optional[torch.Tensor],
|
| 852 |
moe_top_k: int,
|
| 853 |
moe_num_experts: int,
|
| 854 |
moe_jitter_eps: float = None,
|
|
|
|
| 874 |
logits, expert_weights, expert_indices = route_tokens(
|
| 875 |
x,
|
| 876 |
router_weight,
|
| 877 |
+
router_bias,
|
| 878 |
moe_top_k,
|
| 879 |
moe_num_experts,
|
| 880 |
moe_jitter_eps,
|
|
|
|
| 936 |
def moe_forward_with_shared_expert(
|
| 937 |
x: torch.Tensor,
|
| 938 |
router_weight: torch.Tensor,
|
| 939 |
+
router_bias: Optional[torch.Tensor],
|
| 940 |
moe_top_k: int,
|
| 941 |
moe_num_experts: int,
|
| 942 |
moe_jitter_eps: float = None,
|
|
|
|
| 969 |
expert_out, expert_weights, router_scores = moe_forward(
|
| 970 |
x=x,
|
| 971 |
router_weight=router_weight,
|
| 972 |
+
router_bias=router_bias,
|
| 973 |
moe_top_k=moe_top_k,
|
| 974 |
moe_num_experts=moe_num_experts,
|
| 975 |
moe_jitter_eps=moe_jitter_eps,
|
|
|
|
| 990 |
hidden_size=hidden_size,
|
| 991 |
mlp_impl=mlp_impl,
|
| 992 |
)
|
| 993 |
+
|
| 994 |
# If shared expert weights provided, compute shared expert output
|
| 995 |
if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
|
| 996 |
shared_expert_out = shared_mlp_forward(
|
|
|
|
| 1002 |
activation_fn=shared_activation_fn,
|
| 1003 |
gradient_scale=gradient_scale,
|
| 1004 |
)
|
| 1005 |
+
|
| 1006 |
# Combine expert outputs
|
| 1007 |
combined_out = combine_expert_shared_outputs(
|
| 1008 |
shared_expert_out=shared_expert_out,
|
|
|
|
| 1010 |
shared_expert_weighted_sum=shared_expert_weighted_sum,
|
| 1011 |
moe_top_k=moe_top_k,
|
| 1012 |
)
|
| 1013 |
+
|
| 1014 |
return combined_out, expert_weights, router_scores
|
| 1015 |
+
|
| 1016 |
# Return regular MoE output if no shared expert
|
| 1017 |
return expert_out, expert_weights, router_scores
|
| 1018 |
|
|
|
|
| 1028 |
|
| 1029 |
if output_layer_init_method is None:
|
| 1030 |
output_layer_init_method = init_method
|
| 1031 |
+
|
| 1032 |
# Create weight tensors
|
| 1033 |
up_proj_weight = torch.empty(
|
| 1034 |
shared_expert_hidden_size,
|
|
|
|
| 1042 |
device=device,
|
| 1043 |
dtype=dtype,
|
| 1044 |
)
|
| 1045 |
+
|
| 1046 |
# Initialize weights
|
| 1047 |
init_method(up_proj_weight)
|
| 1048 |
output_layer_init_method(down_proj_weight)
|
| 1049 |
+
|
| 1050 |
# No bias by default
|
| 1051 |
return up_proj_weight, down_proj_weight, None, None
|
| 1052 |
|
| 1053 |
+
|
| 1054 |
# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
|
| 1055 |
# This exists because device_mesh is trapped in hook closures with no model attribute
|
| 1056 |
# Fragile - breaks if hook structure changes or Python internals change
|
|
|
|
| 1059 |
# Extract device_mesh from child's unused pre_hook closure
|
| 1060 |
try:
|
| 1061 |
# Find the pre-hook that contains 'device_mesh' in its closure
|
| 1062 |
+
hook = next(
|
| 1063 |
+
h
|
| 1064 |
+
for h in model.experts._forward_pre_hooks.values()
|
| 1065 |
+
if "device_mesh" in h.__code__.co_freevars
|
| 1066 |
+
)
|
| 1067 |
# Extract the device_mesh from the closure
|
| 1068 |
+
return hook.__closure__[
|
| 1069 |
+
hook.__code__.co_freevars.index("device_mesh")
|
| 1070 |
+
].cell_contents
|
| 1071 |
except Exception:
|
| 1072 |
return None
|
| 1073 |
|
| 1074 |
|
| 1075 |
class MegaBlocksMoeMLP(torch.nn.Module):
|
| 1076 |
+
can_torch_compile: bool = True
|
| 1077 |
|
| 1078 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 1079 |
moe_top_k = getattr(self.router, "top_k", 4)
|
|
|
|
| 1082 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 1083 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 1084 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 1085 |
+
moe_normalize_expert_weights = getattr(
|
| 1086 |
+
self.experts, "normalize_expert_weights", None
|
| 1087 |
+
)
|
| 1088 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 1089 |
|
| 1090 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
|
|
| 1092 |
device_mesh = get_device_mesh(self)
|
| 1093 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 1094 |
|
| 1095 |
+
has_parallel = (
|
| 1096 |
+
expert_parallel_group is not None
|
| 1097 |
+
and dist.is_initialized()
|
| 1098 |
+
and dist.get_world_size(expert_parallel_group) > 1
|
| 1099 |
+
)
|
| 1100 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 1101 |
+
|
| 1102 |
+
sort_end_bit = max(
|
| 1103 |
+
int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
|
| 1104 |
+
)
|
| 1105 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
|
|
|
| 1106 |
output, expert_weights_out, *_ = moe_forward(
|
| 1107 |
x=x,
|
| 1108 |
router_weight=self.router.weight,
|
| 1109 |
+
router_bias=self.router.bias,
|
| 1110 |
moe_top_k=moe_top_k,
|
| 1111 |
moe_num_experts=moe_num_experts,
|
| 1112 |
moe_jitter_eps=moe_jitter_eps,
|
|
|
|
| 1130 |
return output, expert_weights_out
|
| 1131 |
|
| 1132 |
|
| 1133 |
+
# Export main classes
|
| 1134 |
+
__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
|
| 1135 |
+
|
| 1136 |
+
|
| 1137 |
class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
| 1138 |
+
|
| 1139 |
def __init__(self):
|
| 1140 |
super().__init__()
|
| 1141 |
# Shared expert weights will be set by the user
|
|
|
|
| 1145 |
self.shared_down_proj_bias = None
|
| 1146 |
self.shared_expert_weighted_sum = False
|
| 1147 |
self.shared_activation_fn = None
|
| 1148 |
+
|
| 1149 |
def set_shared_expert_weights(
|
| 1150 |
self,
|
| 1151 |
up_proj_weight: torch.Tensor,
|
|
|
|
| 1161 |
self.shared_down_proj_bias = down_proj_bias
|
| 1162 |
self.shared_expert_weighted_sum = weighted_sum
|
| 1163 |
self.shared_activation_fn = activation_fn
|
| 1164 |
+
|
| 1165 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 1166 |
moe_top_k = getattr(self.router, "top_k", 4)
|
| 1167 |
moe_num_experts = getattr(self.experts, "num_experts", 128)
|
|
|
|
| 1169 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 1170 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 1171 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 1172 |
+
moe_normalize_expert_weights = getattr(
|
| 1173 |
+
self.experts, "normalize_expert_weights", None
|
| 1174 |
+
)
|
| 1175 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 1176 |
|
| 1177 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
|
|
| 1179 |
device_mesh = get_device_mesh(self)
|
| 1180 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 1181 |
|
| 1182 |
+
has_parallel = (
|
| 1183 |
+
expert_parallel_group is not None
|
| 1184 |
+
and dist.is_initialized()
|
| 1185 |
+
and dist.get_world_size(expert_parallel_group) > 1
|
| 1186 |
+
)
|
| 1187 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 1188 |
+
|
| 1189 |
+
sort_end_bit = max(
|
| 1190 |
+
int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
|
| 1191 |
+
)
|
| 1192 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
| 1193 |
+
|
| 1194 |
output, expert_weights_out, *_ = moe_forward_with_shared_expert(
|
| 1195 |
x=x,
|
| 1196 |
router_weight=self.router.weight,
|
| 1197 |
+
router_bias=self.router.bias,
|
| 1198 |
moe_top_k=moe_top_k,
|
| 1199 |
moe_num_experts=moe_num_experts,
|
| 1200 |
moe_jitter_eps=moe_jitter_eps,
|
|
|
|
| 1222 |
shared_expert_weighted_sum=self.shared_expert_weighted_sum,
|
| 1223 |
shared_activation_fn=self.shared_activation_fn,
|
| 1224 |
)
|
| 1225 |
+
return output, expert_weights_out
|
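Every wrapper in the _install_meta_kernels block above follows the same pattern: keep a handle to the original kernel, and hand back shape-only empty tensors while torch.compile is tracing. A stripped-down sketch of that pattern, using a stand-in eager implementation instead of a real MegaBlocks kernel:

import torch

def _histogram_eager(x: torch.Tensor, max_val: int) -> torch.Tensor:
    # Stand-in for the compiled histogram kernel; any eager equivalent works here.
    return torch.bincount(x, minlength=max_val).to(torch.int32)

def _with_meta(eager_fn):
    # Same trick as the patched ops: under compilation, return an empty tensor
    # with the expected shape/dtype/device instead of calling into the kernel.
    def wrapper(x, max_val):
        if torch.compiler.is_compiling():
            return torch.empty((max_val,), dtype=torch.int32, device=x.device)
        return eager_fn(x, max_val)
    return wrapper

histogram = _with_meta(_histogram_eager)
print(histogram(torch.tensor([0, 1, 1, 3]), 4))  # tensor([1, 2, 0, 1], dtype=torch.int32)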
build/torch28-cxx11-cu126-x86_64-linux/megablocks/{_megablocks_3bdb4b8_dirty.abi3.so → _megablocks_8176cbe_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 1   version https://git-lfs.github.com/spec/v1
 2 - oid sha256:
 3   size 11817960
 1   version https://git-lfs.github.com/spec/v1
 2 + oid sha256:534a56ee5f5d1e8c1691a9644dcb42d54cbd8c41f2f29d13ff01674ef50661a7
 3   size 11817960
build/torch28-cxx11-cu126-x86_64-linux/megablocks/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 1   import torch
 2 - from . import
 3 - ops = torch.ops.
 4
 5   def add_op_namespace_prefix(op_name: str):
 6       """
 7       Prefix op by namespace.
 8       """
 9 -     return f"
 1   import torch
 2 + from . import _megablocks_8176cbe_dirty
 3 + ops = torch.ops._megablocks_8176cbe_dirty
 4
 5   def add_op_namespace_prefix(op_name: str):
 6       """
 7       Prefix op by namespace.
 8       """
 9 +     return f"_megablocks_8176cbe_dirty::{op_name}"
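Both forward() methods in the layers.py diffs derive sort_end_bit from the expert count before dispatching to the sort kernel exposed through this namespace. The expression is just "how many bits are needed to radix-sort expert indices"; a plain-Python check of the same arithmetic, no MegaBlocks import required:

import math

def sort_end_bit(moe_num_experts: int) -> int:
    # Mirrors max(int(torch.ceil(torch.log2(torch.tensor(n)))), 1) from layers.py.
    return max(int(math.ceil(math.log2(moe_num_experts))), 1)

assert sort_end_bit(2) == 1     # expert ids 0..1 fit in 1 bit
assert sort_end_bit(128) == 7   # expert ids 0..127 fit in 7 bits
assert sort_end_bit(130) == 8   # non-powers-of-two round up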
build/torch28-cxx11-cu126-x86_64-linux/megablocks/layers.py
CHANGED
@@ -1,11 +1,200 @@
 1   import torch
 2   import torch.distributed as dist
 3
 4 - from typing import Optional, Any
 5
 6   from . import _layers
 7   from . import ops
 8
| 9 |
|
| 10 |
# Set the expert model parallel attributes on a tensor
|
| 11 |
def set_expert_model_parallel_attributes(
|
|
@@ -80,6 +269,7 @@ def compute_top_k(scores: torch.Tensor, moe_top_k: int):
|
|
| 80 |
def route_tokens(
|
| 81 |
x: torch.Tensor,
|
| 82 |
router_weight: torch.Tensor,
|
|
|
|
| 83 |
moe_top_k: int,
|
| 84 |
moe_num_experts: int,
|
| 85 |
moe_jitter_eps: float = None,
|
|
@@ -91,7 +281,7 @@ def route_tokens(
|
|
| 91 |
x = apply_jitter(x, moe_jitter_eps)
|
| 92 |
|
| 93 |
x_flat = x.view(-1, x.shape[-1])
|
| 94 |
-
logits = torch.nn.functional.linear(x_flat, router_weight)
|
| 95 |
expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
|
| 96 |
expert_weights = expert_weights.softmax(dim=-1)
|
| 97 |
if moe_normalize_expert_weights is not None:
|
|
@@ -129,6 +319,7 @@ def mlp_forward(
|
|
| 129 |
w2_bias: torch.Tensor,
|
| 130 |
gradient_scale: Optional[float] = None,
|
| 131 |
alpha: float = 1.702,
|
|
|
|
| 132 |
):
|
| 133 |
# Scale weights
|
| 134 |
w1 = scale_grad(w1, gradient_scale)
|
|
@@ -144,13 +335,13 @@ def mlp_forward(
|
|
| 144 |
|
| 145 |
# Forward pass
|
| 146 |
gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
|
| 147 |
-
gate, up = gate_up
|
| 148 |
-
|
|
|
|
| 149 |
glu = gate * torch.sigmoid(gate * alpha)
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
return
|
| 153 |
-
|
| 154 |
|
| 155 |
# Shared expert MLP forward pass
|
| 156 |
def shared_mlp_forward(
|
|
@@ -184,13 +375,13 @@ def shared_mlp_forward(
|
|
| 184 |
|
| 185 |
# Up projection
|
| 186 |
x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
|
| 187 |
-
|
| 188 |
# Activation
|
| 189 |
x = activation_fn(x)
|
| 190 |
-
|
| 191 |
# Down projection
|
| 192 |
x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
|
| 193 |
-
|
| 194 |
return x
|
| 195 |
|
| 196 |
|
|
@@ -657,6 +848,7 @@ def parallel_forward_once(
|
|
| 657 |
def moe_forward(
|
| 658 |
x: torch.Tensor,
|
| 659 |
router_weight: torch.Tensor,
|
|
|
|
| 660 |
moe_top_k: int,
|
| 661 |
moe_num_experts: int,
|
| 662 |
moe_jitter_eps: float = None,
|
|
@@ -682,6 +874,7 @@ def moe_forward(
|
|
| 682 |
logits, expert_weights, expert_indices = route_tokens(
|
| 683 |
x,
|
| 684 |
router_weight,
|
|
|
|
| 685 |
moe_top_k,
|
| 686 |
moe_num_experts,
|
| 687 |
moe_jitter_eps,
|
|
@@ -743,6 +936,7 @@ def moe_forward(
|
|
| 743 |
def moe_forward_with_shared_expert(
|
| 744 |
x: torch.Tensor,
|
| 745 |
router_weight: torch.Tensor,
|
|
|
|
| 746 |
moe_top_k: int,
|
| 747 |
moe_num_experts: int,
|
| 748 |
moe_jitter_eps: float = None,
|
|
@@ -775,6 +969,7 @@ def moe_forward_with_shared_expert(
|
|
| 775 |
expert_out, expert_weights, router_scores = moe_forward(
|
| 776 |
x=x,
|
| 777 |
router_weight=router_weight,
|
|
|
|
| 778 |
moe_top_k=moe_top_k,
|
| 779 |
moe_num_experts=moe_num_experts,
|
| 780 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -795,7 +990,7 @@ def moe_forward_with_shared_expert(
|
|
| 795 |
hidden_size=hidden_size,
|
| 796 |
mlp_impl=mlp_impl,
|
| 797 |
)
|
| 798 |
-
|
| 799 |
# If shared expert weights provided, compute shared expert output
|
| 800 |
if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
|
| 801 |
shared_expert_out = shared_mlp_forward(
|
|
@@ -807,7 +1002,7 @@ def moe_forward_with_shared_expert(
|
|
| 807 |
activation_fn=shared_activation_fn,
|
| 808 |
gradient_scale=gradient_scale,
|
| 809 |
)
|
| 810 |
-
|
| 811 |
# Combine expert outputs
|
| 812 |
combined_out = combine_expert_shared_outputs(
|
| 813 |
shared_expert_out=shared_expert_out,
|
|
@@ -815,9 +1010,9 @@ def moe_forward_with_shared_expert(
|
|
| 815 |
shared_expert_weighted_sum=shared_expert_weighted_sum,
|
| 816 |
moe_top_k=moe_top_k,
|
| 817 |
)
|
| 818 |
-
|
| 819 |
return combined_out, expert_weights, router_scores
|
| 820 |
-
|
| 821 |
# Return regular MoE output if no shared expert
|
| 822 |
return expert_out, expert_weights, router_scores
|
| 823 |
|
|
@@ -833,7 +1028,7 @@ def create_shared_expert_weights(
|
|
| 833 |
|
| 834 |
if output_layer_init_method is None:
|
| 835 |
output_layer_init_method = init_method
|
| 836 |
-
|
| 837 |
# Create weight tensors
|
| 838 |
up_proj_weight = torch.empty(
|
| 839 |
shared_expert_hidden_size,
|
|
@@ -847,14 +1042,15 @@ def create_shared_expert_weights(
|
|
| 847 |
device=device,
|
| 848 |
dtype=dtype,
|
| 849 |
)
|
| 850 |
-
|
| 851 |
# Initialize weights
|
| 852 |
init_method(up_proj_weight)
|
| 853 |
output_layer_init_method(down_proj_weight)
|
| 854 |
-
|
| 855 |
# No bias by default
|
| 856 |
return up_proj_weight, down_proj_weight, None, None
|
| 857 |
|
|
|
|
| 858 |
# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
|
| 859 |
# This exists because device_mesh is trapped in hook closures with no model attribute
|
| 860 |
# Fragile - breaks if hook structure changes or Python internals change
|
|
@@ -863,14 +1059,21 @@ def get_device_mesh(model):
|
|
| 863 |
# Extract device_mesh from child's unused pre_hook closure
|
| 864 |
try:
|
| 865 |
# Find the pre-hook that contains 'device_mesh' in its closure
|
| 866 |
-
hook = next(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 867 |
# Extract the device_mesh from the closure
|
| 868 |
-
return hook.__closure__[
|
|
|
|
|
|
|
| 869 |
except Exception:
|
| 870 |
return None
|
| 871 |
|
| 872 |
|
| 873 |
class MegaBlocksMoeMLP(torch.nn.Module):
|
|
|
|
| 874 |
|
| 875 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 876 |
moe_top_k = getattr(self.router, "top_k", 4)
|
|
@@ -879,7 +1082,9 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 879 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 880 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 881 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 882 |
-
moe_normalize_expert_weights = getattr(
|
|
|
|
|
|
|
| 883 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 884 |
|
| 885 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
@@ -887,15 +1092,21 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 887 |
device_mesh = get_device_mesh(self)
|
| 888 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 889 |
|
| 890 |
-
has_parallel =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 891 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 892 |
-
|
| 893 |
-
sort_end_bit = max(
|
|
|
|
|
|
|
| 894 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
| 895 |
-
|
| 896 |
output, expert_weights_out, *_ = moe_forward(
|
| 897 |
x=x,
|
| 898 |
router_weight=self.router.weight,
|
|
|
|
| 899 |
moe_top_k=moe_top_k,
|
| 900 |
moe_num_experts=moe_num_experts,
|
| 901 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -919,8 +1130,12 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 919 |
return output, expert_weights_out
|
| 920 |
|
| 921 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 922 |
class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
| 923 |
-
|
| 924 |
def __init__(self):
|
| 925 |
super().__init__()
|
| 926 |
# Shared expert weights will be set by the user
|
|
@@ -930,7 +1145,7 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 930 |
self.shared_down_proj_bias = None
|
| 931 |
self.shared_expert_weighted_sum = False
|
| 932 |
self.shared_activation_fn = None
|
| 933 |
-
|
| 934 |
def set_shared_expert_weights(
|
| 935 |
self,
|
| 936 |
up_proj_weight: torch.Tensor,
|
|
@@ -946,7 +1161,7 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 946 |
self.shared_down_proj_bias = down_proj_bias
|
| 947 |
self.shared_expert_weighted_sum = weighted_sum
|
| 948 |
self.shared_activation_fn = activation_fn
|
| 949 |
-
|
| 950 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 951 |
moe_top_k = getattr(self.router, "top_k", 4)
|
| 952 |
moe_num_experts = getattr(self.experts, "num_experts", 128)
|
|
@@ -954,7 +1169,9 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 954 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 955 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 956 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 957 |
-
moe_normalize_expert_weights = getattr(
|
|
|
|
|
|
|
| 958 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 959 |
|
| 960 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
@@ -962,15 +1179,22 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 962 |
device_mesh = get_device_mesh(self)
|
| 963 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 964 |
|
| 965 |
-
has_parallel =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 967 |
-
|
| 968 |
-
sort_end_bit = max(
|
|
|
|
|
|
|
| 969 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
| 970 |
-
|
| 971 |
output, expert_weights_out, *_ = moe_forward_with_shared_expert(
|
| 972 |
x=x,
|
| 973 |
router_weight=self.router.weight,
|
|
|
|
| 974 |
moe_top_k=moe_top_k,
|
| 975 |
moe_num_experts=moe_num_experts,
|
| 976 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -998,4 +1222,4 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 998 |
shared_expert_weighted_sum=self.shared_expert_weighted_sum,
|
| 999 |
shared_activation_fn=self.shared_activation_fn,
|
| 1000 |
)
|
| 1001 |
-
return output, expert_weights_out
|
|
|
|
| 1 |
import torch
|
| 2 |
import torch.distributed as dist
|
| 3 |
|
| 4 |
+
from typing import Optional, Any, TYPE_CHECKING
|
| 5 |
|
| 6 |
from . import _layers
|
| 7 |
from . import ops
|
| 8 |
|
| 9 |
+
# Conditional import for meta kernel registration
|
| 10 |
+
if TYPE_CHECKING:
|
| 11 |
+
|
| 12 |
+
def register_fake(fn):
|
| 13 |
+
return lambda name: fn
|
| 14 |
+
|
| 15 |
+
else:
|
| 16 |
+
try:
|
| 17 |
+
from torch.library import register_fake
|
| 18 |
+
except ImportError:
|
| 19 |
+
try:
|
| 20 |
+
from torch.library import impl_abstract as register_fake
|
| 21 |
+
except ImportError:
|
| 22 |
+
# Fallback for older PyTorch versions
|
| 23 |
+
def register_fake(op_name):
|
| 24 |
+
def decorator(fn):
|
| 25 |
+
return fn
|
| 26 |
+
|
| 27 |
+
return decorator
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Meta kernel implementations for torch.compile compatibility
|
| 31 |
+
def _install_meta_kernels():
|
| 32 |
+
"""Install meta kernels for existing MegaBlocks operations"""
|
| 33 |
+
|
| 34 |
+
# Create wrapper functions that check for compilation and return meta tensors
|
| 35 |
+
|
| 36 |
+
# Patch ops.sort
|
| 37 |
+
if hasattr(ops, "sort"):
|
| 38 |
+
original_sort = ops.sort
|
| 39 |
+
|
| 40 |
+
def sort_with_meta(x, end_bit=None):
|
| 41 |
+
if torch.compiler.is_compiling():
|
| 42 |
+
print("Using meta kernel for sort")
|
| 43 |
+
# Meta implementation - return tensors with correct shape/dtype/device
|
| 44 |
+
return torch.empty_like(x), torch.empty_like(x)
|
| 45 |
+
# print("Using original sort kernel")
|
| 46 |
+
return original_sort(x, end_bit)
|
| 47 |
+
|
| 48 |
+
ops.sort = sort_with_meta
|
| 49 |
+
|
| 50 |
+
# Patch ops.histogram
|
| 51 |
+
if hasattr(ops, "histogram"):
|
| 52 |
+
original_histogram = ops.histogram
|
| 53 |
+
|
| 54 |
+
def histogram_with_meta(x, max_val):
|
| 55 |
+
if torch.compiler.is_compiling():
|
| 56 |
+
# Meta implementation
|
| 57 |
+
return torch.empty((max_val,), dtype=torch.int32, device=x.device)
|
| 58 |
+
return original_histogram(x, max_val)
|
| 59 |
+
|
| 60 |
+
ops.histogram = histogram_with_meta
|
| 61 |
+
|
| 62 |
+
# Patch ops.inclusive_cumsum
|
| 63 |
+
if hasattr(ops, "inclusive_cumsum"):
|
| 64 |
+
original_inclusive_cumsum = ops.inclusive_cumsum
|
| 65 |
+
|
| 66 |
+
def inclusive_cumsum_with_meta(x, dim):
|
| 67 |
+
if torch.compiler.is_compiling():
|
| 68 |
+
# Meta implementation
|
| 69 |
+
return torch.empty_like(x)
|
| 70 |
+
return original_inclusive_cumsum(x, dim)
|
| 71 |
+
|
| 72 |
+
ops.inclusive_cumsum = inclusive_cumsum_with_meta
|
| 73 |
+
|
| 74 |
+
# Patch ops.binned_gather
|
| 75 |
+
if hasattr(ops, "binned_gather"):
|
| 76 |
+
original_binned_gather = ops.binned_gather
|
| 77 |
+
|
| 78 |
+
def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
|
| 79 |
+
if torch.compiler.is_compiling():
|
| 80 |
+
# Meta implementation - output shape based on bin_size
|
| 81 |
+
if x.dim() >= 2:
|
| 82 |
+
hidden_size = x.size(-1)
|
| 83 |
+
return torch.empty(
|
| 84 |
+
(bin_size, x.size(1), hidden_size),
|
| 85 |
+
dtype=x.dtype,
|
| 86 |
+
device=x.device,
|
| 87 |
+
)
|
| 88 |
+
else:
|
| 89 |
+
return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
|
| 90 |
+
return original_binned_gather(x, indices, bins, bin_size, top_k)
|
| 91 |
+
|
| 92 |
+
ops.binned_gather = binned_gather_with_meta
|
| 93 |
+
|
| 94 |
+
# Patch ops.binned_scatter
|
| 95 |
+
if hasattr(ops, "binned_scatter"):
|
| 96 |
+
original_binned_scatter = ops.binned_scatter
|
| 97 |
+
|
| 98 |
+
def binned_scatter_with_meta(x, indices, weights, bins, top_k):
|
| 99 |
+
if torch.compiler.is_compiling():
|
| 100 |
+
# Meta implementation - typically reduces to 2D
|
| 101 |
+
if x.dim() >= 3:
|
| 102 |
+
return torch.empty(
|
| 103 |
+
(x.size(1), x.size(2)), dtype=x.dtype, device=x.device
|
| 104 |
+
)
|
| 105 |
+
else:
|
| 106 |
+
return torch.empty_like(x)
|
| 107 |
+
return original_binned_scatter(x, indices, weights, bins, top_k)
|
| 108 |
+
|
| 109 |
+
ops.binned_scatter = binned_scatter_with_meta
|
| 110 |
+
|
| 111 |
+
# Patch ops.gather
|
| 112 |
+
if hasattr(ops, "gather"):
|
| 113 |
+
original_gather = ops.gather
|
| 114 |
+
|
| 115 |
+
def gather_with_meta(x, indices, bin_ids, bins, top_k):
|
| 116 |
+
if torch.compiler.is_compiling():
|
| 117 |
+
# Meta implementation
|
| 118 |
+
if x.dim() >= 2:
|
| 119 |
+
hidden_size = x.size(-1)
|
| 120 |
+
return torch.empty(
|
| 121 |
+
(indices.numel(), hidden_size), dtype=x.dtype, device=x.device
|
| 122 |
+
)
|
| 123 |
+
else:
|
| 124 |
+
return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
|
| 125 |
+
return original_gather(x, indices, bin_ids, bins, top_k)
|
| 126 |
+
|
| 127 |
+
ops.gather = gather_with_meta
|
| 128 |
+
|
| 129 |
+
# Patch ops.scatter
|
| 130 |
+
if hasattr(ops, "scatter"):
|
| 131 |
+
original_scatter = ops.scatter
|
| 132 |
+
|
| 133 |
+
def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
|
| 134 |
+
if torch.compiler.is_compiling():
|
| 135 |
+
# Meta implementation - restore sequence shape
|
| 136 |
+
seq_len = (
|
| 137 |
+
indices.size(0) // top_k
|
| 138 |
+
if indices.numel() > 0 and top_k > 0
|
| 139 |
+
else x.size(0)
|
| 140 |
+
)
|
| 141 |
+
if x.dim() >= 2:
|
| 142 |
+
return torch.empty(
|
| 143 |
+
(seq_len, x.size(-1)), dtype=x.dtype, device=x.device
|
| 144 |
+
)
|
| 145 |
+
else:
|
| 146 |
+
return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
|
| 147 |
+
return original_scatter(x, indices, bin_ids, weights, bins, top_k)
|
| 148 |
+
|
| 149 |
+
ops.scatter = scatter_with_meta
|
| 150 |
+
|
| 151 |
+
# Patch ops.replicate
|
| 152 |
+
if hasattr(ops, "replicate"):
|
| 153 |
+
original_replicate = ops.replicate
|
| 154 |
+
|
| 155 |
+
def replicate_with_meta(x, bins, num_outputs):
|
| 156 |
+
if torch.compiler.is_compiling():
|
| 157 |
+
# Meta implementation
|
| 158 |
+
return torch.empty(
|
| 159 |
+
(x.shape[0], num_outputs), dtype=x.dtype, device=x.device
|
| 160 |
+
)
|
| 161 |
+
return original_replicate(x, bins, num_outputs)
|
| 162 |
+
|
| 163 |
+
ops.replicate = replicate_with_meta
|
| 164 |
+
|
| 165 |
+
# Patch ops.repeat (if it's a regular function)
|
| 166 |
+
if hasattr(ops, "repeat"):
|
| 167 |
+
original_repeat = ops.repeat
|
| 168 |
+
|
| 169 |
+
def repeat_with_meta(x, repeats):
|
| 170 |
+
if torch.compiler.is_compiling():
|
| 171 |
+
# Meta implementation
|
| 172 |
+
if isinstance(repeats, (tuple, list)):
|
| 173 |
+
new_shape = list(x.shape)
|
| 174 |
+
for i, rep in enumerate(repeats):
|
| 175 |
+
if i < len(new_shape):
|
| 176 |
+
new_shape[i] *= rep
|
| 177 |
+
return torch.empty(new_shape, dtype=x.dtype, device=x.device)
|
| 178 |
+
else:
|
| 179 |
+
new_shape = [x.size(0) * repeats] + list(x.shape[1:])
|
| 180 |
+
return torch.empty(new_shape, dtype=x.dtype, device=x.device)
|
| 181 |
+
return original_repeat(x, repeats)
|
| 182 |
+
|
| 183 |
+
ops.repeat = repeat_with_meta
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# Install meta kernels on import
|
| 187 |
+
try:
|
| 188 |
+
_install_meta_kernels()
|
| 189 |
+
except Exception as e:
|
| 190 |
+
# If meta kernel installation fails, continue without them
|
| 191 |
+
# torch.compile may not work but the library will still function
|
| 192 |
+
import warnings
|
| 193 |
+
|
| 194 |
+
warnings.warn(
|
| 195 |
+
f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
|
| 199 |
# Set the expert model parallel attributes on a tensor
|
| 200 |
def set_expert_model_parallel_attributes(
|
|
|
|
| 269 |
def route_tokens(
|
| 270 |
x: torch.Tensor,
|
| 271 |
router_weight: torch.Tensor,
|
| 272 |
+
router_bias: torch.Tensor,
|
| 273 |
moe_top_k: int,
|
| 274 |
moe_num_experts: int,
|
| 275 |
moe_jitter_eps: float = None,
|
|
|
|
| 281 |
x = apply_jitter(x, moe_jitter_eps)
|
| 282 |
|
| 283 |
x_flat = x.view(-1, x.shape[-1])
|
| 284 |
+
logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
|
| 285 |
expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
|
| 286 |
expert_weights = expert_weights.softmax(dim=-1)
|
| 287 |
if moe_normalize_expert_weights is not None:
|
|
|
|
| 319 |
w2_bias: torch.Tensor,
|
| 320 |
gradient_scale: Optional[float] = None,
|
| 321 |
alpha: float = 1.702,
|
| 322 |
+
limit: float = 7.0,
|
| 323 |
):
|
| 324 |
# Scale weights
|
| 325 |
w1 = scale_grad(w1, gradient_scale)
|
|
|
|
| 335 |
|
| 336 |
# Forward pass
|
| 337 |
gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
|
| 338 |
+
gate, up = gate_up[..., ::2], gate_up[..., 1::2]
|
| 339 |
+
gate = gate.clamp(min=None, max=limit)
|
| 340 |
+
up = up.clamp(min=-limit, max=limit)
|
| 341 |
glu = gate * torch.sigmoid(gate * alpha)
|
| 342 |
+
next_states = torch.bmm(((up + 1) * glu), w2)
|
| 343 |
+
next_states += w2_bias[..., None, :]
|
| 344 |
+
return next_states
|
|
|
|
| 345 |
|
| 346 |
# Shared expert MLP forward pass
|
| 347 |
def shared_mlp_forward(
|
|
|
|
| 375 |
|
| 376 |
# Up projection
|
| 377 |
x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
|
| 378 |
+
|
| 379 |
# Activation
|
| 380 |
x = activation_fn(x)
|
| 381 |
+
|
| 382 |
# Down projection
|
| 383 |
x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
|
| 384 |
+
|
| 385 |
return x
|
| 386 |
|
| 387 |
|
|
|
|
| 848 |
def moe_forward(
|
| 849 |
x: torch.Tensor,
|
| 850 |
router_weight: torch.Tensor,
|
| 851 |
+
router_bias: Optional[torch.Tensor],
|
| 852 |
moe_top_k: int,
|
| 853 |
moe_num_experts: int,
|
| 854 |
moe_jitter_eps: float = None,
|
|
|
|
| 874 |
logits, expert_weights, expert_indices = route_tokens(
|
| 875 |
x,
|
| 876 |
router_weight,
|
| 877 |
+
router_bias,
|
| 878 |
moe_top_k,
|
| 879 |
moe_num_experts,
|
| 880 |
moe_jitter_eps,
|
|
|
|
| 936 |
def moe_forward_with_shared_expert(
|
| 937 |
x: torch.Tensor,
|
| 938 |
router_weight: torch.Tensor,
|
| 939 |
+
router_bias: Optional[torch.Tensor],
|
| 940 |
moe_top_k: int,
|
| 941 |
moe_num_experts: int,
|
| 942 |
moe_jitter_eps: float = None,
|
|
|
|
| 969 |
expert_out, expert_weights, router_scores = moe_forward(
|
| 970 |
x=x,
|
| 971 |
router_weight=router_weight,
|
| 972 |
+
router_bias=router_bias,
|
| 973 |
moe_top_k=moe_top_k,
|
| 974 |
moe_num_experts=moe_num_experts,
|
| 975 |
moe_jitter_eps=moe_jitter_eps,
|
|
|
|
| 990 |
hidden_size=hidden_size,
|
| 991 |
mlp_impl=mlp_impl,
|
| 992 |
)
|
| 993 |
+
|
| 994 |
# If shared expert weights provided, compute shared expert output
|
 995    if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
 996    shared_expert_out = shared_mlp_forward(
1002    activation_fn=shared_activation_fn,
1003    gradient_scale=gradient_scale,
1004    )
1005 +
1006    # Combine expert outputs
1007    combined_out = combine_expert_shared_outputs(
1008    shared_expert_out=shared_expert_out,
1010    shared_expert_weighted_sum=shared_expert_weighted_sum,
1011    moe_top_k=moe_top_k,
1012    )
1013 +
1014    return combined_out, expert_weights, router_scores
1015 +
1016    # Return regular MoE output if no shared expert
1017    return expert_out, expert_weights, router_scores
1018
1028
1029    if output_layer_init_method is None:
1030    output_layer_init_method = init_method
1031 +
1032    # Create weight tensors
1033    up_proj_weight = torch.empty(
1034    shared_expert_hidden_size,
1042    device=device,
1043    dtype=dtype,
1044    )
1045 +
1046    # Initialize weights
1047    init_method(up_proj_weight)
1048    output_layer_init_method(down_proj_weight)
1049 +
1050    # No bias by default
1051    return up_proj_weight, down_proj_weight, None, None
1052
1053 +
1054    # HACK: Extract device_mesh from pre-hook closure - required for transformers integration
1055    # This exists because device_mesh is trapped in hook closures with no model attribute
1056    # Fragile - breaks if hook structure changes or Python internals change
1059    # Extract device_mesh from child's unused pre_hook closure
1060    try:
1061    # Find the pre-hook that contains 'device_mesh' in its closure
1062 +  hook = next(
1063 +  h
1064 +  for h in model.experts._forward_pre_hooks.values()
1065 +  if "device_mesh" in h.__code__.co_freevars
1066 +  )
1067    # Extract the device_mesh from the closure
1068 +  return hook.__closure__[
1069 +  hook.__code__.co_freevars.index("device_mesh")
1070 +  ].cell_contents
1071    except Exception:
1072    return None
1073
1074
1075    class MegaBlocksMoeMLP(torch.nn.Module):
1076 +  can_torch_compile: bool = True
1077
1078    def forward(self, x: torch.Tensor) -> torch.Tensor:
1079    moe_top_k = getattr(self.router, "top_k", 4)
1082    alpha = getattr(self.experts, "alpha", 1.0)
1083    moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
1084    moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
1085 +  moe_normalize_expert_weights = getattr(
1086 +  self.experts, "normalize_expert_weights", None
1087 +  )
1088    uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
1089
1090    expert_parallel_group = getattr(self, "expert_parallel_group", None)
1092    device_mesh = get_device_mesh(self)
1093    expert_parallel_group = device_mesh.get_group() if device_mesh else None
1094
1095 +  has_parallel = (
1096 +  expert_parallel_group is not None
1097 +  and dist.is_initialized()
1098 +  and dist.get_world_size(expert_parallel_group) > 1
1099 +  )
1100    forward_fn = parallel_forward_once if has_parallel else forward_once
1101 +
1102 +  sort_end_bit = max(
1103 +  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
1104 +  )
1105    mlp_impl = getattr(self, "mlp_impl", "grouped")
1106    output, expert_weights_out, *_ = moe_forward(
1107    x=x,
1108    router_weight=self.router.weight,
1109 +  router_bias=self.router.bias,
1110    moe_top_k=moe_top_k,
1111    moe_num_experts=moe_num_experts,
1112    moe_jitter_eps=moe_jitter_eps,
1130    return output, expert_weights_out
1131
1132
1133 +  # Export main classes
1134 +  __all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
1135 +
1136 +
1137    class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
1138 +
1139    def __init__(self):
1140    super().__init__()
1141    # Shared expert weights will be set by the user
1145    self.shared_down_proj_bias = None
1146    self.shared_expert_weighted_sum = False
1147    self.shared_activation_fn = None
1148 +
1149    def set_shared_expert_weights(
1150    self,
1151    up_proj_weight: torch.Tensor,
1161    self.shared_down_proj_bias = down_proj_bias
1162    self.shared_expert_weighted_sum = weighted_sum
1163    self.shared_activation_fn = activation_fn
1164 +
1165    def forward(self, x: torch.Tensor) -> torch.Tensor:
1166    moe_top_k = getattr(self.router, "top_k", 4)
1167    moe_num_experts = getattr(self.experts, "num_experts", 128)
1169    alpha = getattr(self.experts, "alpha", 1.0)
1170    moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
1171    moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
1172 +  moe_normalize_expert_weights = getattr(
1173 +  self.experts, "normalize_expert_weights", None
1174 +  )
1175    uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
1176
1177    expert_parallel_group = getattr(self, "expert_parallel_group", None)
1179    device_mesh = get_device_mesh(self)
1180    expert_parallel_group = device_mesh.get_group() if device_mesh else None
1181
1182 +  has_parallel = (
1183 +  expert_parallel_group is not None
1184 +  and dist.is_initialized()
1185 +  and dist.get_world_size(expert_parallel_group) > 1
1186 +  )
1187    forward_fn = parallel_forward_once if has_parallel else forward_once
1188 +
1189 +  sort_end_bit = max(
1190 +  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
1191 +  )
1192    mlp_impl = getattr(self, "mlp_impl", "grouped")
1193 +
1194    output, expert_weights_out, *_ = moe_forward_with_shared_expert(
1195    x=x,
1196    router_weight=self.router.weight,
1197 +  router_bias=self.router.bias,
1198    moe_top_k=moe_top_k,
1199    moe_num_experts=moe_num_experts,
1200    moe_jitter_eps=moe_jitter_eps,
1222    shared_expert_weighted_sum=self.shared_expert_weighted_sum,
1223    shared_activation_fn=self.shared_activation_fn,
1224    )
1225 +  return output, expert_weights_out
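A quick note on the `sort_end_bit` value introduced in both forward passes above: it is the number of bits of the expert index the radix sort has to inspect, i.e. ceil(log2(moe_num_experts)) clamped to at least 1. A minimal sketch of the same arithmetic in plain Python (the helper name `sort_end_bit_for` is ours, purely for illustration):

    import math

    def sort_end_bit_for(moe_num_experts: int) -> int:
        # Bits needed to represent expert ids 0..moe_num_experts-1,
        # clamped to at least 1 so the sort always scans one bit.
        return max(math.ceil(math.log2(moe_num_experts)), 1)

    # Examples: 128 experts -> 7 bits, 4 experts -> 2 bits, 1 expert -> 1 bit.
    assert sort_end_bit_for(128) == 7
    assert sort_end_bit_for(4) == 2
    assert sort_end_bit_for(1) == 1

For realistic expert counts this should match the tensor-based expression in the diff; the `torch.tensor` round-trip there simply keeps the computation inside torch.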
build/torch28-cxx11-cu128-x86_64-linux/megablocks/{_megablocks_3bdb4b8_dirty.abi3.so → _megablocks_8176cbe_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 1    version https://git-lfs.github.com/spec/v1
 2 -  oid sha256:
 2 +  oid sha256:c075d3398cea481296a0a8b47444fc49a4c984dac991abf5bd3577bc9edf1b71
 3    size 17770912
build/torch28-cxx11-cu128-x86_64-linux/megablocks/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 1    import torch
 2 -  from . import
 3 -  ops = torch.ops.
 2 +  from . import _megablocks_8176cbe_dirty
 3 +  ops = torch.ops._megablocks_8176cbe_dirty
 4
 5    def add_op_namespace_prefix(op_name: str):
 6    """
 7    Prefix op by namespace.
 8    """
 9 -  return f"
 9 +  return f"_megablocks_8176cbe_dirty::{op_name}"
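As in the other build directories, `_ops.py` simply re-points the Python shim at the rebuilt extension: importing `_megablocks_8176cbe_dirty` registers its custom ops, which then resolve under `torch.ops._megablocks_8176cbe_dirty`, and `add_op_namespace_prefix` produces the fully qualified `namespace::op` string used when attaching schemas or fake/meta implementations. A hedged sketch of how such a prefixed name is typically consumed (`my_op` is a placeholder, not an op MegaBlocks actually defines):

    import torch

    def add_op_namespace_prefix(op_name: str) -> str:
        # Same shape as the helper above: "<extension namespace>::<op name>".
        return f"_megablocks_8176cbe_dirty::{op_name}"

    # A fully qualified name like this is what torch.library APIs expect; e.g.
    # torch.library.register_fake(add_op_namespace_prefix("my_op"), meta_fn)
    # would attach a shape-only implementation to the hypothetical op "my_op".
    print(add_op_namespace_prefix("my_op"))  # _megablocks_8176cbe_dirty::my_op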
build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py
CHANGED
|
@@ -1,11 +1,200 @@
|
|
| 1 |
import torch
|
| 2 |
import torch.distributed as dist
|
| 3 |
|
| 4 |
-
from typing import Optional, Any
|
| 5 |
|
| 6 |
from . import _layers
|
| 7 |
from . import ops
|
| 8 |
|
| 9 |
|
| 10 |
# Set the expert model parallel attributes on a tensor
|
| 11 |
def set_expert_model_parallel_attributes(
|
|
@@ -80,6 +269,7 @@ def compute_top_k(scores: torch.Tensor, moe_top_k: int):
|
|
| 80 |
def route_tokens(
|
| 81 |
x: torch.Tensor,
|
| 82 |
router_weight: torch.Tensor,
|
|
|
|
| 83 |
moe_top_k: int,
|
| 84 |
moe_num_experts: int,
|
| 85 |
moe_jitter_eps: float = None,
|
|
@@ -91,7 +281,7 @@ def route_tokens(
|
|
| 91 |
x = apply_jitter(x, moe_jitter_eps)
|
| 92 |
|
| 93 |
x_flat = x.view(-1, x.shape[-1])
|
| 94 |
-
logits = torch.nn.functional.linear(x_flat, router_weight)
|
| 95 |
expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
|
| 96 |
expert_weights = expert_weights.softmax(dim=-1)
|
| 97 |
if moe_normalize_expert_weights is not None:
|
|
@@ -129,6 +319,7 @@ def mlp_forward(
|
|
| 129 |
w2_bias: torch.Tensor,
|
| 130 |
gradient_scale: Optional[float] = None,
|
| 131 |
alpha: float = 1.702,
|
|
|
|
| 132 |
):
|
| 133 |
# Scale weights
|
| 134 |
w1 = scale_grad(w1, gradient_scale)
|
|
@@ -144,13 +335,13 @@ def mlp_forward(
|
|
| 144 |
|
| 145 |
# Forward pass
|
| 146 |
gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
|
| 147 |
-
gate, up = gate_up
|
| 148 |
-
|
|
|
|
| 149 |
glu = gate * torch.sigmoid(gate * alpha)
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
return
|
| 153 |
-
|
| 154 |
|
| 155 |
# Shared expert MLP forward pass
|
| 156 |
def shared_mlp_forward(
|
|
@@ -184,13 +375,13 @@ def shared_mlp_forward(
|
|
| 184 |
|
| 185 |
# Up projection
|
| 186 |
x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
|
| 187 |
-
|
| 188 |
# Activation
|
| 189 |
x = activation_fn(x)
|
| 190 |
-
|
| 191 |
# Down projection
|
| 192 |
x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
|
| 193 |
-
|
| 194 |
return x
|
| 195 |
|
| 196 |
|
|
@@ -657,6 +848,7 @@ def parallel_forward_once(
|
|
| 657 |
def moe_forward(
|
| 658 |
x: torch.Tensor,
|
| 659 |
router_weight: torch.Tensor,
|
|
|
|
| 660 |
moe_top_k: int,
|
| 661 |
moe_num_experts: int,
|
| 662 |
moe_jitter_eps: float = None,
|
|
@@ -682,6 +874,7 @@ def moe_forward(
|
|
| 682 |
logits, expert_weights, expert_indices = route_tokens(
|
| 683 |
x,
|
| 684 |
router_weight,
|
|
|
|
| 685 |
moe_top_k,
|
| 686 |
moe_num_experts,
|
| 687 |
moe_jitter_eps,
|
|
@@ -743,6 +936,7 @@ def moe_forward(
|
|
| 743 |
def moe_forward_with_shared_expert(
|
| 744 |
x: torch.Tensor,
|
| 745 |
router_weight: torch.Tensor,
|
|
|
|
| 746 |
moe_top_k: int,
|
| 747 |
moe_num_experts: int,
|
| 748 |
moe_jitter_eps: float = None,
|
|
@@ -775,6 +969,7 @@ def moe_forward_with_shared_expert(
|
|
| 775 |
expert_out, expert_weights, router_scores = moe_forward(
|
| 776 |
x=x,
|
| 777 |
router_weight=router_weight,
|
|
|
|
| 778 |
moe_top_k=moe_top_k,
|
| 779 |
moe_num_experts=moe_num_experts,
|
| 780 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -795,7 +990,7 @@ def moe_forward_with_shared_expert(
|
|
| 795 |
hidden_size=hidden_size,
|
| 796 |
mlp_impl=mlp_impl,
|
| 797 |
)
|
| 798 |
-
|
| 799 |
# If shared expert weights provided, compute shared expert output
|
| 800 |
if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
|
| 801 |
shared_expert_out = shared_mlp_forward(
|
|
@@ -807,7 +1002,7 @@ def moe_forward_with_shared_expert(
|
|
| 807 |
activation_fn=shared_activation_fn,
|
| 808 |
gradient_scale=gradient_scale,
|
| 809 |
)
|
| 810 |
-
|
| 811 |
# Combine expert outputs
|
| 812 |
combined_out = combine_expert_shared_outputs(
|
| 813 |
shared_expert_out=shared_expert_out,
|
|
@@ -815,9 +1010,9 @@ def moe_forward_with_shared_expert(
|
|
| 815 |
shared_expert_weighted_sum=shared_expert_weighted_sum,
|
| 816 |
moe_top_k=moe_top_k,
|
| 817 |
)
|
| 818 |
-
|
| 819 |
return combined_out, expert_weights, router_scores
|
| 820 |
-
|
| 821 |
# Return regular MoE output if no shared expert
|
| 822 |
return expert_out, expert_weights, router_scores
|
| 823 |
|
|
@@ -833,7 +1028,7 @@ def create_shared_expert_weights(
|
|
| 833 |
|
| 834 |
if output_layer_init_method is None:
|
| 835 |
output_layer_init_method = init_method
|
| 836 |
-
|
| 837 |
# Create weight tensors
|
| 838 |
up_proj_weight = torch.empty(
|
| 839 |
shared_expert_hidden_size,
|
|
@@ -847,14 +1042,15 @@ def create_shared_expert_weights(
|
|
| 847 |
device=device,
|
| 848 |
dtype=dtype,
|
| 849 |
)
|
| 850 |
-
|
| 851 |
# Initialize weights
|
| 852 |
init_method(up_proj_weight)
|
| 853 |
output_layer_init_method(down_proj_weight)
|
| 854 |
-
|
| 855 |
# No bias by default
|
| 856 |
return up_proj_weight, down_proj_weight, None, None
|
| 857 |
|
|
|
|
| 858 |
# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
|
| 859 |
# This exists because device_mesh is trapped in hook closures with no model attribute
|
| 860 |
# Fragile - breaks if hook structure changes or Python internals change
|
|
@@ -863,14 +1059,21 @@ def get_device_mesh(model):
|
|
| 863 |
# Extract device_mesh from child's unused pre_hook closure
|
| 864 |
try:
|
| 865 |
# Find the pre-hook that contains 'device_mesh' in its closure
|
| 866 |
-
hook = next(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 867 |
# Extract the device_mesh from the closure
|
| 868 |
-
return hook.__closure__[
|
|
|
|
|
|
|
| 869 |
except Exception:
|
| 870 |
return None
|
| 871 |
|
| 872 |
|
| 873 |
class MegaBlocksMoeMLP(torch.nn.Module):
|
|
|
|
| 874 |
|
| 875 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 876 |
moe_top_k = getattr(self.router, "top_k", 4)
|
|
@@ -879,7 +1082,9 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 879 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 880 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 881 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 882 |
-
moe_normalize_expert_weights = getattr(
|
|
|
|
|
|
|
| 883 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 884 |
|
| 885 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
@@ -887,15 +1092,21 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 887 |
device_mesh = get_device_mesh(self)
|
| 888 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 889 |
|
| 890 |
-
has_parallel =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 891 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 892 |
-
|
| 893 |
-
sort_end_bit = max(
|
|
|
|
|
|
|
| 894 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
| 895 |
-
|
| 896 |
output, expert_weights_out, *_ = moe_forward(
|
| 897 |
x=x,
|
| 898 |
router_weight=self.router.weight,
|
|
|
|
| 899 |
moe_top_k=moe_top_k,
|
| 900 |
moe_num_experts=moe_num_experts,
|
| 901 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -919,8 +1130,12 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 919 |
return output, expert_weights_out
|
| 920 |
|
| 921 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 922 |
class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
| 923 |
-
|
| 924 |
def __init__(self):
|
| 925 |
super().__init__()
|
| 926 |
# Shared expert weights will be set by the user
|
|
@@ -930,7 +1145,7 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 930 |
self.shared_down_proj_bias = None
|
| 931 |
self.shared_expert_weighted_sum = False
|
| 932 |
self.shared_activation_fn = None
|
| 933 |
-
|
| 934 |
def set_shared_expert_weights(
|
| 935 |
self,
|
| 936 |
up_proj_weight: torch.Tensor,
|
|
@@ -946,7 +1161,7 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 946 |
self.shared_down_proj_bias = down_proj_bias
|
| 947 |
self.shared_expert_weighted_sum = weighted_sum
|
| 948 |
self.shared_activation_fn = activation_fn
|
| 949 |
-
|
| 950 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 951 |
moe_top_k = getattr(self.router, "top_k", 4)
|
| 952 |
moe_num_experts = getattr(self.experts, "num_experts", 128)
|
|
@@ -954,7 +1169,9 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 954 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 955 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 956 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 957 |
-
moe_normalize_expert_weights = getattr(
|
|
|
|
|
|
|
| 958 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 959 |
|
| 960 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
@@ -962,15 +1179,22 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 962 |
device_mesh = get_device_mesh(self)
|
| 963 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 964 |
|
| 965 |
-
has_parallel =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 967 |
-
|
| 968 |
-
sort_end_bit = max(
|
|
|
|
|
|
|
| 969 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
| 970 |
-
|
| 971 |
output, expert_weights_out, *_ = moe_forward_with_shared_expert(
|
| 972 |
x=x,
|
| 973 |
router_weight=self.router.weight,
|
|
|
|
| 974 |
moe_top_k=moe_top_k,
|
| 975 |
moe_num_experts=moe_num_experts,
|
| 976 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -998,4 +1222,4 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 998 |
shared_expert_weighted_sum=self.shared_expert_weighted_sum,
|
| 999 |
shared_activation_fn=self.shared_activation_fn,
|
| 1000 |
)
|
| 1001 |
-
return output, expert_weights_out
|
|
|
|
| 1 |
import torch
|
| 2 |
import torch.distributed as dist
|
| 3 |
|
| 4 |
+
from typing import Optional, Any, TYPE_CHECKING
|
| 5 |
|
| 6 |
from . import _layers
|
| 7 |
from . import ops
|
| 8 |
|
| 9 |
+
# Conditional import for meta kernel registration
|
| 10 |
+
if TYPE_CHECKING:
|
| 11 |
+
|
| 12 |
+
def register_fake(fn):
|
| 13 |
+
return lambda name: fn
|
| 14 |
+
|
| 15 |
+
else:
|
| 16 |
+
try:
|
| 17 |
+
from torch.library import register_fake
|
| 18 |
+
except ImportError:
|
| 19 |
+
try:
|
| 20 |
+
from torch.library import impl_abstract as register_fake
|
| 21 |
+
except ImportError:
|
| 22 |
+
# Fallback for older PyTorch versions
|
| 23 |
+
def register_fake(op_name):
|
| 24 |
+
def decorator(fn):
|
| 25 |
+
return fn
|
| 26 |
+
|
| 27 |
+
return decorator
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Meta kernel implementations for torch.compile compatibility
|
| 31 |
+
def _install_meta_kernels():
|
| 32 |
+
"""Install meta kernels for existing MegaBlocks operations"""
|
| 33 |
+
|
| 34 |
+
# Create wrapper functions that check for compilation and return meta tensors
|
| 35 |
+
|
| 36 |
+
# Patch ops.sort
|
| 37 |
+
if hasattr(ops, "sort"):
|
| 38 |
+
original_sort = ops.sort
|
| 39 |
+
|
| 40 |
+
def sort_with_meta(x, end_bit=None):
|
| 41 |
+
if torch.compiler.is_compiling():
|
| 42 |
+
print("Using meta kernel for sort")
|
| 43 |
+
# Meta implementation - return tensors with correct shape/dtype/device
|
| 44 |
+
return torch.empty_like(x), torch.empty_like(x)
|
| 45 |
+
# print("Using original sort kernel")
|
| 46 |
+
return original_sort(x, end_bit)
|
| 47 |
+
|
| 48 |
+
ops.sort = sort_with_meta
|
| 49 |
+
|
| 50 |
+
# Patch ops.histogram
|
| 51 |
+
if hasattr(ops, "histogram"):
|
| 52 |
+
original_histogram = ops.histogram
|
| 53 |
+
|
| 54 |
+
def histogram_with_meta(x, max_val):
|
| 55 |
+
if torch.compiler.is_compiling():
|
| 56 |
+
# Meta implementation
|
| 57 |
+
return torch.empty((max_val,), dtype=torch.int32, device=x.device)
|
| 58 |
+
return original_histogram(x, max_val)
|
| 59 |
+
|
| 60 |
+
ops.histogram = histogram_with_meta
|
| 61 |
+
|
| 62 |
+
# Patch ops.inclusive_cumsum
|
| 63 |
+
if hasattr(ops, "inclusive_cumsum"):
|
| 64 |
+
original_inclusive_cumsum = ops.inclusive_cumsum
|
| 65 |
+
|
| 66 |
+
def inclusive_cumsum_with_meta(x, dim):
|
| 67 |
+
if torch.compiler.is_compiling():
|
| 68 |
+
# Meta implementation
|
| 69 |
+
return torch.empty_like(x)
|
| 70 |
+
return original_inclusive_cumsum(x, dim)
|
| 71 |
+
|
| 72 |
+
ops.inclusive_cumsum = inclusive_cumsum_with_meta
|
| 73 |
+
|
| 74 |
+
# Patch ops.binned_gather
|
| 75 |
+
if hasattr(ops, "binned_gather"):
|
| 76 |
+
original_binned_gather = ops.binned_gather
|
| 77 |
+
|
| 78 |
+
def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
|
| 79 |
+
if torch.compiler.is_compiling():
|
| 80 |
+
# Meta implementation - output shape based on bin_size
|
| 81 |
+
if x.dim() >= 2:
|
| 82 |
+
hidden_size = x.size(-1)
|
| 83 |
+
return torch.empty(
|
| 84 |
+
(bin_size, x.size(1), hidden_size),
|
| 85 |
+
dtype=x.dtype,
|
| 86 |
+
device=x.device,
|
| 87 |
+
)
|
| 88 |
+
else:
|
| 89 |
+
return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
|
| 90 |
+
return original_binned_gather(x, indices, bins, bin_size, top_k)
|
| 91 |
+
|
| 92 |
+
ops.binned_gather = binned_gather_with_meta
|
| 93 |
+
|
| 94 |
+
# Patch ops.binned_scatter
|
| 95 |
+
if hasattr(ops, "binned_scatter"):
|
| 96 |
+
original_binned_scatter = ops.binned_scatter
|
| 97 |
+
|
| 98 |
+
def binned_scatter_with_meta(x, indices, weights, bins, top_k):
|
| 99 |
+
if torch.compiler.is_compiling():
|
| 100 |
+
# Meta implementation - typically reduces to 2D
|
| 101 |
+
if x.dim() >= 3:
|
| 102 |
+
return torch.empty(
|
| 103 |
+
(x.size(1), x.size(2)), dtype=x.dtype, device=x.device
|
| 104 |
+
)
|
| 105 |
+
else:
|
| 106 |
+
return torch.empty_like(x)
|
| 107 |
+
return original_binned_scatter(x, indices, weights, bins, top_k)
|
| 108 |
+
|
| 109 |
+
ops.binned_scatter = binned_scatter_with_meta
|
| 110 |
+
|
| 111 |
+
# Patch ops.gather
|
| 112 |
+
if hasattr(ops, "gather"):
|
| 113 |
+
original_gather = ops.gather
|
| 114 |
+
|
| 115 |
+
def gather_with_meta(x, indices, bin_ids, bins, top_k):
|
| 116 |
+
if torch.compiler.is_compiling():
|
| 117 |
+
# Meta implementation
|
| 118 |
+
if x.dim() >= 2:
|
| 119 |
+
hidden_size = x.size(-1)
|
| 120 |
+
return torch.empty(
|
| 121 |
+
(indices.numel(), hidden_size), dtype=x.dtype, device=x.device
|
| 122 |
+
)
|
| 123 |
+
else:
|
| 124 |
+
return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
|
| 125 |
+
return original_gather(x, indices, bin_ids, bins, top_k)
|
| 126 |
+
|
| 127 |
+
ops.gather = gather_with_meta
|
| 128 |
+
|
| 129 |
+
# Patch ops.scatter
|
| 130 |
+
if hasattr(ops, "scatter"):
|
| 131 |
+
original_scatter = ops.scatter
|
| 132 |
+
|
| 133 |
+
def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
|
| 134 |
+
if torch.compiler.is_compiling():
|
| 135 |
+
# Meta implementation - restore sequence shape
|
| 136 |
+
seq_len = (
|
| 137 |
+
indices.size(0) // top_k
|
| 138 |
+
if indices.numel() > 0 and top_k > 0
|
| 139 |
+
else x.size(0)
|
| 140 |
+
)
|
| 141 |
+
if x.dim() >= 2:
|
| 142 |
+
return torch.empty(
|
| 143 |
+
(seq_len, x.size(-1)), dtype=x.dtype, device=x.device
|
| 144 |
+
)
|
| 145 |
+
else:
|
| 146 |
+
return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
|
| 147 |
+
return original_scatter(x, indices, bin_ids, weights, bins, top_k)
|
| 148 |
+
|
| 149 |
+
ops.scatter = scatter_with_meta
|
| 150 |
+
|
| 151 |
+
# Patch ops.replicate
|
| 152 |
+
if hasattr(ops, "replicate"):
|
| 153 |
+
original_replicate = ops.replicate
|
| 154 |
+
|
| 155 |
+
def replicate_with_meta(x, bins, num_outputs):
|
| 156 |
+
if torch.compiler.is_compiling():
|
| 157 |
+
# Meta implementation
|
| 158 |
+
return torch.empty(
|
| 159 |
+
(x.shape[0], num_outputs), dtype=x.dtype, device=x.device
|
| 160 |
+
)
|
| 161 |
+
return original_replicate(x, bins, num_outputs)
|
| 162 |
+
|
| 163 |
+
ops.replicate = replicate_with_meta
|
| 164 |
+
|
| 165 |
+
# Patch ops.repeat (if it's a regular function)
|
| 166 |
+
if hasattr(ops, "repeat"):
|
| 167 |
+
original_repeat = ops.repeat
|
| 168 |
+
|
| 169 |
+
def repeat_with_meta(x, repeats):
|
| 170 |
+
if torch.compiler.is_compiling():
|
| 171 |
+
# Meta implementation
|
| 172 |
+
if isinstance(repeats, (tuple, list)):
|
| 173 |
+
new_shape = list(x.shape)
|
| 174 |
+
for i, rep in enumerate(repeats):
|
| 175 |
+
if i < len(new_shape):
|
| 176 |
+
new_shape[i] *= rep
|
| 177 |
+
return torch.empty(new_shape, dtype=x.dtype, device=x.device)
|
| 178 |
+
else:
|
| 179 |
+
new_shape = [x.size(0) * repeats] + list(x.shape[1:])
|
| 180 |
+
return torch.empty(new_shape, dtype=x.dtype, device=x.device)
|
| 181 |
+
return original_repeat(x, repeats)
|
| 182 |
+
|
| 183 |
+
ops.repeat = repeat_with_meta
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# Install meta kernels on import
|
| 187 |
+
try:
|
| 188 |
+
_install_meta_kernels()
|
| 189 |
+
except Exception as e:
|
| 190 |
+
# If meta kernel installation fails, continue without them
|
| 191 |
+
# torch.compile may not work but the library will still function
|
| 192 |
+
import warnings
|
| 193 |
+
|
| 194 |
+
warnings.warn(
|
| 195 |
+
f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
|
| 199 |
# Set the expert model parallel attributes on a tensor
|
| 200 |
def set_expert_model_parallel_attributes(
|
|
|
|
| 269 |
def route_tokens(
|
| 270 |
x: torch.Tensor,
|
| 271 |
router_weight: torch.Tensor,
|
| 272 |
+
router_bias: torch.Tensor,
|
| 273 |
moe_top_k: int,
|
| 274 |
moe_num_experts: int,
|
| 275 |
moe_jitter_eps: float = None,
|
|
|
|
| 281 |
x = apply_jitter(x, moe_jitter_eps)
|
| 282 |
|
| 283 |
x_flat = x.view(-1, x.shape[-1])
|
| 284 |
+
logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
|
| 285 |
expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
|
| 286 |
expert_weights = expert_weights.softmax(dim=-1)
|
| 287 |
if moe_normalize_expert_weights is not None:
|
|
|
|
| 319 |
w2_bias: torch.Tensor,
|
| 320 |
gradient_scale: Optional[float] = None,
|
| 321 |
alpha: float = 1.702,
|
| 322 |
+
limit: float = 7.0,
|
| 323 |
):
|
| 324 |
# Scale weights
|
| 325 |
w1 = scale_grad(w1, gradient_scale)
|
|
|
|
| 335 |
|
| 336 |
# Forward pass
|
| 337 |
gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
|
| 338 |
+
gate, up = gate_up[..., ::2], gate_up[..., 1::2]
|
| 339 |
+
gate = gate.clamp(min=None, max=limit)
|
| 340 |
+
up = up.clamp(min=-limit, max=limit)
|
| 341 |
glu = gate * torch.sigmoid(gate * alpha)
|
| 342 |
+
next_states = torch.bmm(((up + 1) * glu), w2)
|
| 343 |
+
next_states += w2_bias[..., None, :]
|
| 344 |
+
return next_states
|
|
|
|
| 345 |
|
| 346 |
# Shared expert MLP forward pass
|
| 347 |
def shared_mlp_forward(
|
|
|
|
| 375 |
|
| 376 |
# Up projection
|
| 377 |
x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
|
| 378 |
+
|
| 379 |
# Activation
|
| 380 |
x = activation_fn(x)
|
| 381 |
+
|
| 382 |
# Down projection
|
| 383 |
x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
|
| 384 |
+
|
| 385 |
return x
|
| 386 |
|
| 387 |
|
|
|
|
| 848 |
def moe_forward(
|
| 849 |
x: torch.Tensor,
|
| 850 |
router_weight: torch.Tensor,
|
| 851 |
+
router_bias: Optional[torch.Tensor],
|
| 852 |
moe_top_k: int,
|
| 853 |
moe_num_experts: int,
|
| 854 |
moe_jitter_eps: float = None,
|
|
|
|
| 874 |
logits, expert_weights, expert_indices = route_tokens(
|
| 875 |
x,
|
| 876 |
router_weight,
|
| 877 |
+
router_bias,
|
| 878 |
moe_top_k,
|
| 879 |
moe_num_experts,
|
| 880 |
moe_jitter_eps,
|
|
|
|
| 936 |
def moe_forward_with_shared_expert(
|
| 937 |
x: torch.Tensor,
|
| 938 |
router_weight: torch.Tensor,
|
| 939 |
+
router_bias: Optional[torch.Tensor],
|
| 940 |
moe_top_k: int,
|
| 941 |
moe_num_experts: int,
|
| 942 |
moe_jitter_eps: float = None,
|
|
|
|
| 969 |
expert_out, expert_weights, router_scores = moe_forward(
|
| 970 |
x=x,
|
| 971 |
router_weight=router_weight,
|
| 972 |
+
router_bias=router_bias,
|
| 973 |
moe_top_k=moe_top_k,
|
| 974 |
moe_num_experts=moe_num_experts,
|
| 975 |
moe_jitter_eps=moe_jitter_eps,
|
|
|
|
| 990 |
hidden_size=hidden_size,
|
| 991 |
mlp_impl=mlp_impl,
|
| 992 |
)
|
| 993 |
+
|
| 994 |
# If shared expert weights provided, compute shared expert output
|
| 995 |
if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
|
| 996 |
shared_expert_out = shared_mlp_forward(
|
|
|
|
| 1002 |
activation_fn=shared_activation_fn,
|
| 1003 |
gradient_scale=gradient_scale,
|
| 1004 |
)
|
| 1005 |
+
|
| 1006 |
# Combine expert outputs
|
| 1007 |
combined_out = combine_expert_shared_outputs(
|
| 1008 |
shared_expert_out=shared_expert_out,
|
|
|
|
| 1010 |
shared_expert_weighted_sum=shared_expert_weighted_sum,
|
| 1011 |
moe_top_k=moe_top_k,
|
| 1012 |
)
|
| 1013 |
+
|
| 1014 |
return combined_out, expert_weights, router_scores
|
| 1015 |
+
|
| 1016 |
# Return regular MoE output if no shared expert
|
| 1017 |
return expert_out, expert_weights, router_scores
|
| 1018 |
|
|
|
|
| 1028 |
|
| 1029 |
if output_layer_init_method is None:
|
| 1030 |
output_layer_init_method = init_method
|
| 1031 |
+
|
| 1032 |
# Create weight tensors
|
| 1033 |
up_proj_weight = torch.empty(
|
| 1034 |
shared_expert_hidden_size,
|
|
|
|
| 1042 |
device=device,
|
| 1043 |
dtype=dtype,
|
| 1044 |
)
|
| 1045 |
+
|
| 1046 |
# Initialize weights
|
| 1047 |
init_method(up_proj_weight)
|
| 1048 |
output_layer_init_method(down_proj_weight)
|
| 1049 |
+
|
| 1050 |
# No bias by default
|
| 1051 |
return up_proj_weight, down_proj_weight, None, None
|
| 1052 |
|
| 1053 |
+
|
| 1054 |
# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
|
| 1055 |
# This exists because device_mesh is trapped in hook closures with no model attribute
|
| 1056 |
# Fragile - breaks if hook structure changes or Python internals change
|
|
|
|
| 1059 |
# Extract device_mesh from child's unused pre_hook closure
|
| 1060 |
try:
|
| 1061 |
# Find the pre-hook that contains 'device_mesh' in its closure
|
| 1062 |
+
hook = next(
|
| 1063 |
+
h
|
| 1064 |
+
for h in model.experts._forward_pre_hooks.values()
|
| 1065 |
+
if "device_mesh" in h.__code__.co_freevars
|
| 1066 |
+
)
|
| 1067 |
# Extract the device_mesh from the closure
|
| 1068 |
+
return hook.__closure__[
|
| 1069 |
+
hook.__code__.co_freevars.index("device_mesh")
|
| 1070 |
+
].cell_contents
|
| 1071 |
except Exception:
|
| 1072 |
return None
|
| 1073 |
|
| 1074 |
|
| 1075 |
class MegaBlocksMoeMLP(torch.nn.Module):
|
| 1076 |
+
can_torch_compile: bool = True
|
| 1077 |
|
| 1078 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 1079 |
moe_top_k = getattr(self.router, "top_k", 4)
|
|
|
|
| 1082 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 1083 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 1084 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 1085 |
+
moe_normalize_expert_weights = getattr(
|
| 1086 |
+
self.experts, "normalize_expert_weights", None
|
| 1087 |
+
)
|
| 1088 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 1089 |
|
| 1090 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
|
|
| 1092 |
device_mesh = get_device_mesh(self)
|
| 1093 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 1094 |
|
| 1095 |
+
has_parallel = (
|
| 1096 |
+
expert_parallel_group is not None
|
| 1097 |
+
and dist.is_initialized()
|
| 1098 |
+
and dist.get_world_size(expert_parallel_group) > 1
|
| 1099 |
+
)
|
| 1100 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 1101 |
+
|
| 1102 |
+
sort_end_bit = max(
|
| 1103 |
+
int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
|
| 1104 |
+
)
|
| 1105 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
|
|
|
| 1106 |
output, expert_weights_out, *_ = moe_forward(
|
| 1107 |
x=x,
|
| 1108 |
router_weight=self.router.weight,
|
| 1109 |
+
router_bias=self.router.bias,
|
| 1110 |
moe_top_k=moe_top_k,
|
| 1111 |
moe_num_experts=moe_num_experts,
|
| 1112 |
moe_jitter_eps=moe_jitter_eps,
|
|
|
|
| 1130 |
return output, expert_weights_out
|
| 1131 |
|
| 1132 |
|
| 1133 |
+
# Export main classes
|
| 1134 |
+
__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
|
| 1135 |
+
|
| 1136 |
+
|
| 1137 |
class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
| 1138 |
+
|
| 1139 |
def __init__(self):
|
| 1140 |
super().__init__()
|
| 1141 |
# Shared expert weights will be set by the user
|
|
|
|
| 1145 |
self.shared_down_proj_bias = None
|
| 1146 |
self.shared_expert_weighted_sum = False
|
| 1147 |
self.shared_activation_fn = None
|
| 1148 |
+
|
| 1149 |
def set_shared_expert_weights(
|
| 1150 |
self,
|
| 1151 |
up_proj_weight: torch.Tensor,
|
|
|
|
| 1161 |
self.shared_down_proj_bias = down_proj_bias
|
| 1162 |
self.shared_expert_weighted_sum = weighted_sum
|
| 1163 |
self.shared_activation_fn = activation_fn
|
| 1164 |
+
|
| 1165 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 1166 |
moe_top_k = getattr(self.router, "top_k", 4)
|
| 1167 |
moe_num_experts = getattr(self.experts, "num_experts", 128)
|
|
|
|
| 1169 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 1170 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 1171 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 1172 |
+
moe_normalize_expert_weights = getattr(
|
| 1173 |
+
self.experts, "normalize_expert_weights", None
|
| 1174 |
+
)
|
| 1175 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 1176 |
|
| 1177 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
|
|
| 1179 |
device_mesh = get_device_mesh(self)
|
| 1180 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 1181 |
|
| 1182 |
+
has_parallel = (
|
| 1183 |
+
expert_parallel_group is not None
|
| 1184 |
+
and dist.is_initialized()
|
| 1185 |
+
and dist.get_world_size(expert_parallel_group) > 1
|
| 1186 |
+
)
|
| 1187 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 1188 |
+
|
| 1189 |
+
sort_end_bit = max(
|
| 1190 |
+
int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
|
| 1191 |
+
)
|
| 1192 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
| 1193 |
+
|
| 1194 |
output, expert_weights_out, *_ = moe_forward_with_shared_expert(
|
| 1195 |
x=x,
|
| 1196 |
router_weight=self.router.weight,
|
| 1197 |
+
router_bias=self.router.bias,
|
| 1198 |
moe_top_k=moe_top_k,
|
| 1199 |
moe_num_experts=moe_num_experts,
|
| 1200 |
moe_jitter_eps=moe_jitter_eps,
|
|
|
|
| 1222 |
shared_expert_weighted_sum=self.shared_expert_weighted_sum,
|
| 1223 |
shared_activation_fn=self.shared_activation_fn,
|
| 1224 |
)
|
| 1225 |
+
return output, expert_weights_out
|
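The large block added at the top of this layers.py (and repeated in every build variant) wraps each custom op in a thin Python shim: when `torch.compiler.is_compiling()` is true, the shim returns an empty tensor with the right shape, dtype and device instead of dispatching to the CUDA kernel, and otherwise it falls through to the original op. A standalone, hedged sketch of that gating pattern, using an ordinary histogram built on `torch.bincount` as a stand-in for the opaque kernel:

    import torch

    def _histogram_kernel(x: torch.Tensor, max_val: int) -> torch.Tensor:
        # Stand-in for an opaque custom kernel that torch.compile cannot trace.
        return torch.bincount(x, minlength=max_val).to(torch.int32)

    def histogram_with_meta(x: torch.Tensor, max_val: int) -> torch.Tensor:
        if torch.compiler.is_compiling():
            # Under compilation only shape/dtype/device matter, so a
            # placeholder tensor stands in for the kernel's result.
            return torch.empty((max_val,), dtype=torch.int32, device=x.device)
        return _histogram_kernel(x, max_val)

    # Eager call hits the real kernel:
    print(histogram_with_meta(torch.tensor([0, 1, 1, 3]), max_val=4))
    # tensor([1, 2, 0, 1], dtype=torch.int32)

The placeholder branch lets shape propagation proceed while tracing even though the real kernel itself is not traceable.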
build/torch28-cxx11-cu129-x86_64-linux/megablocks/{_megablocks_3bdb4b8_dirty.abi3.so → _megablocks_8176cbe_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 1    version https://git-lfs.github.com/spec/v1
 2 -  oid sha256:
 2 +  oid sha256:b84b6d64ceb3ef6f5cca709cdca1ec8c79c500b6bbb636c003a7d72fb58e6acf
 3    size 13585072
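A short aside on the `get_device_mesh` helper that each layers.py in this commit rewrites: it recovers a `DeviceMesh` that the transformers integration captured inside a forward pre-hook closure, by scanning `__code__.co_freevars` for the name `device_mesh` and reading the matching closure cell. The mechanics on a plain closure look like this (the names below are illustrative, not the actual hook):

    def make_hook(device_mesh):
        def hook(module, args):
            # device_mesh is captured from the enclosing scope, so it
            # becomes a free variable of this closure.
            _ = device_mesh
            return args
        return hook

    hook = make_hook(device_mesh={"mesh": "example"})

    # Same lookup the helper performs: find the free variable by name,
    # then read its value out of the corresponding closure cell.
    if "device_mesh" in hook.__code__.co_freevars:
        idx = hook.__code__.co_freevars.index("device_mesh")
        print(hook.__closure__[idx].cell_contents)  # {'mesh': 'example'}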
build/torch28-cxx11-cu129-x86_64-linux/megablocks/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 1    import torch
 2 -  from . import
 3 -  ops = torch.ops.
 2 +  from . import _megablocks_8176cbe_dirty
 3 +  ops = torch.ops._megablocks_8176cbe_dirty
 4
 5    def add_op_namespace_prefix(op_name: str):
 6    """
 7    Prefix op by namespace.
 8    """
 9 -  return f"
 9 +  return f"_megablocks_8176cbe_dirty::{op_name}"
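Another change threaded through every copy of layers.py is the router bias: `route_tokens`, `moe_forward` and the module `forward` methods now pass `router_bias` through to `torch.nn.functional.linear`, so routers that carry a bias no longer have it silently dropped. A hedged, self-contained sketch of that routing step with made-up sizes:

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    tokens, hidden, num_experts, top_k = 6, 16, 4, 2

    x = torch.randn(tokens, hidden)
    router_weight = torch.randn(num_experts, hidden)
    router_bias = torch.randn(num_experts)

    # Router logits now include the bias term.
    logits = F.linear(x, router_weight, router_bias)         # (tokens, num_experts)
    expert_weights, expert_indices = logits.topk(top_k, dim=-1)
    expert_weights = expert_weights.softmax(dim=-1)          # normalize over the k picks

    print(expert_indices.shape, expert_weights.shape)
    # torch.Size([6, 2]) torch.Size([6, 2])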
build/torch28-cxx11-cu129-x86_64-linux/megablocks/layers.py
CHANGED
|
@@ -1,11 +1,200 @@
|
|
| 1 |
import torch
|
| 2 |
import torch.distributed as dist
|
| 3 |
|
| 4 |
-
from typing import Optional, Any
|
| 5 |
|
| 6 |
from . import _layers
|
| 7 |
from . import ops
|
| 8 |
|
| 9 |
|
| 10 |
# Set the expert model parallel attributes on a tensor
|
| 11 |
def set_expert_model_parallel_attributes(
|
|
@@ -80,6 +269,7 @@ def compute_top_k(scores: torch.Tensor, moe_top_k: int):
|
|
| 80 |
def route_tokens(
|
| 81 |
x: torch.Tensor,
|
| 82 |
router_weight: torch.Tensor,
|
|
|
|
| 83 |
moe_top_k: int,
|
| 84 |
moe_num_experts: int,
|
| 85 |
moe_jitter_eps: float = None,
|
|
@@ -91,7 +281,7 @@ def route_tokens(
|
|
| 91 |
x = apply_jitter(x, moe_jitter_eps)
|
| 92 |
|
| 93 |
x_flat = x.view(-1, x.shape[-1])
|
| 94 |
-
logits = torch.nn.functional.linear(x_flat, router_weight)
|
| 95 |
expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
|
| 96 |
expert_weights = expert_weights.softmax(dim=-1)
|
| 97 |
if moe_normalize_expert_weights is not None:
|
|
@@ -129,6 +319,7 @@ def mlp_forward(
|
|
| 129 |
w2_bias: torch.Tensor,
|
| 130 |
gradient_scale: Optional[float] = None,
|
| 131 |
alpha: float = 1.702,
|
|
|
|
| 132 |
):
|
| 133 |
# Scale weights
|
| 134 |
w1 = scale_grad(w1, gradient_scale)
|
|
@@ -144,13 +335,13 @@ def mlp_forward(
|
|
| 144 |
|
| 145 |
# Forward pass
|
| 146 |
gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
|
| 147 |
-
gate, up = gate_up
|
| 148 |
-
|
|
|
|
| 149 |
glu = gate * torch.sigmoid(gate * alpha)
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
return
|
| 153 |
-
|
| 154 |
|
| 155 |
# Shared expert MLP forward pass
|
| 156 |
def shared_mlp_forward(
|
|
@@ -184,13 +375,13 @@ def shared_mlp_forward(
|
|
| 184 |
|
| 185 |
# Up projection
|
| 186 |
x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
|
| 187 |
-
|
| 188 |
# Activation
|
| 189 |
x = activation_fn(x)
|
| 190 |
-
|
| 191 |
# Down projection
|
| 192 |
x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
|
| 193 |
-
|
| 194 |
return x
|
| 195 |
|
| 196 |
|
|
@@ -657,6 +848,7 @@ def parallel_forward_once(
|
|
| 657 |
def moe_forward(
|
| 658 |
x: torch.Tensor,
|
| 659 |
router_weight: torch.Tensor,
|
|
|
|
| 660 |
moe_top_k: int,
|
| 661 |
moe_num_experts: int,
|
| 662 |
moe_jitter_eps: float = None,
|
|
@@ -682,6 +874,7 @@ def moe_forward(
|
|
| 682 |
logits, expert_weights, expert_indices = route_tokens(
|
| 683 |
x,
|
| 684 |
router_weight,
|
|
|
|
| 685 |
moe_top_k,
|
| 686 |
moe_num_experts,
|
| 687 |
moe_jitter_eps,
|
|
@@ -743,6 +936,7 @@ def moe_forward(
|
|
| 743 |
def moe_forward_with_shared_expert(
|
| 744 |
x: torch.Tensor,
|
| 745 |
router_weight: torch.Tensor,
|
|
|
|
| 746 |
moe_top_k: int,
|
| 747 |
moe_num_experts: int,
|
| 748 |
moe_jitter_eps: float = None,
|
|
@@ -775,6 +969,7 @@ def moe_forward_with_shared_expert(
|
|
| 775 |
expert_out, expert_weights, router_scores = moe_forward(
|
| 776 |
x=x,
|
| 777 |
router_weight=router_weight,
|
|
|
|
| 778 |
moe_top_k=moe_top_k,
|
| 779 |
moe_num_experts=moe_num_experts,
|
| 780 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -795,7 +990,7 @@ def moe_forward_with_shared_expert(
|
|
| 795 |
hidden_size=hidden_size,
|
| 796 |
mlp_impl=mlp_impl,
|
| 797 |
)
|
| 798 |
-
|
| 799 |
# If shared expert weights provided, compute shared expert output
|
| 800 |
if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
|
| 801 |
shared_expert_out = shared_mlp_forward(
|
|
@@ -807,7 +1002,7 @@ def moe_forward_with_shared_expert(
|
|
| 807 |
activation_fn=shared_activation_fn,
|
| 808 |
gradient_scale=gradient_scale,
|
| 809 |
)
|
| 810 |
-
|
| 811 |
# Combine expert outputs
|
| 812 |
combined_out = combine_expert_shared_outputs(
|
| 813 |
shared_expert_out=shared_expert_out,
|
|
@@ -815,9 +1010,9 @@ def moe_forward_with_shared_expert(
|
|
| 815 |
shared_expert_weighted_sum=shared_expert_weighted_sum,
|
| 816 |
moe_top_k=moe_top_k,
|
| 817 |
)
|
| 818 |
-
|
| 819 |
return combined_out, expert_weights, router_scores
|
| 820 |
-
|
| 821 |
# Return regular MoE output if no shared expert
|
| 822 |
return expert_out, expert_weights, router_scores
|
| 823 |
|
|
@@ -833,7 +1028,7 @@ def create_shared_expert_weights(
|
|
| 833 |
|
| 834 |
if output_layer_init_method is None:
|
| 835 |
output_layer_init_method = init_method
|
| 836 |
-
|
| 837 |
# Create weight tensors
|
| 838 |
up_proj_weight = torch.empty(
|
| 839 |
shared_expert_hidden_size,
|
|
@@ -847,14 +1042,15 @@ def create_shared_expert_weights(
|
|
| 847 |
device=device,
|
| 848 |
dtype=dtype,
|
| 849 |
)
|
| 850 |
-
|
| 851 |
# Initialize weights
|
| 852 |
init_method(up_proj_weight)
|
| 853 |
output_layer_init_method(down_proj_weight)
|
| 854 |
-
|
| 855 |
# No bias by default
|
| 856 |
return up_proj_weight, down_proj_weight, None, None
|
| 857 |
|
|
|
|
| 858 |
# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
|
| 859 |
# This exists because device_mesh is trapped in hook closures with no model attribute
|
| 860 |
# Fragile - breaks if hook structure changes or Python internals change
|
|
@@ -863,14 +1059,21 @@ def get_device_mesh(model):
|
|
| 863 |
# Extract device_mesh from child's unused pre_hook closure
|
| 864 |
try:
|
| 865 |
# Find the pre-hook that contains 'device_mesh' in its closure
|
| 866 |
-
hook = next(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 867 |
# Extract the device_mesh from the closure
|
| 868 |
-
return hook.__closure__[
|
|
|
|
|
|
|
| 869 |
except Exception:
|
| 870 |
return None
|
| 871 |
|
| 872 |
|
| 873 |
class MegaBlocksMoeMLP(torch.nn.Module):
|
|
|
|
| 874 |
|
| 875 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 876 |
moe_top_k = getattr(self.router, "top_k", 4)
|
|
@@ -879,7 +1082,9 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 879 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 880 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 881 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 882 |
-
moe_normalize_expert_weights = getattr(
|
|
|
|
|
|
|
| 883 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 884 |
|
| 885 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
@@ -887,15 +1092,21 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 887 |
device_mesh = get_device_mesh(self)
|
| 888 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 889 |
|
| 890 |
-
has_parallel =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 891 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 892 |
-
|
| 893 |
-
sort_end_bit = max(
|
|
|
|
|
|
|
| 894 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
| 895 |
-
|
| 896 |
output, expert_weights_out, *_ = moe_forward(
|
| 897 |
x=x,
|
| 898 |
router_weight=self.router.weight,
|
|
|
|
| 899 |
moe_top_k=moe_top_k,
|
| 900 |
moe_num_experts=moe_num_experts,
|
| 901 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -919,8 +1130,12 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 919 |
return output, expert_weights_out
|
| 920 |
|
| 921 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 922 |
class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
| 923 |
-
|
| 924 |
def __init__(self):
|
| 925 |
super().__init__()
|
| 926 |
# Shared expert weights will be set by the user
|
|
@@ -930,7 +1145,7 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 930 |
self.shared_down_proj_bias = None
|
| 931 |
self.shared_expert_weighted_sum = False
|
| 932 |
self.shared_activation_fn = None
|
| 933 |
-
|
| 934 |
def set_shared_expert_weights(
|
| 935 |
self,
|
| 936 |
up_proj_weight: torch.Tensor,
|
|
@@ -946,7 +1161,7 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 946 |
self.shared_down_proj_bias = down_proj_bias
|
| 947 |
self.shared_expert_weighted_sum = weighted_sum
|
| 948 |
self.shared_activation_fn = activation_fn
|
| 949 |
-
|
| 950 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 951 |
moe_top_k = getattr(self.router, "top_k", 4)
|
| 952 |
moe_num_experts = getattr(self.experts, "num_experts", 128)
|
|
@@ -954,7 +1169,9 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 954 |
alpha = getattr(self.experts, "alpha", 1.0)
|
| 955 |
moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
|
| 956 |
moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
|
| 957 |
-
moe_normalize_expert_weights = getattr(
|
|
|
|
|
|
|
| 958 |
uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
|
| 959 |
|
| 960 |
expert_parallel_group = getattr(self, "expert_parallel_group", None)
|
|
@@ -962,15 +1179,22 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 962 |
device_mesh = get_device_mesh(self)
|
| 963 |
expert_parallel_group = device_mesh.get_group() if device_mesh else None
|
| 964 |
|
| 965 |
-
has_parallel =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
forward_fn = parallel_forward_once if has_parallel else forward_once
|
| 967 |
-
|
| 968 |
-
sort_end_bit = max(
|
|
|
|
|
|
|
| 969 |
mlp_impl = getattr(self, "mlp_impl", "grouped")
|
| 970 |
-
|
| 971 |
output, expert_weights_out, *_ = moe_forward_with_shared_expert(
|
| 972 |
x=x,
|
| 973 |
router_weight=self.router.weight,
|
|
|
|
| 974 |
moe_top_k=moe_top_k,
|
| 975 |
moe_num_experts=moe_num_experts,
|
| 976 |
moe_jitter_eps=moe_jitter_eps,
|
|
@@ -998,4 +1222,4 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 998 |
shared_expert_weighted_sum=self.shared_expert_weighted_sum,
|
| 999 |
shared_activation_fn=self.shared_activation_fn,
|
| 1000 |
)
|
| 1001 |
-
return output, expert_weights_out
|
|
|
|
| 1 |
import torch
|
| 2 |
import torch.distributed as dist
|
| 3 |
|
| 4 |
+
from typing import Optional, Any, TYPE_CHECKING
|
| 5 |
|
| 6 |
from . import _layers
|
| 7 |
from . import ops
|
| 8 |
|
| 9 |
+
# Conditional import for meta kernel registration
|
| 10 |
+
if TYPE_CHECKING:
|
| 11 |
+
|
| 12 |
+
def register_fake(fn):
|
| 13 |
+
return lambda name: fn
|
| 14 |
+
|
| 15 |
+
else:
|
| 16 |
+
try:
|
| 17 |
+
from torch.library import register_fake
|
| 18 |
+
except ImportError:
|
| 19 |
+
try:
|
| 20 |
+
from torch.library import impl_abstract as register_fake
|
| 21 |
+
except ImportError:
|
| 22 |
+
# Fallback for older PyTorch versions
|
| 23 |
+
def register_fake(op_name):
|
| 24 |
+
def decorator(fn):
|
| 25 |
+
return fn
|
| 26 |
+
|
| 27 |
+
return decorator
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Meta kernel implementations for torch.compile compatibility
|
| 31 |
+
def _install_meta_kernels():
|
| 32 |
+
"""Install meta kernels for existing MegaBlocks operations"""
|
| 33 |
+
|
| 34 |
+
# Create wrapper functions that check for compilation and return meta tensors
|
| 35 |
+
|
| 36 |
+
# Patch ops.sort
|
| 37 |
+
if hasattr(ops, "sort"):
|
| 38 |
+
original_sort = ops.sort
|
| 39 |
+
|
| 40 |
+
def sort_with_meta(x, end_bit=None):
|
| 41 |
+
if torch.compiler.is_compiling():
|
| 42 |
+
print("Using meta kernel for sort")
|
| 43 |
+
# Meta implementation - return tensors with correct shape/dtype/device
|
| 44 |
+
return torch.empty_like(x), torch.empty_like(x)
|
| 45 |
+
# print("Using original sort kernel")
|
| 46 |
+
return original_sort(x, end_bit)
|
| 47 |
+
|
| 48 |
+
ops.sort = sort_with_meta
|
| 49 |
+
|
| 50 |
+
# Patch ops.histogram
|
| 51 |
+
if hasattr(ops, "histogram"):
|
| 52 |
+
original_histogram = ops.histogram
|
| 53 |
+
|
| 54 |
+
def histogram_with_meta(x, max_val):
|
| 55 |
+
if torch.compiler.is_compiling():
|
| 56 |
+
# Meta implementation
|
| 57 |
+
return torch.empty((max_val,), dtype=torch.int32, device=x.device)
|
| 58 |
+
return original_histogram(x, max_val)
|
| 59 |
+
|
| 60 |
+
ops.histogram = histogram_with_meta
|
| 61 |
+
|
| 62 |
+
# Patch ops.inclusive_cumsum
|
| 63 |
+
if hasattr(ops, "inclusive_cumsum"):
|
| 64 |
+
original_inclusive_cumsum = ops.inclusive_cumsum
|
| 65 |
+
|
| 66 |
+
def inclusive_cumsum_with_meta(x, dim):
|
| 67 |
+
if torch.compiler.is_compiling():
|
| 68 |
+
# Meta implementation
|
| 69 |
+
return torch.empty_like(x)
|
| 70 |
+
return original_inclusive_cumsum(x, dim)
|
| 71 |
+
|
| 72 |
+
ops.inclusive_cumsum = inclusive_cumsum_with_meta
|
| 73 |
+
|
| 74 |
+
# Patch ops.binned_gather
|
| 75 |
+
if hasattr(ops, "binned_gather"):
|
| 76 |
+
original_binned_gather = ops.binned_gather
|
| 77 |
+
|
| 78 |
+
def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
|
| 79 |
+
if torch.compiler.is_compiling():
|
| 80 |
+
# Meta implementation - output shape based on bin_size
|
| 81 |
+
if x.dim() >= 2:
|
| 82 |
+
hidden_size = x.size(-1)
|
| 83 |
+
return torch.empty(
|
| 84 |
+
(bin_size, x.size(1), hidden_size),
|
| 85 |
+
dtype=x.dtype,
|
| 86 |
+
device=x.device,
|
| 87 |
+
)
|
| 88 |
+
else:
|
| 89 |
+
return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
|
| 90 |
+
return original_binned_gather(x, indices, bins, bin_size, top_k)
|
| 91 |
+
|
| 92 |
+
ops.binned_gather = binned_gather_with_meta
|
| 93 |
+
|
| 94 |
+
# Patch ops.binned_scatter
|
| 95 |
+
if hasattr(ops, "binned_scatter"):
|
| 96 |
+
original_binned_scatter = ops.binned_scatter
|
| 97 |
+
|
| 98 |
+
def binned_scatter_with_meta(x, indices, weights, bins, top_k):
|
| 99 |
+
if torch.compiler.is_compiling():
|
| 100 |
+
# Meta implementation - typically reduces to 2D
|
| 101 |
+
if x.dim() >= 3:
|
| 102 |
+
return torch.empty(
|
| 103 |
+
(x.size(1), x.size(2)), dtype=x.dtype, device=x.device
|
| 104 |
+
)
|
| 105 |
+
else:
|
| 106 |
+
return torch.empty_like(x)
|
| 107 |
+
return original_binned_scatter(x, indices, weights, bins, top_k)
|
| 108 |
+
|
| 109 |
+
ops.binned_scatter = binned_scatter_with_meta
|
| 110 |
+
|
| 111 |
+
# Patch ops.gather
|
| 112 |
+
if hasattr(ops, "gather"):
|
| 113 |
+
original_gather = ops.gather
|
| 114 |
+
|
| 115 |
+
def gather_with_meta(x, indices, bin_ids, bins, top_k):
|
| 116 |
+
if torch.compiler.is_compiling():
|
| 117 |
+
# Meta implementation
|
| 118 |
+
if x.dim() >= 2:
|
| 119 |
+
hidden_size = x.size(-1)
|
| 120 |
+
return torch.empty(
|
| 121 |
+
(indices.numel(), hidden_size), dtype=x.dtype, device=x.device
|
| 122 |
+
)
|
| 123 |
+
else:
|
| 124 |
+
return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
|
| 125 |
+
return original_gather(x, indices, bin_ids, bins, top_k)
|
| 126 |
+
|
| 127 |
+
ops.gather = gather_with_meta
|
| 128 |
+
|
| 129 |
+
# Patch ops.scatter
|
| 130 |
+
if hasattr(ops, "scatter"):
|
| 131 |
+
original_scatter = ops.scatter
|
| 132 |
+
|
| 133 |
+
def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
|
| 134 |
+
if torch.compiler.is_compiling():
|
| 135 |
+
# Meta implementation - restore sequence shape
|
| 136 |
+
seq_len = (
|
| 137 |
+
indices.size(0) // top_k
|
| 138 |
+
if indices.numel() > 0 and top_k > 0
|
| 139 |
+
else x.size(0)
|
| 140 |
+
)
|
| 141 |
+
if x.dim() >= 2:
|
| 142 |
+
return torch.empty(
|
| 143 |
+
(seq_len, x.size(-1)), dtype=x.dtype, device=x.device
|
| 144 |
+
)
|
| 145 |
+
else:
|
| 146 |
+
return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
|
| 147 |
+
return original_scatter(x, indices, bin_ids, weights, bins, top_k)
|
| 148 |
+
|
| 149 |
+
ops.scatter = scatter_with_meta
|
| 150 |
+
|
| 151 |
+
# Patch ops.replicate
|
| 152 |
+
if hasattr(ops, "replicate"):
|
| 153 |
+
original_replicate = ops.replicate
|
| 154 |
+
|
| 155 |
+
def replicate_with_meta(x, bins, num_outputs):
|
| 156 |
+
if torch.compiler.is_compiling():
|
| 157 |
+
# Meta implementation
|
| 158 |
+
return torch.empty(
|
| 159 |
+
(x.shape[0], num_outputs), dtype=x.dtype, device=x.device
|
| 160 |
+
)
|
| 161 |
+
return original_replicate(x, bins, num_outputs)
|
| 162 |
+
|
| 163 |
+
ops.replicate = replicate_with_meta
|
| 164 |
+
|
| 165 |
+
# Patch ops.repeat (if it's a regular function)
|
| 166 |
+
if hasattr(ops, "repeat"):
|
| 167 |
+
original_repeat = ops.repeat
|
| 168 |
+
|
| 169 |
+
def repeat_with_meta(x, repeats):
|
| 170 |
+
if torch.compiler.is_compiling():
|
| 171 |
+
# Meta implementation
|
| 172 |
+
if isinstance(repeats, (tuple, list)):
|
| 173 |
+
new_shape = list(x.shape)
|
| 174 |
+
for i, rep in enumerate(repeats):
|
| 175 |
+
if i < len(new_shape):
|
| 176 |
+
new_shape[i] *= rep
|
| 177 |
+
return torch.empty(new_shape, dtype=x.dtype, device=x.device)
|
| 178 |
+
else:
|
| 179 |
+
new_shape = [x.size(0) * repeats] + list(x.shape[1:])
|
| 180 |
+
return torch.empty(new_shape, dtype=x.dtype, device=x.device)
|
| 181 |
+
return original_repeat(x, repeats)
|
| 182 |
+
|
| 183 |
+
ops.repeat = repeat_with_meta
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# Install meta kernels on import
|
| 187 |
+
try:
|
| 188 |
+
_install_meta_kernels()
|
| 189 |
+
except Exception as e:
|
| 190 |
+
# If meta kernel installation fails, continue without them
|
| 191 |
+
# torch.compile may not work but the library will still function
|
| 192 |
+
import warnings
|
| 193 |
+
|
| 194 |
+
warnings.warn(
|
| 195 |
+
f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
|
| 199 |
# Set the expert model parallel attributes on a tensor
|
| 200 |
def set_expert_model_parallel_attributes(
|
|
|
|
| 269 |
def route_tokens(
|
| 270 |
x: torch.Tensor,
|
| 271 |
router_weight: torch.Tensor,
|
| 272 |
+
router_bias: torch.Tensor,
|
| 273 |
moe_top_k: int,
|
| 274 |
moe_num_experts: int,
|
| 275 |
moe_jitter_eps: float = None,
|
|
|
|
| 281 |
x = apply_jitter(x, moe_jitter_eps)
|
| 282 |
|
| 283 |
x_flat = x.view(-1, x.shape[-1])
|
| 284 |
+
logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
|
| 285 |
expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
|
| 286 |
expert_weights = expert_weights.softmax(dim=-1)
|
| 287 |
if moe_normalize_expert_weights is not None:
|
|
|
|
| 319 |
w2_bias: torch.Tensor,
|
| 320 |
gradient_scale: Optional[float] = None,
|
| 321 |
alpha: float = 1.702,
|
| 322 |
+
limit: float = 7.0,
|
| 323 |
):
|
| 324 |
# Scale weights
|
| 325 |
w1 = scale_grad(w1, gradient_scale)
|
|
|
|
| 335 |
|
| 336 |
# Forward pass
|
| 337 |
gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
|
| 338 |
+
gate, up = gate_up[..., ::2], gate_up[..., 1::2]
|
| 339 |
+
gate = gate.clamp(min=None, max=limit)
|
| 340 |
+
up = up.clamp(min=-limit, max=limit)
|
| 341 |
glu = gate * torch.sigmoid(gate * alpha)
|
| 342 |
+
next_states = torch.bmm(((up + 1) * glu), w2)
|
| 343 |
+
next_states += w2_bias[..., None, :]
|
| 344 |
+
return next_states
|
|
|
|
| 345 |
|
| 346 |
# Shared expert MLP forward pass
|
| 347 |
def shared_mlp_forward(
|
|
|
|
| 375 |
|
| 376 |
# Up projection
|
| 377 |
x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
|
| 378 |
+
|
| 379 |
# Activation
|
| 380 |
x = activation_fn(x)
|
| 381 |
+
|
| 382 |
# Down projection
|
| 383 |
x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
|
| 384 |
+
|
| 385 |
return x
|
| 386 |
|
| 387 |
|
|
|
|
| 848 |
def moe_forward(
|
| 849 |
x: torch.Tensor,
|
| 850 |
router_weight: torch.Tensor,
|
| 851 |
+
router_bias: Optional[torch.Tensor],
|
| 852 |
moe_top_k: int,
|
| 853 |
moe_num_experts: int,
|
| 854 |
moe_jitter_eps: float = None,
|
|
|
|
| 874 |
logits, expert_weights, expert_indices = route_tokens(
|
| 875 |
x,
|
| 876 |
router_weight,
|
| 877 |
+
router_bias,
|
| 878 |
moe_top_k,
|
| 879 |
moe_num_experts,
|
| 880 |
moe_jitter_eps,
|
|
|
|
| 936 |
def moe_forward_with_shared_expert(
|
| 937 |
x: torch.Tensor,
|
| 938 |
router_weight: torch.Tensor,
|
| 939 |
+
router_bias: Optional[torch.Tensor],
|
| 940 |
moe_top_k: int,
|
| 941 |
moe_num_experts: int,
|
| 942 |
moe_jitter_eps: float = None,
|
|
|
|
| 969 |
expert_out, expert_weights, router_scores = moe_forward(
|
| 970 |
x=x,
|
| 971 |
router_weight=router_weight,
|
| 972 |
+
router_bias=router_bias,
|
| 973 |
moe_top_k=moe_top_k,
|
| 974 |
moe_num_experts=moe_num_experts,
|
| 975 |
moe_jitter_eps=moe_jitter_eps,
|
|
|
|
| 990 |
hidden_size=hidden_size,
|
| 991 |
mlp_impl=mlp_impl,
|
| 992 |
)
|
| 993 |
+
|
| 994 |
# If shared expert weights provided, compute shared expert output
|
| 995 |
if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
|
| 996 |
shared_expert_out = shared_mlp_forward(
|
|
|
|
| 1002 |
activation_fn=shared_activation_fn,
|
| 1003 |
gradient_scale=gradient_scale,
|
| 1004 |
)
|
| 1005 |
+
|
| 1006 |
# Combine expert outputs
|
| 1007 |
combined_out = combine_expert_shared_outputs(
|
| 1008 |
shared_expert_out=shared_expert_out,
|
|
|
|
| 1010 |
shared_expert_weighted_sum=shared_expert_weighted_sum,
|
| 1011 |
moe_top_k=moe_top_k,
|
| 1012 |
)
|
| 1013 |
+
|
| 1014 |
return combined_out, expert_weights, router_scores
|
| 1015 |
+
|
| 1016 |
# Return regular MoE output if no shared expert
|
| 1017 |
return expert_out, expert_weights, router_scores
|
| 1018 |
|
|
|
|
| 1028 |
|
| 1029 |
if output_layer_init_method is None:
|
| 1030 |
output_layer_init_method = init_method
|
| 1031 |
+
|
| 1032 |
# Create weight tensors
|
| 1033 |
up_proj_weight = torch.empty(
|
| 1034 |
shared_expert_hidden_size,
|
|
|
|
| 1042 |
device=device,
|
| 1043 |
dtype=dtype,
|
| 1044 |
)
|
| 1045 |
+
|
| 1046 |
# Initialize weights
|
| 1047 |
init_method(up_proj_weight)
|
| 1048 |
output_layer_init_method(down_proj_weight)
|
| 1049 |
+
|
| 1050 |
# No bias by default
|
| 1051 |
return up_proj_weight, down_proj_weight, None, None
|
| 1052 |
|
| 1053 |
+
|
| 1054 |
# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
|
| 1055 |
# This exists because device_mesh is trapped in hook closures with no model attribute
|
| 1056 |
# Fragile - breaks if hook structure changes or Python internals change
|
|
|
|
| 1059 |
# Extract device_mesh from child's unused pre_hook closure
|
| 1060 |
try:
|
| 1061 |
# Find the pre-hook that contains 'device_mesh' in its closure
|
| 1062 |
+
hook = next(
|
| 1063 |
+
h
|
| 1064 |
+
for h in model.experts._forward_pre_hooks.values()
|
| 1065 |
+
if "device_mesh" in h.__code__.co_freevars
|
| 1066 |
+
)
|
| 1067 |
# Extract the device_mesh from the closure
|
| 1068 |
+
return hook.__closure__[
|
| 1069 |
+
hook.__code__.co_freevars.index("device_mesh")
|
| 1070 |
+
].cell_contents
|
| 1071 |
except Exception:
|
| 1072 |
return None
|
| 1073 |
|
| 1074 |
|
 class MegaBlocksMoeMLP(torch.nn.Module):
+    can_torch_compile: bool = True

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         moe_top_k = getattr(self.router, "top_k", 4)
@@ ... @@
         alpha = getattr(self.experts, "alpha", 1.0)
         moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
         moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+        moe_normalize_expert_weights = getattr(
+            self.experts, "normalize_expert_weights", None
+        )
         uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)

         expert_parallel_group = getattr(self, "expert_parallel_group", None)
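`normalize_expert_weights` is only read here and passed along; the usual meaning of such a setting (an assumption about the downstream kernel code, which is not shown in this hunk) is a p-norm renormalization of the top-k routing weights:

import torch

def maybe_normalize(expert_weights: torch.Tensor, norm_p=None) -> torch.Tensor:
    if norm_p is None:
        return expert_weights
    return expert_weights / torch.norm(expert_weights, p=norm_p, dim=-1, keepdim=True)

w = torch.tensor([[0.5, 0.25, 0.25]])
print(maybe_normalize(w, norm_p=1))   # unchanged: already sums to 1
print(maybe_normalize(w, norm_p=2))   # rescaled to unit L2 norm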
@@ ... @@
             device_mesh = get_device_mesh(self)
             expert_parallel_group = device_mesh.get_group() if device_mesh else None

+        has_parallel = (
+            expert_parallel_group is not None
+            and dist.is_initialized()
+            and dist.get_world_size(expert_parallel_group) > 1
+        )
         forward_fn = parallel_forward_once if has_parallel else forward_once
+
+        sort_end_bit = max(
+            int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+        )
         mlp_impl = getattr(self, "mlp_impl", "grouped")
         output, expert_weights_out, *_ = moe_forward(
             x=x,
             router_weight=self.router.weight,
+            router_bias=self.router.bias,
             moe_top_k=moe_top_k,
             moe_num_experts=moe_num_experts,
             moe_jitter_eps=moe_jitter_eps,
@@ ... @@
         return output, expert_weights_out
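Two small pieces of glue added above are worth spelling out: the expert-parallel path is only taken when a real process group with more than one rank exists, and `sort_end_bit` is the number of bits a radix sort over expert indices has to cover. A quick standalone sanity check of the same expressions (helper names are made up for the demo):

import torch
import torch.distributed as dist

def needs_parallel(expert_parallel_group) -> bool:
    return (
        expert_parallel_group is not None
        and dist.is_initialized()
        and dist.get_world_size(expert_parallel_group) > 1
    )

def sort_end_bit(moe_num_experts: int) -> int:
    return max(int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1)

print(needs_parallel(None))   # False: single-process fallback
print(sort_end_bit(128))      # 7  (128 expert ids fit in 7 bits)
print(sort_end_bit(1))        # 1  (clamped to at least one bit)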
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
 class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
     def __init__(self):
         super().__init__()
         # Shared expert weights will be set by the user
@@ ... @@
         self.shared_down_proj_bias = None
         self.shared_expert_weighted_sum = False
         self.shared_activation_fn = None
+
     def set_shared_expert_weights(
         self,
         up_proj_weight: torch.Tensor,
@@ ... @@
         self.shared_down_proj_bias = down_proj_bias
         self.shared_expert_weighted_sum = weighted_sum
         self.shared_activation_fn = activation_fn
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         moe_top_k = getattr(self.router, "top_k", 4)
         moe_num_experts = getattr(self.experts, "num_experts", 128)
@@ ... @@
         alpha = getattr(self.experts, "alpha", 1.0)
         moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
         moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+        moe_normalize_expert_weights = getattr(
+            self.experts, "normalize_expert_weights", None
+        )
         uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)

         expert_parallel_group = getattr(self, "expert_parallel_group", None)
@@ ... @@
             device_mesh = get_device_mesh(self)
             expert_parallel_group = device_mesh.get_group() if device_mesh else None

+        has_parallel = (
+            expert_parallel_group is not None
+            and dist.is_initialized()
+            and dist.get_world_size(expert_parallel_group) > 1
+        )
         forward_fn = parallel_forward_once if has_parallel else forward_once
+
+        sort_end_bit = max(
+            int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+        )
         mlp_impl = getattr(self, "mlp_impl", "grouped")
+
         output, expert_weights_out, *_ = moe_forward_with_shared_expert(
             x=x,
             router_weight=self.router.weight,
+            router_bias=self.router.bias,
             moe_top_k=moe_top_k,
             moe_num_experts=moe_num_experts,
             moe_jitter_eps=moe_jitter_eps,
@@ ... @@
             shared_expert_weighted_sum=self.shared_expert_weighted_sum,
             shared_activation_fn=self.shared_activation_fn,
         )
+        return output, expert_weights_out
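Putting the pieces together, a rough usage sketch for the shared-expert variant. The router/experts sub-modules are expected to be attached by the host model, the weight tensors and shapes below are invented for illustration, and `torch.compile` is what the `can_torch_compile = True` flag in this commit is advertising:

import torch

mlp = MegaBlocksMoeMLPWithSharedExpert()
# ... attach mlp.router and mlp.experts here (done by the host model), then e.g.:
# mlp.set_shared_expert_weights(
#     up_proj_weight=shared_up,
#     down_proj_weight=shared_down,
#     weighted_sum=True,
#     activation_fn=torch.nn.functional.silu,
# )

if getattr(mlp, "can_torch_compile", False):
    mlp = torch.compile(mlp)

x = torch.randn(2, 16, 1024, device="cuda", dtype=torch.bfloat16)
output, expert_weights = mlp(x)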