AbstractPhil committed on
Commit d7a1a55 · verified · 1 Parent(s): 7009caa

Update penta_vit_model_v1.py

Files changed (1):
  1. penta_vit_model_v1.py +72 -74
penta_vit_model_v1.py CHANGED
@@ -1,9 +1,7 @@
  """
  PentachoraViT: Vision Transformer with Pentachoron Geometric Structure
  Enhanced with Geometric Attention for improved head cohesion and generalization
-
- Author: AbstractPhil
-
+ FIXED: All parameters initialized at module creation time (no lazy init)
  """

  import torch
@@ -44,7 +42,7 @@ class PentachoraConfig:
  return (self.img_size // self.patch_size) ** 2

  # ============================================
- # GEOMETRIC ATTENTION COMPONENTS (OPTIMIZED)
+ # GEOMETRIC ATTENTION COMPONENTS (FIXED INIT)
  # ============================================

  def perfect_4simplex(device):
@@ -70,42 +68,42 @@ class GeometricConfig:
  fuse_alpha: float = 0.7
  phases: Tuple[float, ...] = (0.0, math.pi/2, math.pi, 3*math.pi/2)
  jitter: float = 0.02
- shift: float = 0.25
+ shift: float = 0.71
  rotate_cycle: int = 11
  use_phase_variance: bool = False
  geometry_type: str = "pentachoron"

  class GeometricNavigator(nn.Module):
- """Maps inputs to geometric regions in 4D space - OPTIMIZED with vectorized operations."""
+ """Maps inputs to geometric regions in 4D space - FIXED with immediate initialization."""

- def __init__(self, input_dim: int, num_regions: int, config: GeometricConfig, num_heads: int = 1):
+ def __init__(self, input_dim: int, num_regions: int, config: GeometricConfig, num_heads: int = 1, device=None):
  super().__init__()
  self.input_dim = input_dim
  self.num_regions = num_regions
  self.config = config
  self.num_heads = num_heads

+ # Use CPU by default if device not specified
+ if device is None:
+ device = torch.device('cpu')
+
  # Create separate parameters for each head if num_heads > 1
  if num_heads > 1:
- self.to_nav = nn.Parameter(torch.randn(num_heads, input_dim, 4) * 0.02)
- self.vertex_w = nn.Parameter(torch.zeros(num_heads, num_regions, 5))
+ self.to_nav = nn.Parameter(torch.randn(num_heads, input_dim, 4, device=device) * 0.02)
+ self.vertex_w = nn.Parameter(torch.zeros(num_heads, num_regions, 5, device=device))
  else:
  self.to_nav = nn.Linear(input_dim, 4, bias=False)
- self.vertex_w = nn.Parameter(torch.zeros(num_regions, 5))
+ self.vertex_w = nn.Parameter(torch.zeros(num_regions, 5, device=device))

  # Pre-compute phase tensors for vectorization
- self.register_buffer('phase_cos', torch.cos(torch.tensor(config.phases, dtype=torch.float32)))
- self.register_buffer('phase_sin', torch.sin(torch.tensor(config.phases, dtype=torch.float32)))
+ self.register_buffer('phase_cos', torch.cos(torch.tensor(config.phases, dtype=torch.float32, device=device)))
+ self.register_buffer('phase_sin', torch.sin(torch.tensor(config.phases, dtype=torch.float32, device=device)))

- # Initialize geometry after module is created
- self.register_parameter('D', None)
- self.register_parameter('S', None)
-
- def _lazy_init_geometry(self, device):
- """Initialize geometry on first forward pass."""
- if self.D is not None:
- return
+ # Initialize geometry immediately at creation time
+ self._init_geometry(device)

+ def _init_geometry(self, device):
+ """Initialize geometry at module creation time."""
  base = perfect_4simplex(device)

  if self.num_heads > 1:
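The hunk above swaps the lazy `_lazy_init_geometry` path for an `_init_geometry(device)` call made from `__init__`. A minimal sketch of why that matters, using a hypothetical stand-in rather than the real `GeometricNavigator`: when every tensor exists at construction, `.cuda()`, optimizer setup, and `torch.compile` all see the full parameter set, whereas lazily registered parameters only appear after the first forward pass.

import torch
import torch.nn as nn

class EagerGeometry(nn.Module):
    # Hypothetical stand-in, not the repo's GeometricNavigator: all tensors are
    # created in __init__ on an explicit device, never on the first forward.
    def __init__(self, input_dim: int, num_regions: int, device=None):
        super().__init__()
        device = device or torch.device('cpu')
        self.to_nav = nn.Parameter(torch.randn(input_dim, 4, device=device) * 0.02)
        self.vertices = nn.Parameter(torch.randn(num_regions, 5, 4, device=device) * 0.02)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        nav = x @ self.to_nav                                            # [B, 4]
        # Distance from each navigated point to every vertex of every region.
        d = (nav[:, None, None, :] - self.vertices[None]).norm(dim=-1)   # [B, R, 5]
        return -d.amin(dim=-1)                                           # region scores [B, R]

m = EagerGeometry(input_dim=64, num_regions=8)
opt = torch.optim.AdamW(m.parameters(), lr=1e-3)   # optimizer sees every parameter up front
print(m(torch.randn(2, 64)).shape)                 # torch.Size([2, 8])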
@@ -143,8 +141,6 @@ class GeometricNavigator(nn.Module):

  def navigate(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
  """Navigate inputs through geometric space - OPTIMIZED with vectorized phase computation."""
- self._lazy_init_geometry(x.device)
-
  if self.num_heads > 1:
  # Batched navigation for multiple heads
  BT, H, head_dim = x.shape
@@ -159,8 +155,8 @@
  s_disp = -softmin_over_last(d_disp, self.config.softmin_tau)

  # OPTIMIZED: Vectorized phase computation (no loop)
- cos_phases = self.phase_cos.to(x.device).view(-1, 1, 1, 1, 1)
- sin_phases = self.phase_sin.to(x.device).view(-1, 1, 1, 1, 1)
+ cos_phases = self.phase_cos.view(-1, 1, 1, 1, 1)
+ sin_phases = self.phase_sin.view(-1, 1, 1, 1, 1)

  # Compute all phase variants at once [phases, H, regions, 5, 4]
  Vt_all = cos_phases * self.D.unsqueeze(0) + sin_phases * self.S.unsqueeze(0)
@@ -193,8 +189,8 @@
  w = F.softmax(self.vertex_w, dim=1)

  # OPTIMIZED: Vectorized phase computation for single head
- cos_phases = self.phase_cos.to(x.device).view(-1, 1, 1, 1)
- sin_phases = self.phase_sin.to(x.device).view(-1, 1, 1, 1)
+ cos_phases = self.phase_cos.view(-1, 1, 1, 1)
+ sin_phases = self.phase_sin.view(-1, 1, 1, 1)

  Vt_all = cos_phases * self.D.unsqueeze(0) + sin_phases * self.S.unsqueeze(0)
  w_expanded = w.unsqueeze(0).unsqueeze(-1)
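The `.to(x.device)` calls on `phase_cos` / `phase_sin` disappear in the new version because tensors registered via `register_buffer` already follow the module through `.to()` / `.cuda()`. A small illustrative sketch (a hypothetical `PhaseTable`, not code from this file) of that behavior and of broadcasting all phase variants in one shot:

import math
import torch
import torch.nn as nn

class PhaseTable(nn.Module):
    # Hypothetical mini-module: buffers registered here move with the module,
    # so forward() needs no per-call .to(x.device) transfer.
    def __init__(self, phases=(0.0, math.pi / 2, math.pi, 3 * math.pi / 2)):
        super().__init__()
        t = torch.tensor(phases, dtype=torch.float32)
        self.register_buffer('phase_cos', torch.cos(t))
        self.register_buffer('phase_sin', torch.sin(t))

    def forward(self, D: torch.Tensor, S: torch.Tensor) -> torch.Tensor:
        # All phase variants at once: [num_phases, *D.shape]
        cos = self.phase_cos.view(-1, *([1] * D.dim()))
        sin = self.phase_sin.view(-1, *([1] * D.dim()))
        return cos * D.unsqueeze(0) + sin * S.unsqueeze(0)

table = PhaseTable()
if torch.cuda.is_available():
    table = table.cuda()              # buffers are carried along automatically
dev = table.phase_cos.device
out = table(torch.randn(3, 5, 4, device=dev), torch.randn(3, 5, 4, device=dev))
print(out.shape)                       # torch.Size([4, 3, 5, 4])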
@@ -217,10 +213,10 @@
  return {'scores': scores, 'diagnostics': diagnostics}

  class GeometricAttention(nn.Module):
- """Multi-head geometric attention with Q-K alignment - OPTIMIZED with batched processing."""
+ """Multi-head geometric attention with Q-K alignment - FIXED with proper device handling."""

  def __init__(self, dim: int, num_heads: int = 8, num_regions: Optional[int] = None,
- config: Optional[GeometricConfig] = None, dropout: float = 0.0):
+ config: Optional[GeometricConfig] = None, dropout: float = 0.0, device=None):
  super().__init__()
  self.dim = dim
  self.num_heads = num_heads
@@ -234,9 +230,9 @@ class GeometricAttention(nn.Module):
  self.config = config
  self.to_qkv = nn.Linear(dim, dim * 3, bias=False)

- # Create batched navigators
- self.q_navigator = GeometricNavigator(self.head_dim, num_regions, config, num_heads=num_heads)
- self.k_navigator = GeometricNavigator(self.head_dim, num_regions, config, num_heads=num_heads)
+ # Create batched navigators with device
+ self.q_navigator = GeometricNavigator(self.head_dim, num_regions, config, num_heads=num_heads, device=device)
+ self.k_navigator = GeometricNavigator(self.head_dim, num_regions, config, num_heads=num_heads, device=device)

  self.out_proj = nn.Linear(dim, dim)
  self.dropout = nn.Dropout(dropout)
@@ -342,10 +338,13 @@ class HierarchicalPentachoronCLS(nn.Module):

  def forward(self, batch_size: int, class_indices: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
  """Generate CLS tokens for batch."""
+ # Get class-specific pentachora
+ class_pentachora = self.class_pentachora # This is now a computed property
+
  if class_indices is not None and class_indices.shape[0] == batch_size:
- vertex_cls_vocab = self.class_pentachora[class_indices]
+ vertex_cls_vocab = class_pentachora[class_indices]
  else:
- vertex_cls_vocab = self.class_pentachora.mean(dim=0, keepdim=True)
+ vertex_cls_vocab = class_pentachora.mean(dim=0, keepdim=True)
  vertex_cls_vocab = vertex_cls_vocab.expand(batch_size, -1, -1)

  # Project from vocabulary dimension to model dimension
@@ -362,7 +361,8 @@

  def get_class_prototypes(self) -> torch.Tensor:
  """Get class prototypes in model dimension."""
- pentachora_model = self.vocab_to_model(self.class_pentachora)
+ class_pentachora = self.class_pentachora # Get computed pentachora
+ pentachora_model = self.vocab_to_model(class_pentachora)
  weights = F.softmax(self.vertex_weights, dim=0)
  prototypes = torch.einsum('cvd,v->cd', pentachora_model, weights)
  return prototypes
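Both methods above now read `self.class_pentachora` once into a local variable and index that, which the added comments attribute to `class_pentachora` becoming a computed property. The property itself is outside this diff; the sketch below is only a guess at the shape of that pattern, with a made-up `TinyCLSBank` standing in for `HierarchicalPentachoronCLS`:

import torch
import torch.nn as nn

class TinyCLSBank(nn.Module):
    # Hypothetical sketch only - the real class_pentachora property is not part
    # of this diff. It illustrates "computed property, read once per call".
    def __init__(self, num_classes: int, vocab_dim: int):
        super().__init__()
        self.base = nn.Parameter(torch.randn(num_classes, 5, vocab_dim) * 0.02)
        self.scale = nn.Parameter(torch.ones(1))

    @property
    def class_pentachora(self) -> torch.Tensor:
        # Derived on access, so callers grab it once and reuse the local tensor.
        return self.base * self.scale

    def forward(self, batch_size: int, class_indices=None) -> torch.Tensor:
        pentachora = self.class_pentachora          # single property read
        if class_indices is not None and class_indices.shape[0] == batch_size:
            return pentachora[class_indices]        # [B, 5, vocab_dim]
        return pentachora.mean(dim=0, keepdim=True).expand(batch_size, -1, -1)

bank = TinyCLSBank(num_classes=10, vocab_dim=16)
print(bank(batch_size=4, class_indices=torch.tensor([0, 3, 7, 9])).shape)  # [4, 5, 16]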
@@ -453,7 +453,7 @@
  """ViT block with geometric attention for structured layers."""
  def __init__(self, dim: int, heads: int = 8, mlp_ratio: float = 4.0,
  use_mesh: bool = True, dropout: float = 0., attn_dropout: float = 0.,
- drop_path: float = 0.):
+ drop_path: float = 0., device=None):
  super().__init__()
  self.norm1 = nn.LayerNorm(dim)

@@ -464,7 +464,8 @@
  num_heads=heads,
  num_regions=min(dim // heads, 16),
  config=GeometricConfig(),
- dropout=attn_dropout
+ dropout=attn_dropout,
+ device=device
  )
  else:
  # Standard multi-head attention for later layers
@@ -578,7 +579,8 @@ class PentachoraViT(nn.Module):
  use_mesh=(cfg.use_mesh_attention and i < cfg.preserve_structure_until_layer),
  dropout=cfg.dropout_rate,
  attn_dropout=cfg.dropout_rate,
- drop_path=dpr[i]
+ drop_path=dpr[i],
+ device=torch.device('cpu') # Initialize on CPU, will be moved later
  )
  for i in range(cfg.depth)
  ])
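These three hunks thread a `device` argument from the block list down through `PentachoronViTBlock` into `GeometricAttention`, defaulting to CPU so the whole module tree is materialized before any `.cuda()` call. A toy sketch of the pattern (hypothetical `Block` / `LeafAttn` classes, not the repo's):

import torch
import torch.nn as nn

class LeafAttn(nn.Module):
    # Hypothetical leaf module: allocates its parameters on the device it is given.
    def __init__(self, dim: int, device=None):
        super().__init__()
        device = device or torch.device('cpu')
        self.w = nn.Parameter(torch.zeros(dim, dim, device=device))

class Block(nn.Module):
    # Hypothetical block: simply forwards the device argument to its children.
    def __init__(self, dim: int, device=None):
        super().__init__()
        self.attn = LeafAttn(dim, device=device)

blocks = nn.ModuleList([Block(64, device=torch.device('cpu')) for _ in range(2)])
print(next(blocks.parameters()).device)   # cpu; a later .cuda() moves everything at once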
@@ -750,10 +752,10 @@
  vertex_flat = features['vertex_cls'].reshape(B, -1)
  aux_logits = self.head_aux(vertex_flat)

- # Geometric alignment scores - use class_pentachora directly
+ # Geometric alignment scores
  geometric_alignments = self.geometric_proj(
  features['patches'],
- self.cls_tokens.class_pentachora # Back to original
+ self.cls_tokens.class_pentachora
  )

  return {
@@ -822,18 +824,32 @@ MODEL_CONFIGS = {
  dropout_rate=0.0, drop_path_rate=0.0
  ),
  'pentachora_shock_xs_64d': PentachoraConfig(
- dim=64, depth=2, heads=8, mlp_ratio=4.0,
+ dim=64, depth=2, heads=8, mlp_ratio=1.0,
  preserve_structure_until_layer=4,
  dropout_rate=0.0, drop_path_rate=0.0
  ),
  'pentachora_shock_xs_128d': PentachoraConfig(
- dim=128, depth=2, heads=8, mlp_ratio=4.0,
+ dim=128, depth=2, heads=8, mlp_ratio=2.0,
  preserve_structure_until_layer=4,
+ vocab_dim=256,
+ dropout_rate=0.0, drop_path_rate=0.0
+ ),
+ 'vit_pixie_256_patch4': PentachoraConfig(
+ dim=256, depth=10, heads=16, mlp_ratio=1.0,
+ preserve_structure_until_layer=10,
+ vocab_dim=256, patch_size=4,
+ dropout_rate=0.0, drop_path_rate=0.0
+ ),
+ 'vit_pixie_256_patch2': PentachoraConfig(
+ dim=256, depth=10, heads=16, mlp_ratio=1.0,
+ preserve_structure_until_layer=10,
+ vocab_dim=256, patch_size=2,
  dropout_rate=0.0, drop_path_rate=0.0
  ),
  'pentachora_shock_xs_256d': PentachoraConfig(
  dim=256, depth=2, heads=8, mlp_ratio=4.0,
- preserve_structure_until_layer=4,
+ preserve_structure_until_layer=4,
+ vocab_dim=128,
  dropout_rate=0.0, drop_path_rate=0.0
  ),
  'pentachora_shock_xs_512d': PentachoraConfig(
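The two new `vit_pixie_256` entries mainly differ in `patch_size`, which drives the token count through the `num_patches` property shown in the second hunk of this diff. A quick back-of-envelope check for the 32x32 images used later in `__main__` (the helper below is illustrative, not part of the file):

def num_patches(img_size: int, patch_size: int) -> int:
    # Same formula as PentachoraConfig.num_patches: (img_size // patch_size) ** 2
    return (img_size // patch_size) ** 2

print(num_patches(32, 4))   # 64 tokens per image for vit_pixie_256_patch4
print(num_patches(32, 2))   # 256 tokens per image for vit_pixie_256_patch2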
@@ -983,7 +999,7 @@ def extract_features(model: PentachoraViT,

  def test_model():
  """Test model creation and forward pass."""
- print("Testing Optimized PentachoraViT Model")
+ print("Testing Fixed PentachoraViT Model")
  print("=" * 50)

  # Test different variants
@@ -1041,10 +1057,10 @@ def test_model():

  if __name__ == "__main__":
  # Run tests
- #test_model()
+ test_model()

- # Example: Create model for A100 training
- print("\nExample: Creating optimized model for A100 training")
+ # Example: Create model for training
+ print("\nExample: Creating model with proper initialization")
  model = pentachora_shock_xs_256d(
  img_size=32,
  num_classes=100,
@@ -1053,41 +1069,23 @@ if __name__ == "__main__":
  drop_path_rate=0.0
  )

- # Move model to CUDA first if available
+ # All parameters are initialized immediately
+ print(f"Model has {count_parameters(model)['total']:,} parameters")
+ print("All geometric parameters initialized at creation time")
+
+ # Move model to CUDA if available
  if torch.cuda.is_available():
  model = model.cuda()
  print("Model moved to CUDA")

- # Now try torch.compile (PyTorch 2.0+)
- # Model reformatted to allow eager compiling, speeds along training substantially.
+ # Now torch.compile should work without issues
  if hasattr(torch, 'compile'):
  print("Compiling model with torch.compile...")
  try:
- model = torch.compile(model, backend="eager")
- print("Model compiled successfully")
+ model = torch.compile(model)
+ print("Model compiled successfully")
  except Exception as e:
  print(f"Compilation warning: {e}")
- print("Continuing without compilation - vectorized ops will still provide speedup")
-
- # Get parameter groups for optimizer
- param_groups = get_parameter_groups(model, weight_decay=0.05)
- print(f"Number of parameter groups: {len(param_groups)}")
-
- # Example batch - FULL PRECISION
- images = torch.randn(4, 3, 32, 32)
- targets = torch.randint(0, 100, (4,))
-
- if torch.cuda.is_available():
- images = images.cuda()
- targets = targets.cuda()
-
- # Forward pass in FULL PRECISION (no autocast)
- outputs = model(images)
-
- # Compute loss
- criterion = PentachoraLoss(aux_weight=0.3, geo_weight=0.1)
- loss = criterion(outputs, targets)
+ print("Continuing without compilation")

- print(f"Training loss (full precision): {loss.item():.4f}")
- print("\nModel ready for full precision A100 training!")
- print("Eager initialization ensures all parameters are created upfront")
+ print("\nModel ready for training with all parameters properly initialized!")