Fix import error by adding core module files
- app.py +1 -3
- core/byte_tokenizer_v6.py +298 -0
- core/unified_model.py +233 -80
app.py
CHANGED

@@ -13,9 +13,7 @@ import time
 from typing import List, Tuple, Dict, Generator
 # Removed matplotlib imports - using text display instead

-#
-parent_dir = Path(__file__).parent.parent.parent
-sys.path.insert(0, str(parent_dir / 'intelligent-tokenizer_v6.1.2'))
+# Import from local core directory
 from core.unified_model import IntelligentTokenizerModelV61
 from core.byte_tokenizer_v6 import ByteTokenizerV6

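Note: with core/ vendored next to app.py, the sys.path editing above is no longer needed. A minimal import smoke test (hypothetical; assumes it is run from the Space's root directory):

    # Verify the local core package resolves without path manipulation.
    from core.byte_tokenizer_v6 import ByteTokenizerV6
    from core.unified_model import IntelligentTokenizerModelV61

    tokenizer = ByteTokenizerV6(max_seq_len=64)   # prints vocab_size=260 on init
    model = IntelligentTokenizerModelV61()        # default v6.1.2 configuration
    print(type(tokenizer).__name__, type(model).__name__)
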
core/byte_tokenizer_v6.py
ADDED

@@ -0,0 +1,298 @@
+"""
+Byte-Level Tokenizer V6.1.2 - Compression-First Learning
+No vocabulary, no language rules - just bytes
+"""
+
+import torch
+from typing import List, Dict, Union, Optional
+import numpy as np
+
+
+class ByteTokenizerV6:
+    """
+    Pure byte-level tokenizer
+    - No vocabulary needed (bytes are 0-255)
+    - No language-specific rules
+    - Model learns all patterns from data
+    """
+
+    def __init__(self, max_seq_len: int = 64):
+        """Initialize byte tokenizer"""
+
+        self.max_seq_len = max_seq_len
+
+        # Special tokens (beyond byte range 0-255)
+        self.PAD = 256
+        self.BOS = 257
+        self.EOS = 258
+        self.MASK = 259
+
+        # Total vocabulary size = 256 bytes + 4 special tokens
+        self.vocab_size = 260
+
+        print(f"Byte tokenizer initialized (vocab_size={self.vocab_size})")
+
+    def encode(self, text: str, add_special_tokens: bool = True) -> Dict:
+        """
+        Encode text to byte IDs
+
+        Args:
+            text: Input text
+            add_special_tokens: Whether to add BOS/EOS
+
+        Returns:
+            dict with 'input_ids', 'attention_mask', 'length'
+        """
+        # Convert text to UTF-8 bytes (pure bytes, no rules)
+        byte_sequence = list(text.encode('utf-8'))
+
+        # Truncate if necessary
+        max_len = self.max_seq_len - 2 if add_special_tokens else self.max_seq_len
+        if len(byte_sequence) > max_len:
+            byte_sequence = byte_sequence[:max_len]
+
+        # Add special tokens
+        if add_special_tokens:
+            input_ids = [self.BOS] + byte_sequence + [self.EOS]
+        else:
+            input_ids = byte_sequence
+
+        # Create attention mask (1 for real tokens, 0 for padding)
+        attention_mask = [1] * len(input_ids)
+
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'length': len(input_ids)
+        }
+
+    def encode_batch(self, texts: List[str], add_special_tokens: bool = True) -> Dict:
+        """
+        Encode multiple texts with padding
+
+        Args:
+            texts: List of input texts
+            add_special_tokens: Whether to add special tokens
+
+        Returns:
+            Batched tensors with padding
+        """
+        encoded_texts = []
+        max_length = 0
+
+        # Encode each text
+        for text in texts:
+            encoded = self.encode(text, add_special_tokens)
+            encoded_texts.append(encoded)
+            max_length = max(max_length, encoded['length'])
+
+        # Limit to max sequence length
+        max_length = min(max_length, self.max_seq_len)
+
+        # Initialize batch tensors
+        batch_size = len(texts)
+        input_ids = np.full((batch_size, max_length), self.PAD, dtype=np.int64)
+        attention_mask = np.zeros((batch_size, max_length), dtype=np.float32)
+
+        # Fill batch tensors
+        for i, encoded in enumerate(encoded_texts):
+            seq_len = min(encoded['length'], max_length)
+            input_ids[i, :seq_len] = encoded['input_ids'][:seq_len]
+            attention_mask[i, :seq_len] = 1.0
+
+        return {
+            'input_ids': torch.tensor(input_ids, dtype=torch.long),
+            'attention_mask': torch.tensor(attention_mask, dtype=torch.float32),
+            'lengths': torch.tensor([e['length'] for e in encoded_texts], dtype=torch.long)
+        }
+
+    def decode(self, input_ids: Union[List[int], torch.Tensor, np.ndarray],
+               skip_special_tokens: bool = True) -> str:
+        """
+        Decode byte IDs back to text
+
+        Args:
+            input_ids: Byte ID sequence
+            skip_special_tokens: Whether to skip special tokens
+
+        Returns:
+            Decoded text string
+        """
+        # Convert to list if needed
+        if isinstance(input_ids, torch.Tensor):
+            input_ids = input_ids.cpu().numpy().tolist()
+        elif isinstance(input_ids, np.ndarray):
+            input_ids = input_ids.tolist()
+
+        # Filter special tokens if requested
+        if skip_special_tokens:
+            # Only keep actual bytes (0-255)
+            input_ids = [b for b in input_ids if 0 <= b <= 255]
+        else:
+            # Replace special tokens with readable markers
+            processed = []
+            for b in input_ids:
+                if b == self.PAD:
+                    continue  # Skip padding
+                elif b == self.BOS:
+                    processed.append(ord('['))  # Use [ for BOS
+                elif b == self.EOS:
+                    processed.append(ord(']'))  # Use ] for EOS
+                elif b == self.MASK:
+                    processed.append(ord('*'))  # Use * for MASK
+                elif 0 <= b <= 255:
+                    processed.append(b)
+            input_ids = processed
+
+        # Convert bytes to text
+        if not input_ids:
+            return ""
+
+        try:
+            # Extract only valid UTF-8 sequences
+            valid_bytes = []
+            i = 0
+            while i < len(input_ids):
+                b = input_ids[i]
+                if b < 128:  # ASCII
+                    valid_bytes.append(b)
+                    i += 1
+                elif 192 <= b < 224:  # 2-byte UTF-8
+                    if i + 1 < len(input_ids) and 128 <= input_ids[i+1] < 192:
+                        valid_bytes.extend(input_ids[i:i+2])
+                        i += 2
+                    else:
+                        i += 1  # Skip invalid
+                elif 224 <= b < 240:  # 3-byte UTF-8
+                    if i + 2 < len(input_ids) and all(128 <= input_ids[j] < 192 for j in range(i+1, min(i+3, len(input_ids)))):
+                        valid_bytes.extend(input_ids[i:i+3])
+                        i += 3
+                    else:
+                        i += 1  # Skip invalid
+                elif 240 <= b < 248:  # 4-byte UTF-8
+                    if i + 3 < len(input_ids) and all(128 <= input_ids[j] < 192 for j in range(i+1, min(i+4, len(input_ids)))):
+                        valid_bytes.extend(input_ids[i:i+4])
+                        i += 4
+                    else:
+                        i += 1  # Skip invalid
+                else:
+                    i += 1  # Skip invalid byte
+
+            # Decode valid bytes
+            if valid_bytes:
+                byte_array = bytes(valid_bytes)
+                text = byte_array.decode('utf-8', errors='replace')  # changed to errors='replace'
+                return text
+            else:
+                return ""
+        except Exception as e:
+            # Fallback: convert ASCII only
+            return "".join([chr(b) if b < 128 else '' for b in input_ids])
+
+    def decode_batch(self, input_ids: torch.Tensor, skip_special_tokens: bool = True) -> List[str]:
+        """
+        Decode a batch of byte sequences
+
+        Args:
+            input_ids: Batch of byte IDs (batch_size, seq_len)
+            skip_special_tokens: Whether to skip special tokens
+
+        Returns:
+            List of decoded texts
+        """
+        texts = []
+        for i in range(input_ids.shape[0]):
+            text = self.decode(input_ids[i], skip_special_tokens)
+            texts.append(text)
+        return texts
+
+    def tokenize(self, text: str) -> List[int]:
+        """
+        Simple tokenization to byte IDs (no special tokens)
+
+        Args:
+            text: Input text
+
+        Returns:
+            List of byte IDs
+        """
+        return list(text.encode('utf-8'))
+
+    def detokenize(self, byte_ids: List[int]) -> str:
+        """
+        Simple detokenization from byte IDs
+
+        Args:
+            byte_ids: List of byte IDs
+
+        Returns:
+            Decoded text
+        """
+        try:
+            return bytes(byte_ids).decode('utf-8', errors='replace')
+        except (ValueError, TypeError):
+            return "".join([chr(b) if b < 128 else '?' for b in byte_ids])
+
+    def get_vocab_size(self) -> int:
+        """Get vocabulary size"""
+        return self.vocab_size
+
+    def get_special_tokens(self) -> Dict[str, int]:
+        """Get special token IDs"""
+        return {
+            'pad_id': self.PAD,
+            'bos_id': self.BOS,
+            'eos_id': self.EOS,
+            'mask_id': self.MASK
+        }
+
+
+# Test code
+if __name__ == "__main__":
+    # Initialize tokenizer
+    tokenizer = ByteTokenizerV6()
+
+    # Test texts in multiple languages
+    test_texts = [
+        "Hello World!",
+        "안녕하세요",
+        "你好世界",
+        "こんにちは",
+        "مرحبا بالعالم",
+        "Здравствуй мир"
+    ]
+
+    print("=" * 50)
+    print("Single Text Encoding/Decoding Test")
+    print("=" * 50)
+
+    for text in test_texts:
+        print(f"\nOriginal: {text}")
+
+        # Encode
+        encoded = tokenizer.encode(text)
+        print(f"Encoded length: {encoded['length']}")
+        print(f"First 10 bytes: {encoded['input_ids'][:10]}")
+
+        # Decode
+        decoded = tokenizer.decode(encoded['input_ids'])
+        print(f"Decoded: {decoded}")
+        print(f"Match: {decoded == text}")
+
+    print("\n" + "=" * 50)
+    print("Batch Encoding/Decoding Test")
+    print("=" * 50)
+
+    # Batch test
+    batch_result = tokenizer.encode_batch(test_texts)
+    print(f"Batch shape: {batch_result['input_ids'].shape}")
+    print(f"Attention mask shape: {batch_result['attention_mask'].shape}")
+
+    # Decode batch
+    decoded_texts = tokenizer.decode_batch(batch_result['input_ids'])
+    print("\nBatch decoding results:")
+    for orig, dec in zip(test_texts, decoded_texts):
+        print(f"Original: {orig}")
+        print(f"Decoded: {dec}")
+        print(f"Match: {orig == dec}")
+        print()
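Because the tokenizer operates on raw UTF-8 bytes, sequence length tracks byte count, not character count. A quick illustration of what that means for the 64-byte chunks (plain Python, no model required):

    # ASCII costs 1 ID per character; Hangul syllables cost 3 bytes each.
    for s in ["Hello", "안녕하세요"]:
        raw = list(s.encode("utf-8"))
        print(s, len(raw), "bytes ->", len(raw) + 2, "IDs with BOS/EOS")
    # Hello: 5 bytes -> 7 IDs; 안녕하세요: 15 bytes -> 17 IDs
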
core/unified_model.py
CHANGED
@@ -1,6 +1,10 @@
 """
-Unified Intelligent Tokenizer Model v6.
-
+Unified Intelligent Tokenizer Model v6.1.2
+Compression-First Learning with Adaptive Splitting
+- 64 byte chunks for aggressive compression
+- 50 epoch checkpoints with automatic splitting
+- Group relation learning for reconstruction
+- Boundary adjustment for semantic units
 """

 import torch
@@ -48,7 +52,7 @@ class ByteTokenizer:
     Pure byte-level tokenizer - no language rules
     """

-    def __init__(self, max_seq_len: int =
+    def __init__(self, max_seq_len: int = 64):  # v6.1.2: 64 bytes for compression-first approach
         self.max_seq_len = max_seq_len
         self.PAD = 256
         self.BOS = 257
@@ -108,44 +112,73 @@ class ByteTokenizer:
         return "".join([chr(b) if b < 128 else '?' for b in input_ids if b < 256])


-class
+class ByteEncoderV61(nn.Module):
     """
-    5-Layer Encoder with
-    Layer
+    v6.1: 5-Layer Encoder with Layer-Specialized Architecture
+    Layer 0: 768d - Byte to character (with curriculum learning)
+    Layer 1: 896d - Language pattern discovery (no labels)
+    Layer 2: 1024d - Eojeol/Word formation (+ eojeol PE)
+    Layer 3: 1152d - Small phrase grouping (2-3 eojeols)
+    Layer 4: 1280d - Final refinement (+ context PE)
+
+    Target: eojeol (word unit) to phrase-level compression (3:1 ratio)
     """

     def __init__(
         self,
         vocab_size: int = 260,
-        hidden_dims: List[int] = [
-        num_heads: int =
+        hidden_dims: List[int] = [768, 896, 1024, 1152, 1280],  # v6.1 dimensions
+        num_heads: List[int] = [12, 14, 16, 18, 20],  # v6.1: Progressive heads per layer
         dropout: float = 0.1,
-        max_seq_len: int =
+        max_seq_len: int = 64  # v6.1.2: 64 chunk for compression-first
     ):
         super().__init__()

-        # Byte
+        # Layer 0: Byte to Character with Curriculum Learning
         self.byte_embedding = nn.Embedding(vocab_size, hidden_dims[0])
-
-        #
+
+        # v6.1: Multi-level boundary predictors for hierarchical segmentation
+        # Level 1: Character boundaries (UTF-8 multi-byte)
+        self.char_boundary_predictor = nn.Linear(hidden_dims[0], 3)  # 0: continue, 1: start, 2: end
+
+        # Level 2: Eojeol boundaries (space + particle analysis)
+        self.eojeol_boundary_predictor = nn.Linear(hidden_dims[2], 4)  # 0: inside, 1: space, 2: particle, 3: punct
+
+        # Level 3: Phrase boundaries (syntactic chunks)
+        self.phrase_boundary_predictor = nn.Linear(hidden_dims[3], 3)  # 0: inside, 1: weak boundary, 2: strong boundary
+
+        # v6.1: Positional encoding ONLY for Layer 0
         self.pos_encoding = PositionalEncoding(hidden_dims[0], max_seq_len, dropout)
-
+
+        # v6.1: Layer 1 - Language pattern discovery (no labels!)
+        self.pattern_discoverer = nn.Linear(hidden_dims[1], 256)  # Discover patterns autonomously (from 896d)
+        self.lang_signal_generator = nn.Linear(hidden_dims[1], 128)  # Generate language signals (from 896d)
+
+        # v6.1: Group-aware relative position encodings for Layer 2-4
+        self.group_pe_layer2 = nn.Embedding(max_seq_len, hidden_dims[2])  # For eojeol/word units
+        self.group_pe_layer3 = nn.Embedding(max_seq_len, hidden_dims[3])  # For small phrases (2-3 eojeols)
+        self.group_pe_layer4 = nn.Embedding(max_seq_len, hidden_dims[4])  # For context/discourse
+
         # 5 Transformer layers with dimension changes
         self.layers = nn.ModuleList()
         for i in range(len(hidden_dims)):
             input_dim = hidden_dims[i-1] if i > 0 else hidden_dims[0]
             output_dim = hidden_dims[i]
-
+
             # Projection layer if dimension changes
             if input_dim != output_dim:
                 proj = nn.Linear(input_dim, output_dim)
             else:
                 proj = None
-
+
+            # v6.1: Layer-specific head count for optimal dimension per head
+            # Target: 64-80 dim per head
+            layer_heads = num_heads[i] if isinstance(num_heads, list) else num_heads
+
             # Transformer encoder layer
             layer = nn.TransformerEncoderLayer(
                 d_model=output_dim,
-                nhead=
+                nhead=layer_heads,
                 dim_feedforward=output_dim * 4,
                 dropout=dropout,
                 activation='gelu',
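The progressive head counts in this hunk keep the per-head dimension fixed at 64, inside the stated 64-80 target; a one-line check of the values above:

    dims, heads = [768, 896, 1024, 1152, 1280], [12, 14, 16, 18, 20]
    print([d // h for d, h in zip(dims, heads)])  # [64, 64, 64, 64, 64]
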
@@ -164,13 +197,31 @@ class ByteEncoder(nn.Module):
     def forward(
         self,
         input_ids: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None
+        attention_mask: Optional[torch.Tensor] = None,
+        boundary_labels: Optional[torch.Tensor] = None,
+        epoch: int = 0
     ) -> Dict[str, torch.Tensor]:
-
+        """
+        v6.1 Forward pass with curriculum learning
+        Args:
+            boundary_labels: UTF-8 boundary labels for curriculum learning (training only)
+            epoch: Current epoch for curriculum schedule
+        """
+        batch_size, seq_len = input_ids.shape
+
+        # Layer 0: Byte embedding with curriculum learning
         x = self.byte_embedding(input_ids)
-
-        #
+
+        # v6.1: Positional encoding ONLY at Layer 0
         x = self.pos_encoding(x)
+
+        # v6.1: Predict character boundaries (Layer 0)
+        char_boundaries = self.char_boundary_predictor(x)
+
+        # v6.1: Curriculum learning for character boundaries
+        # Note: boundary_labels are eojeol boundaries (4 classes), not char boundaries (3 classes)
+        # So we don't mix them with char_boundaries - they serve different purposes
+        char_boundary_weights = F.softmax(char_boundaries, dim=-1)

         # Prepare attention mask
         if attention_mask is not None:
@@ -178,17 +229,46 @@ class ByteEncoder(nn.Module):
             # It expects shape (batch_size, seq_len) and handles masking internally
             pass

-        # Process through 5 layers
+        # v6.1: Process through 5 specialized layers
         all_hidden_states = []
-
-        for
+        discovered_patterns = None
+        eojeol_boundaries = None
+        phrase_boundaries = None
+
+        for i, layer_dict in enumerate(self.layers):
+            # Project if needed (before layer-specific processing)
             if layer_dict['projection'] is not None:
                 x = layer_dict['projection'](x)
-
+
+            # Layer 1: Add language signals (autonomous discovery)
+            if i == 1:
+                # Discover language patterns WITHOUT labels (x is now 896d)
+                discovered_patterns = self.pattern_discoverer(x)
+                lang_signals = self.lang_signal_generator(x)
+
+            # Layer 2: Predict eojeol boundaries and add position encoding
+            elif i == 2:
+                # Predict eojeol boundaries (spaces, particles, punctuation)
+                eojeol_boundaries = self.eojeol_boundary_predictor(x)
+                positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1)
+                group_pe = self.group_pe_layer2(positions)
+                x = x + group_pe * 0.1  # Mild addition to preserve main signal
+
+            # Layer 3: Predict phrase boundaries and add position encoding
+            elif i == 3:
+                # Predict phrase boundaries (weak/strong syntactic breaks)
+                phrase_boundaries = self.phrase_boundary_predictor(x)
+                positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1)
+                group_pe = self.group_pe_layer3(positions)
+                x = x + group_pe * 0.1
+
+            elif i == 4:
+                positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1)
+                group_pe = self.group_pe_layer4(positions)
+                x = x + group_pe * 0.1
+
             # Transformer layer - properly handle mask
             if attention_mask is not None:
-                # TransformerEncoderLayer expects key_padding_mask (batch, seq)
-                # where True means "ignore this position"
                 key_padding_mask = (attention_mask == 0)
                 x = layer_dict['transformer'](x, src_key_padding_mask=key_padding_mask)
             else:
@@ -207,7 +287,13 @@ class ByteEncoder(nn.Module):
         return {
             'last_hidden_state': x,
             'pooled_output': pooled,
-            'all_hidden_states': all_hidden_states
+            'all_hidden_states': all_hidden_states,
+            # v6.1 boundary predictions
+            'char_boundaries': char_boundaries,
+            'char_boundary_weights': char_boundary_weights,
+            'eojeol_boundaries': eojeol_boundaries,
+            'phrase_boundaries': phrase_boundaries,
+            'discovered_patterns': discovered_patterns
         }

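The encoder now returns per-position boundary logits alongside the hidden states. A sketch of how a caller might turn them into hard segment labels (random stand-in tensor in place of a real encoder output):

    import torch

    # Stand-in for encoder_outputs['eojeol_boundaries']: (batch, seq, 4 classes)
    eojeol_logits = torch.randn(2, 64, 4)
    labels = eojeol_logits.argmax(dim=-1)  # 0: inside, 1: space, 2: particle, 3: punct
    print(labels.shape)                    # torch.Size([2, 64])
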
@@ -217,15 +303,16 @@ class CrossAttention(nn.Module):
    Enhanced relation learning for connecting inference layers
    """

-    def __init__(self, hidden_dim: int =
+    def __init__(self, hidden_dim: int = 1280, num_heads: int = 20, dropout: float = 0.1):
         super().__init__()
-
+
+        # v6.1: Adjusted for 1280d (64 per head with 20 heads)
         self.cross_attn = nn.MultiheadAttention(
             hidden_dim, num_heads, dropout, batch_first=True
         )

-        # Enhanced relation classifier
-        # 0: identity, 1: similar, 2: different, 3: continuation
+        # v6.1: Enhanced relation classifier with reconstruction focus
+        # 0: identity (perfect reconstruction), 1: similar, 2: different, 3: continuation
         # 4: translation, 5: summary, 6: expansion, 7: contradiction
         self.relation_head = nn.Sequential(
             nn.Linear(hidden_dim * 2, hidden_dim),
@@ -236,6 +323,12 @@ class CrossAttention(nn.Module):
             nn.Dropout(dropout),
             nn.Linear(hidden_dim // 2, 8)
         )
+
+        # v6.1: Reconstruction-specific attention
+        # Use 10 heads for reconstruction (128 per head)
+        self.reconstruction_attn = nn.MultiheadAttention(
+            hidden_dim, 10, dropout * 0.5, batch_first=True
+        )

         # Gating mechanism for adaptive fusion
         self.gate = nn.Sequential(
@@ -274,13 +367,22 @@ class CrossAttention(nn.Module):

         # Residual connection
         attn_output = attn_output + query
-
+
+        # v6.1: Reconstruction-focused attention (reconstruction optimization)
+        recon_output, recon_weights = self.reconstruction_attn(
+            query_norm, query_norm, query_norm,  # Self-attention for consistency
+            key_padding_mask=(query_mask == 0) if query_mask is not None else None
+        )
+
+        # Combine cross and reconstruction attention
+        combined_attn = attn_output * 0.7 + recon_output * 0.3
+
         # Adaptive gating for fusion
         gate_input = torch.cat([query.mean(dim=1), key.mean(dim=1)], dim=-1)
         gate_weights = self.gate(gate_input).unsqueeze(1)
-
-        # Gated fusion: adaptively
-        fused_output = gate_weights *
+
+        # Gated fusion: adaptively modulate the attention output
+        fused_output = gate_weights * combined_attn + (1 - gate_weights) * query

         # Pool for relation classification
         query_pooled = query.mean(dim=1) if query_mask is None else \
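The fusion is a convex, per-example blend: fused = g * (0.7*cross + 0.3*recon) + (1 - g) * query, with g produced by the gate network. A toy-tensor sketch of the arithmetic (random stand-ins for the real activations, shapes only):

    import torch

    B, T, D = 2, 64, 1280
    query = torch.randn(B, T, D)
    combined_attn = torch.randn(B, T, D)                # 0.7*cross + 0.3*recon above
    gate_weights = torch.sigmoid(torch.randn(B, 1, D))  # stand-in for self.gate(...) output
    fused = gate_weights * combined_attn + (1 - gate_weights) * query
    print(fused.shape)  # torch.Size([2, 64, 1280])
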
@@ -295,8 +397,10 @@ class CrossAttention(nn.Module):
         return {
             'cross_attention': fused_output,  # Gated fusion output
             'attention_weights': attn_weights,
+            'reconstruction_weights': recon_weights,  # v6.1: reconstruction attention weights
             'relation_logits': relation_logits,
-            'gate_weights': gate_weights.squeeze(1)  # For analysis
+            'gate_weights': gate_weights.squeeze(1),  # For analysis
+            'reconstruction_score': F.softmax(relation_logits, dim=-1)[:, 0]  # identity probability (reconstruction quality)
         }

@@ -304,15 +408,15 @@ class TransformerDecoder(nn.Module):
     """
     Transformer Decoder with Positional Encoding
     """
-
+
     def __init__(
         self,
         vocab_size: int = 260,
-        hidden_dim: int =
-        num_heads: int =
-        num_layers: int =
+        hidden_dim: int = 1280,  # v6.1: Match final encoder dim
+        num_heads: int = 16,  # v6.1: 1280/16 = 80 per head
+        num_layers: int = 8,  # v6.1 FINAL: 8 layers for better reconstruction
         dropout: float = 0.1,
-        max_seq_len: int =
+        max_seq_len: int = 64  # v6.1.2: 64 chunk for compression-first
     ):
         super().__init__()

@@ -408,73 +512,87 @@ class TransformerDecoder(nn.Module):
         encoder_hidden: torch.Tensor,
         encoder_mask: Optional[torch.Tensor] = None,
         max_length: int = 128,
-        temperature: float = 1
-        top_k: int =
+        temperature: float = 0.1,  # the tokenizer generates conservatively (exact reconstruction)
+        top_k: int = 10,  # consider only the top 10 candidates
         top_p: float = 0.95
     ) -> torch.Tensor:
         batch_size = encoder_hidden.size(0)
         device = encoder_hidden.device
-
+
         # Start with BOS
         decoder_input_ids = torch.full((batch_size, 1), 257, device=device)
-
+
+        # Track which sequences are done
+        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
+
         for _ in range(max_length - 1):
             # Forward pass
             outputs = self.forward(encoder_hidden, decoder_input_ids, encoder_mask)
             next_token_logits = outputs['logits'][:, -1, :] / temperature
-
+
             # Top-k filtering
             if top_k > 0:
                 indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
                 next_token_logits[indices_to_remove] = float('-inf')
-
+
             # Top-p filtering
             if top_p < 1.0:
                 sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                 cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
-
+
                 sorted_indices_to_remove = cumulative_probs > top_p
                 sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                 sorted_indices_to_remove[..., 0] = 0
-
+
                 indices_to_remove = sorted_indices_to_remove.scatter(-1, sorted_indices, sorted_indices_to_remove)
                 next_token_logits[indices_to_remove] = float('-inf')
-
+
             # Sample
             probs = F.softmax(next_token_logits, dim=-1)
             next_tokens = torch.multinomial(probs, 1)
+
+            # For finished sequences, force PAD token
+            next_tokens[finished] = 256  # PAD token
+
             decoder_input_ids = torch.cat([decoder_input_ids, next_tokens], dim=-1)
-
-            #
-
+
+            # Update finished status
+            finished = finished | (next_tokens.squeeze(-1) == 258)  # Mark as finished if EOS
+
+            # Stop when all sequences are done
+            if finished.all():
                 break
-
+
         return decoder_input_ids


-class
+class IntelligentTokenizerModelV61(nn.Module):
     """
-    Complete Intelligent Tokenizer Model v6.
-
+    Complete Intelligent Tokenizer Model v6.1
+    Pure learning-based with curriculum learning
+    - No language labels during training
+    - Curriculum learning for boundaries
+    - Group-aware position encodings
     """

     def __init__(
         self,
         vocab_size: int = 260,
-        encoder_dims: List[int] = [
-
-
-
+        encoder_dims: List[int] = [768, 896, 1024, 1152, 1280],  # v6.1 dimensions
+        encoder_heads: List[int] = [12, 14, 16, 18, 20],  # v6.1: Optimal heads per layer
+        decoder_hidden: int = 1280,  # Match final encoder dim
+        decoder_heads: int = 16,  # v6.1: 80 per head for decoder
+        num_decoder_layers: int = 8,  # v6.1 FINAL: 8 layers for better reconstruction
         dropout: float = 0.1,
-        max_seq_len: int =
+        max_seq_len: int = 64  # v6.1.2: 64 chunk for compression-first
     ):
         super().__init__()
-
-        # Components
+
+        # v6.1 Components with optimized head counts
         self.tokenizer = ByteTokenizer(max_seq_len)
-        self.encoder =
-        self.decoder = TransformerDecoder(vocab_size, decoder_hidden,
-        self.cross_attention = CrossAttention(encoder_dims[-1],
+        self.encoder = ByteEncoderV61(vocab_size, encoder_dims, encoder_heads, dropout, max_seq_len)
+        self.decoder = TransformerDecoder(vocab_size, decoder_hidden, decoder_heads, num_decoder_layers, dropout, max_seq_len)
+        self.cross_attention = CrossAttention(encoder_dims[-1], 20, dropout)  # 20 heads for 1280d

     def forward(
         self,
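generate() combines low temperature, top-k, and top-p (nucleus) filtering before sampling. A self-contained toy run of the same filtering steps on a 5-way distribution:

    import torch
    import torch.nn.functional as F

    logits = torch.tensor([[2.0, 1.0, 0.5, -1.0, -3.0]])
    top_k, top_p = 3, 0.95

    # Top-k: drop everything below the k-th largest logit
    kth = torch.topk(logits, top_k)[0][..., -1, None]
    logits = logits.masked_fill(logits < kth, float("-inf"))

    # Top-p: drop the tail once cumulative probability exceeds top_p
    sorted_logits, sorted_idx = torch.sort(logits, descending=True)
    cum = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    remove = cum > top_p
    remove[..., 1:] = remove[..., :-1].clone()
    remove[..., 0] = False
    logits = logits.masked_fill(remove.scatter(-1, sorted_idx, remove), float("-inf"))

    print(F.softmax(logits, dim=-1))  # probability mass only on the surviving candidates
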
@@ -483,6 +601,8 @@ class IntelligentTokenizerModel(nn.Module):
         attention_mask: Optional[torch.Tensor] = None,
         decoder_input_ids: Optional[torch.Tensor] = None,
         labels: Optional[torch.Tensor] = None,
+        boundary_labels: Optional[torch.Tensor] = None,  # v6.1: for curriculum learning
+        epoch: int = 0,  # v6.1: for curriculum schedule
         use_cross_attention: bool = True
     ) -> Dict[str, torch.Tensor]:
         # Tokenize if text input
@@ -495,13 +615,24 @@ class IntelligentTokenizerModel(nn.Module):
         batch_size, seq_len = input_ids.shape
         device = input_ids.device

-        # Encode
-        encoder_outputs = self.encoder(input_ids, attention_mask)
-        encoder_hidden = encoder_outputs['last_hidden_state']  # [batch, seq,
-
-        # dimension check
-        assert encoder_hidden.size(-1) ==
-
+        # v6.1: Encode with curriculum learning
+        encoder_outputs = self.encoder(input_ids, attention_mask, boundary_labels, epoch)
+        encoder_hidden = encoder_outputs['last_hidden_state']  # v6.1: [batch, seq, 1280]
+
+        # v6.1: dimension check - the final dim must be 1280
+        assert encoder_hidden.size(-1) == 1280, f"Encoder dim mismatch: {encoder_hidden.size(-1)}"
+
+        # Prepare decoder input for teacher forcing during training
+        if decoder_input_ids is None:
+            if labels is not None:
+                # During training, use shifted labels as decoder input (teacher forcing)
+                # Add BOS at the beginning and remove last token
+                bos_tokens = torch.full((batch_size, 1), self.tokenizer.BOS, device=labels.device, dtype=labels.dtype)
+                decoder_input_ids = torch.cat([bos_tokens, labels[:, :-1]], dim=1)
+            else:
+                # For inference/test, start with BOS token
+                decoder_input_ids = torch.full((batch_size, 1), self.tokenizer.BOS, device=device, dtype=torch.long)
+
         # Decode
         decoder_outputs = self.decoder(
             encoder_hidden,
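Teacher forcing builds the decoder input by shifting the labels one step right behind a BOS token, so position t predicts label t. A toy illustration with byte IDs:

    import torch

    BOS = 257
    labels = torch.tensor([[72, 105, 258]])  # "Hi" + EOS as byte IDs
    bos = torch.full((1, 1), BOS, dtype=labels.dtype)
    decoder_input = torch.cat([bos, labels[:, :-1]], dim=1)
    print(decoder_input)  # tensor([[257,  72, 105]]); each step predicts the next label
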
@@ -542,25 +673,47 @@ class IntelligentTokenizerModel(nn.Module):
                 decoder_outputs['logits'].reshape(-1, decoder_outputs['logits'].size(-1)),
                 labels.reshape(-1)
             )
-
+
+            # Boundary loss (if boundary labels provided)
+            boundary_loss = 0
+            if boundary_labels is not None and encoder_outputs.get('eojeol_boundaries') is not None:
+                # Eojeol boundary loss
+                eojeol_boundaries = encoder_outputs['eojeol_boundaries']  # [batch, seq, 4]
+                if eojeol_boundaries.size(1) == boundary_labels.size(1):
+                    # Ensure boundary labels are in valid range (0-3)
+                    # Clamp to valid range to prevent CUDA errors
+                    boundary_labels_clamped = torch.clamp(boundary_labels, min=0, max=3)
+
+                    boundary_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)  # Use -1 for padding
+                    boundary_loss = boundary_loss_fct(
+                        eojeol_boundaries.reshape(-1, 4),
+                        boundary_labels_clamped.reshape(-1)
+                    ) * 0.5  # Weight for boundary loss
+
             # Relation loss (if cross-attention used)
             relation_loss = 0
             if relation_logits is not None:
                 # Self-relation must be identity (class 0)
                 batch_identity = torch.zeros(batch_size, dtype=torch.long, device=device)
                 relation_loss = F.cross_entropy(relation_logits, batch_identity) * 0.1
-
-            loss = recon_loss + relation_loss
+
+            loss = recon_loss + boundary_loss + relation_loss

         return {
             'loss': loss,
             'logits': decoder_outputs['logits'],
+            'decoder_logits': decoder_outputs['logits'],  # Add for compatibility
             'encoder_hidden_states': encoder_hidden,
             'decoder_hidden_states': decoder_hidden,
             'pooled_output': encoder_outputs['pooled_output'],
             'cross_attention': cross_attn_outputs['cross_attention'] if cross_attn_outputs else None,
             'relation_logits': relation_logits,
-            'all_encoder_states': encoder_outputs.get('all_hidden_states', None)
+            'all_encoder_states': encoder_outputs.get('all_hidden_states', None),
+            # Add boundary predictions for visualization
+            'char_boundaries': encoder_outputs.get('char_boundaries'),
+            'eojeol_boundaries': encoder_outputs.get('eojeol_boundaries'),
+            'phrase_boundaries': encoder_outputs.get('phrase_boundaries'),
+            'discovered_patterns': encoder_outputs.get('discovered_patterns')
         }

     def encode_text(self, text: str) -> torch.Tensor:
|