ggunio committed
Commit c2e3f6e · 1 Parent(s): 6c26802

Fix import error by adding core module files

Files changed (3):
  1. app.py +1 -3
  2. core/byte_tokenizer_v6.py +298 -0
  3. core/unified_model.py +233 -80
app.py CHANGED
@@ -13,9 +13,7 @@ import time
 from typing import List, Tuple, Dict, Generator
 # Removed matplotlib imports - using text display instead
 
-# Add parent directories to path
-parent_dir = Path(__file__).parent.parent.parent
-sys.path.insert(0, str(parent_dir / 'intelligent-tokenizer_v6.1.2'))
+# Import from local core directory
 from core.unified_model import IntelligentTokenizerModelV61
 from core.byte_tokenizer_v6 import ByteTokenizerV6
 
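With core/byte_tokenizer_v6.py and core/unified_model.py now committed next to app.py, the plain "from core ..." imports resolve from the repository root and the sys.path manipulation is no longer needed. A minimal sketch of how the fixed imports are exercised (illustrative only; checkpoint loading is omitted and not part of this commit):

# Assumed layout: app.py, core/byte_tokenizer_v6.py, core/unified_model.py (run from the repo root)
from core.byte_tokenizer_v6 import ByteTokenizerV6
from core.unified_model import IntelligentTokenizerModelV61

tokenizer = ByteTokenizerV6(max_seq_len=64)   # v6.1.2 default chunk size
model = IntelligentTokenizerModelV61()        # randomly initialized here; the real app loads trained weights
batch = tokenizer.encode_batch(["Hello World!"])
print(batch['input_ids'].shape)               # torch.Size([1, 14]): 12 bytes + BOS + EOS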
core/byte_tokenizer_v6.py ADDED
@@ -0,0 +1,298 @@
+"""
+Byte-Level Tokenizer V6.1.2 - Compression-First Learning
+No vocabulary, no language rules - just bytes
+"""
+
+import torch
+from typing import List, Dict, Union, Optional
+import numpy as np
+
+
+class ByteTokenizerV6:
+    """
+    Pure byte-level tokenizer
+    - No vocabulary needed (bytes are 0-255)
+    - No language-specific rules
+    - Model learns all patterns from data
+    """
+
+    def __init__(self, max_seq_len: int = 64):
+        """Initialize byte tokenizer"""
+
+        self.max_seq_len = max_seq_len
+
+        # Special tokens (beyond byte range 0-255)
+        self.PAD = 256
+        self.BOS = 257
+        self.EOS = 258
+        self.MASK = 259
+
+        # Total vocabulary size = 256 bytes + 4 special tokens
+        self.vocab_size = 260
+
+        print(f"Byte tokenizer initialized (vocab_size={self.vocab_size})")
+
+    def encode(self, text: str, add_special_tokens: bool = True) -> Dict:
+        """
+        Encode text to byte IDs
+
+        Args:
+            text: Input text
+            add_special_tokens: Whether to add BOS/EOS
+
+        Returns:
+            dict with 'input_ids', 'attention_mask', 'length'
+        """
+        # Convert text to UTF-8 bytes (pure bytes, no rules)
+        byte_sequence = list(text.encode('utf-8'))
+
+        # Truncate if necessary
+        max_len = self.max_seq_len - 2 if add_special_tokens else self.max_seq_len
+        if len(byte_sequence) > max_len:
+            byte_sequence = byte_sequence[:max_len]
+
+        # Add special tokens
+        if add_special_tokens:
+            input_ids = [self.BOS] + byte_sequence + [self.EOS]
+        else:
+            input_ids = byte_sequence
+
+        # Create attention mask (1 for real tokens, 0 for padding)
+        attention_mask = [1] * len(input_ids)
+
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'length': len(input_ids)
+        }
+
+    def encode_batch(self, texts: List[str], add_special_tokens: bool = True) -> Dict:
+        """
+        Encode multiple texts with padding
+
+        Args:
+            texts: List of input texts
+            add_special_tokens: Whether to add special tokens
+
+        Returns:
+            Batched tensors with padding
+        """
+        encoded_texts = []
+        max_length = 0
+
+        # Encode each text
+        for text in texts:
+            encoded = self.encode(text, add_special_tokens)
+            encoded_texts.append(encoded)
+            max_length = max(max_length, encoded['length'])
+
+        # Limit to max sequence length
+        max_length = min(max_length, self.max_seq_len)
+
+        # Initialize batch tensors
+        batch_size = len(texts)
+        input_ids = np.full((batch_size, max_length), self.PAD, dtype=np.int64)
+        attention_mask = np.zeros((batch_size, max_length), dtype=np.float32)
+
+        # Fill batch tensors
+        for i, encoded in enumerate(encoded_texts):
+            seq_len = min(encoded['length'], max_length)
+            input_ids[i, :seq_len] = encoded['input_ids'][:seq_len]
+            attention_mask[i, :seq_len] = 1.0
+
+        return {
+            'input_ids': torch.tensor(input_ids, dtype=torch.long),
+            'attention_mask': torch.tensor(attention_mask, dtype=torch.float32),
+            'lengths': torch.tensor([e['length'] for e in encoded_texts], dtype=torch.long)
+        }
+
+    def decode(self, input_ids: Union[List[int], torch.Tensor, np.ndarray],
+               skip_special_tokens: bool = True) -> str:
+        """
+        Decode byte IDs back to text
+
+        Args:
+            input_ids: Byte ID sequence
+            skip_special_tokens: Whether to skip special tokens
+
+        Returns:
+            Decoded text string
+        """
+        # Convert to list if needed
+        if isinstance(input_ids, torch.Tensor):
+            input_ids = input_ids.cpu().numpy().tolist()
+        elif isinstance(input_ids, np.ndarray):
+            input_ids = input_ids.tolist()
+
+        # Filter special tokens if requested
+        if skip_special_tokens:
+            # Only keep actual bytes (0-255)
+            input_ids = [b for b in input_ids if 0 <= b <= 255]
+        else:
+            # Replace special tokens with readable markers
+            processed = []
+            for b in input_ids:
+                if b == self.PAD:
+                    continue  # Skip padding
+                elif b == self.BOS:
+                    processed.append(ord('['))  # Use [ for BOS
+                elif b == self.EOS:
+                    processed.append(ord(']'))  # Use ] for EOS
+                elif b == self.MASK:
+                    processed.append(ord('*'))  # Use * for MASK
+                elif 0 <= b <= 255:
+                    processed.append(b)
+            input_ids = processed
+
+        # Convert bytes to text
+        if not input_ids:
+            return ""
+
+        try:
+            # Keep only valid UTF-8 sequences
+            valid_bytes = []
+            i = 0
+            while i < len(input_ids):
+                b = input_ids[i]
+                if b < 128:  # ASCII
+                    valid_bytes.append(b)
+                    i += 1
+                elif 192 <= b < 224:  # 2-byte UTF-8
+                    if i + 1 < len(input_ids) and 128 <= input_ids[i+1] < 192:
+                        valid_bytes.extend(input_ids[i:i+2])
+                        i += 2
+                    else:
+                        i += 1  # Skip invalid
+                elif 224 <= b < 240:  # 3-byte UTF-8
+                    if i + 2 < len(input_ids) and all(128 <= input_ids[j] < 192 for j in range(i+1, min(i+3, len(input_ids)))):
+                        valid_bytes.extend(input_ids[i:i+3])
+                        i += 3
+                    else:
+                        i += 1  # Skip invalid
+                elif 240 <= b < 248:  # 4-byte UTF-8
+                    if i + 3 < len(input_ids) and all(128 <= input_ids[j] < 192 for j in range(i+1, min(i+4, len(input_ids)))):
+                        valid_bytes.extend(input_ids[i:i+4])
+                        i += 4
+                    else:
+                        i += 1  # Skip invalid
+                else:
+                    i += 1  # Skip invalid byte
+
+            # Decode valid bytes
+            if valid_bytes:
+                byte_array = bytes(valid_bytes)
+                text = byte_array.decode('utf-8', errors='replace')  # use errors='replace' instead of raising
+                return text
+            else:
+                return ""
+        except Exception as e:
+            # Fallback: convert ASCII only
+            return "".join([chr(b) if b < 128 else '' for b in input_ids])
+
+    def decode_batch(self, input_ids: torch.Tensor, skip_special_tokens: bool = True) -> List[str]:
+        """
+        Decode a batch of byte sequences
+
+        Args:
+            input_ids: Batch of byte IDs (batch_size, seq_len)
+            skip_special_tokens: Whether to skip special tokens
+
+        Returns:
+            List of decoded texts
+        """
+        texts = []
+        for i in range(input_ids.shape[0]):
+            text = self.decode(input_ids[i], skip_special_tokens)
+            texts.append(text)
+        return texts
+
+    def tokenize(self, text: str) -> List[int]:
+        """
+        Simple tokenization to byte IDs (no special tokens)
+
+        Args:
+            text: Input text
+
+        Returns:
+            List of byte IDs
+        """
+        return list(text.encode('utf-8'))
+
+    def detokenize(self, byte_ids: List[int]) -> str:
+        """
+        Simple detokenization from byte IDs
+
+        Args:
+            byte_ids: List of byte IDs
+
+        Returns:
+            Decoded text
+        """
+        try:
+            return bytes(byte_ids).decode('utf-8', errors='replace')
+        except Exception:
+            return "".join([chr(b) if b < 128 else '?' for b in byte_ids])
+
+    def get_vocab_size(self) -> int:
+        """Get vocabulary size"""
+        return self.vocab_size
+
+    def get_special_tokens(self) -> Dict[str, int]:
+        """Get special token IDs"""
+        return {
+            'pad_id': self.PAD,
+            'bos_id': self.BOS,
+            'eos_id': self.EOS,
+            'mask_id': self.MASK
+        }
+
+
+# Test code
+if __name__ == "__main__":
+    # Initialize tokenizer
+    tokenizer = ByteTokenizerV6()
+
+    # Test texts in multiple languages
+    test_texts = [
+        "Hello World!",
+        "안녕하세요",
+        "你好世界",
+        "こんにちは",
+        "مرحبا بالعالم",
+        "Здравствуй мир"
+    ]
+
+    print("=" * 50)
+    print("Single Text Encoding/Decoding Test")
+    print("=" * 50)
+
+    for text in test_texts:
+        print(f"\nOriginal: {text}")
+
+        # Encode
+        encoded = tokenizer.encode(text)
+        print(f"Encoded length: {encoded['length']}")
+        print(f"First 10 bytes: {encoded['input_ids'][:10]}")
+
+        # Decode
+        decoded = tokenizer.decode(encoded['input_ids'])
+        print(f"Decoded: {decoded}")
+        print(f"Match: {decoded == text}")
+
+    print("\n" + "=" * 50)
+    print("Batch Encoding/Decoding Test")
+    print("=" * 50)
+
+    # Batch test
+    batch_result = tokenizer.encode_batch(test_texts)
+    print(f"Batch shape: {batch_result['input_ids'].shape}")
+    print(f"Attention mask shape: {batch_result['attention_mask'].shape}")
+
+    # Decode batch
+    decoded_texts = tokenizer.decode_batch(batch_result['input_ids'])
+    print("\nBatch decoding results:")
+    for orig, dec in zip(test_texts, decoded_texts):
+        print(f"Original: {orig}")
+        print(f"Decoded: {dec}")
+        print(f"Match: {orig == dec}")
+        print()
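A minimal sketch (not part of the committed file) of why ByteTokenizerV6.decode validates UTF-8 continuation bytes before decoding: truncating a 64-byte chunk can split a multi-byte character, and the filter drops the dangling lead byte instead of emitting replacement-character noise.

# Two 3-byte Hangul characters; a chunk boundary cuts the second one in half.
ids = list("가나".encode("utf-8"))        # 6 bytes
truncated = ids[:4]                        # '가' plus one orphan lead byte
print(bytes(truncated).decode("utf-8", errors="replace"))   # '가' followed by U+FFFD
# ByteTokenizerV6.decode() keeps only complete sequences, so it returns just '가'.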
core/unified_model.py CHANGED
@@ -1,6 +1,10 @@
 """
-Unified Intelligent Tokenizer Model v6.0
-Pure learning-based - all core code integrated
+Unified Intelligent Tokenizer Model v6.1.2
+Compression-First Learning with Adaptive Splitting
+- 64 byte chunks for aggressive compression
+- 50 epoch checkpoints with automatic splitting
+- Group relation learning for reconstruction
+- Boundary adjustment for semantic units
 """
 
 import torch
@@ -48,7 +52,7 @@ class ByteTokenizer:
    Pure byte-level tokenizer - no language rules
    """
 
-    def __init__(self, max_seq_len: int = 512):
+    def __init__(self, max_seq_len: int = 64):  # v6.1.2: 64 bytes for the compression-first approach
        self.max_seq_len = max_seq_len
        self.PAD = 256
        self.BOS = 257
@@ -108,44 +112,73 @@ class ByteTokenizer:
        return "".join([chr(b) if b < 128 else '?' for b in input_ids if b < 256])
 
 
-class ByteEncoder(nn.Module):
+class ByteEncoderV61(nn.Module):
    """
-    5-Layer Encoder with Positional Encoding
-    Layer dimensions: [384, 384, 512, 640, 768] - modified
+    v6.1: 5-Layer Encoder with Layer-Specialized Architecture
+    Layer 0: 768d - Byte to character (with curriculum learning)
+    Layer 1: 896d - Language pattern discovery (no labels)
+    Layer 2: 1024d - Eojeol/Word formation (+ eojeol PE)
+    Layer 3: 1152d - Small phrase grouping (2-3 eojeols)
+    Layer 4: 1280d - Final refinement (+ context PE)
+
+    Target: eojeol (word-unit) to phrase level compression (3:1 ratio)
    """
-
+
    def __init__(
        self,
        vocab_size: int = 260,
-        hidden_dims: List[int] = [384, 384, 512, 640, 768],  # 512 added
-        num_heads: int = 8,
+        hidden_dims: List[int] = [768, 896, 1024, 1152, 1280],  # v6.1 dimensions
+        num_heads: List[int] = [12, 14, 16, 18, 20],  # v6.1: Progressive heads per layer
        dropout: float = 0.1,
-        max_seq_len: int = 512
+        max_seq_len: int = 64  # v6.1.2: 64-byte chunk for compression-first
    ):
        super().__init__()
 
-        # Byte embedding
+        # Layer 0: Byte to Character with Curriculum Learning
        self.byte_embedding = nn.Embedding(vocab_size, hidden_dims[0])
-
-        # Positional encoding (Sinusoidal)
+
+        # v6.1: Multi-level boundary predictors for hierarchical segmentation
+        # Level 1: Character boundaries (UTF-8 multi-byte)
+        self.char_boundary_predictor = nn.Linear(hidden_dims[0], 3)  # 0: continue, 1: start, 2: end
+
+        # Level 2: Eojeol boundaries (space + particle analysis)
+        self.eojeol_boundary_predictor = nn.Linear(hidden_dims[2], 4)  # 0: inside, 1: space, 2: particle, 3: punct
+
+        # Level 3: Phrase boundaries (syntactic chunks)
+        self.phrase_boundary_predictor = nn.Linear(hidden_dims[3], 3)  # 0: inside, 1: weak boundary, 2: strong boundary
+
+        # v6.1: Positional encoding ONLY for Layer 0
        self.pos_encoding = PositionalEncoding(hidden_dims[0], max_seq_len, dropout)
-
+
+        # v6.1: Layer 1 - Language pattern discovery (no labels!)
+        self.pattern_discoverer = nn.Linear(hidden_dims[1], 256)  # Discover patterns autonomously (from 896d)
+        self.lang_signal_generator = nn.Linear(hidden_dims[1], 128)  # Generate language signals (from 896d)
+
+        # v6.1: Group-aware relative position encodings for Layers 2-4
+        self.group_pe_layer2 = nn.Embedding(max_seq_len, hidden_dims[2])  # For eojeol/word units
+        self.group_pe_layer3 = nn.Embedding(max_seq_len, hidden_dims[3])  # For small phrases (2-3 eojeols)
+        self.group_pe_layer4 = nn.Embedding(max_seq_len, hidden_dims[4])  # For context/discourse
+
        # 5 Transformer layers with dimension changes
        self.layers = nn.ModuleList()
        for i in range(len(hidden_dims)):
            input_dim = hidden_dims[i-1] if i > 0 else hidden_dims[0]
            output_dim = hidden_dims[i]
-
+
            # Projection layer if dimension changes
            if input_dim != output_dim:
                proj = nn.Linear(input_dim, output_dim)
            else:
                proj = None
-
+
+            # v6.1: Layer-specific head count for optimal dimension per head
+            # Target: 64-80 dim per head
+            layer_heads = num_heads[i] if isinstance(num_heads, list) else num_heads
+
            # Transformer encoder layer
            layer = nn.TransformerEncoderLayer(
                d_model=output_dim,
-                nhead=num_heads,
+                nhead=layer_heads,
                dim_feedforward=output_dim * 4,
                dropout=dropout,
                activation='gelu',
@@ -164,13 +197,31 @@ class ByteEncoder:
    def forward(
        self,
        input_ids: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None
+        attention_mask: Optional[torch.Tensor] = None,
+        boundary_labels: Optional[torch.Tensor] = None,
+        epoch: int = 0
    ) -> Dict[str, torch.Tensor]:
-        # Embed bytes
+        """
+        v6.1 Forward pass with curriculum learning
+        Args:
+            boundary_labels: UTF-8 boundary labels for curriculum learning (training only)
+            epoch: Current epoch for curriculum schedule
+        """
+        batch_size, seq_len = input_ids.shape
+
+        # Layer 0: Byte embedding with curriculum learning
        x = self.byte_embedding(input_ids)
-
-        # Add positional encoding
+
+        # v6.1: Positional encoding ONLY at Layer 0
        x = self.pos_encoding(x)
+
+        # v6.1: Predict character boundaries (Layer 0)
+        char_boundaries = self.char_boundary_predictor(x)
+
+        # v6.1: Curriculum learning for character boundaries
+        # Note: boundary_labels are eojeol boundaries (4 classes), not char boundaries (3 classes)
+        # So we don't mix them with char_boundaries - they serve different purposes
+        char_boundary_weights = F.softmax(char_boundaries, dim=-1)
 
        # Prepare attention mask
        if attention_mask is not None:
@@ -178,17 +229,46 @@ class ByteEncoder:
            # It expects shape (batch_size, seq_len) and handles masking internally
            pass
 
-        # Process through 5 layers
+        # v6.1: Process through 5 specialized layers
        all_hidden_states = []
-        for layer_dict in self.layers:
-            # Project if needed
+        discovered_patterns = None
+        eojeol_boundaries = None
+        phrase_boundaries = None
+
+        for i, layer_dict in enumerate(self.layers):
+            # Project if needed (before layer-specific processing)
            if layer_dict['projection'] is not None:
                x = layer_dict['projection'](x)
-
+
+            # Layer 1: Add language signals (autonomous discovery)
+            if i == 1:
+                # Discover language patterns WITHOUT labels (x is now 896d)
+                discovered_patterns = self.pattern_discoverer(x)
+                lang_signals = self.lang_signal_generator(x)
+
+            # Layer 2: Predict eojeol boundaries and add position encoding
+            elif i == 2:
+                # Predict eojeol boundaries (spaces, particles, punctuation)
+                eojeol_boundaries = self.eojeol_boundary_predictor(x)
+                positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1)
+                group_pe = self.group_pe_layer2(positions)
+                x = x + group_pe * 0.1  # Mild addition to preserve main signal
+
+            # Layer 3: Predict phrase boundaries and add position encoding
+            elif i == 3:
+                # Predict phrase boundaries (weak/strong syntactic breaks)
+                phrase_boundaries = self.phrase_boundary_predictor(x)
+                positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1)
+                group_pe = self.group_pe_layer3(positions)
+                x = x + group_pe * 0.1
+
+            elif i == 4:
+                positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1)
+                group_pe = self.group_pe_layer4(positions)
+                x = x + group_pe * 0.1
+
            # Transformer layer - properly handle mask
            if attention_mask is not None:
-                # TransformerEncoderLayer expects key_padding_mask (batch, seq)
-                # where True means "ignore this position"
                key_padding_mask = (attention_mask == 0)
                x = layer_dict['transformer'](x, src_key_padding_mask=key_padding_mask)
            else:
@@ -207,7 +287,13 @@ class ByteEncoder:
        return {
            'last_hidden_state': x,
            'pooled_output': pooled,
-            'all_hidden_states': all_hidden_states
+            'all_hidden_states': all_hidden_states,
+            # v6.1 boundary predictions
+            'char_boundaries': char_boundaries,
+            'char_boundary_weights': char_boundary_weights,
+            'eojeol_boundaries': eojeol_boundaries,
+            'phrase_boundaries': phrase_boundaries,
+            'discovered_patterns': discovered_patterns
        }
 
 
@@ -217,15 +303,16 @@ class CrossAttention(nn.Module):
    Enhanced relation learning for connecting the reasoning layers
    """
 
-    def __init__(self, hidden_dim: int = 768, num_heads: int = 8, dropout: float = 0.1):
+    def __init__(self, hidden_dim: int = 1280, num_heads: int = 20, dropout: float = 0.1):
        super().__init__()
-
+
+        # v6.1: Adjusted for 1280d (64 per head with 20 heads)
        self.cross_attn = nn.MultiheadAttention(
            hidden_dim, num_heads, dropout, batch_first=True
        )
 
-        # Enhanced relation classifier (8 types for richer relations)
-        # 0: identity, 1: similar, 2: different, 3: continuation
+        # v6.1: Enhanced relation classifier with reconstruction focus
+        # 0: identity (perfect reconstruction), 1: similar, 2: different, 3: continuation
        # 4: translation, 5: summary, 6: expansion, 7: contradiction
        self.relation_head = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
@@ -236,6 +323,12 @@ class CrossAttention(nn.Module):
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 8)
        )
+
+        # v6.1: Reconstruction-specific attention
+        # Use 10 heads for reconstruction (128 per head)
+        self.reconstruction_attn = nn.MultiheadAttention(
+            hidden_dim, 10, dropout * 0.5, batch_first=True
+        )
 
        # Gating mechanism for adaptive fusion
        self.gate = nn.Sequential(
@@ -274,13 +367,22 @@ class CrossAttention(nn.Module):
 
        # Residual connection
        attn_output = attn_output + query
-
+
+        # v6.1: Reconstruction-focused attention
+        recon_output, recon_weights = self.reconstruction_attn(
+            query_norm, query_norm, query_norm,  # Self-attention for consistency
+            key_padding_mask=(query_mask == 0) if query_mask is not None else None
+        )
+
+        # Combine cross and reconstruction attention
+        combined_attn = attn_output * 0.7 + recon_output * 0.3
+
        # Adaptive gating for fusion
        gate_input = torch.cat([query.mean(dim=1), key.mean(dim=1)], dim=-1)
        gate_weights = self.gate(gate_input).unsqueeze(1)
-
-        # Gated fusion: adaptively modulate the cross-attention output
-        fused_output = gate_weights * attn_output + (1 - gate_weights) * query
+
+        # Gated fusion: adaptively modulate the combined attention output
+        fused_output = gate_weights * combined_attn + (1 - gate_weights) * query
 
        # Pool for relation classification
        query_pooled = query.mean(dim=1) if query_mask is None else \
@@ -295,8 +397,10 @@ class CrossAttention(nn.Module):
        return {
            'cross_attention': fused_output,  # Gated fusion output
            'attention_weights': attn_weights,
+            'reconstruction_weights': recon_weights,  # v6.1: reconstruction attention weights
            'relation_logits': relation_logits,
-            'gate_weights': gate_weights.squeeze(1)  # For analysis
+            'gate_weights': gate_weights.squeeze(1),  # For analysis
+            'reconstruction_score': F.softmax(relation_logits, dim=-1)[:, 0]  # identity probability (reconstruction fidelity)
        }
 
 
@@ -304,15 +408,15 @@ class TransformerDecoder(nn.Module):
    """
    Transformer Decoder with Positional Encoding
    """
-
+
    def __init__(
        self,
        vocab_size: int = 260,
-        hidden_dim: int = 768,
-        num_heads: int = 8,
-        num_layers: int = 6,
+        hidden_dim: int = 1280,  # v6.1: Match final encoder dim
+        num_heads: int = 16,  # v6.1: 1280/16 = 80 per head
+        num_layers: int = 8,  # v6.1 FINAL: 8 layers for better reconstruction
        dropout: float = 0.1,
-        max_seq_len: int = 512
+        max_seq_len: int = 64  # v6.1.2: 64-byte chunk for compression-first
    ):
        super().__init__()
 
@@ -408,73 +512,87 @@ class TransformerDecoder(nn.Module):
        encoder_hidden: torch.Tensor,
        encoder_mask: Optional[torch.Tensor] = None,
        max_length: int = 128,
-        temperature: float = 1.0,
-        top_k: int = 50,
+        temperature: float = 0.1,  # the tokenizer generates conservatively (exact reconstruction)
+        top_k: int = 10,  # consider only the top 10 candidates
        top_p: float = 0.95
    ) -> torch.Tensor:
        batch_size = encoder_hidden.size(0)
        device = encoder_hidden.device
-
+
        # Start with BOS
        decoder_input_ids = torch.full((batch_size, 1), 257, device=device)
-
+
+        # Track which sequences are done
+        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
+
        for _ in range(max_length - 1):
            # Forward pass
            outputs = self.forward(encoder_hidden, decoder_input_ids, encoder_mask)
            next_token_logits = outputs['logits'][:, -1, :] / temperature
-
+
            # Top-k filtering
            if top_k > 0:
                indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
                next_token_logits[indices_to_remove] = float('-inf')
-
+
            # Top-p filtering
            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
-
+
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0
-
+
                indices_to_remove = sorted_indices_to_remove.scatter(-1, sorted_indices, sorted_indices_to_remove)
                next_token_logits[indices_to_remove] = float('-inf')
-
+
            # Sample
            probs = F.softmax(next_token_logits, dim=-1)
            next_tokens = torch.multinomial(probs, 1)
+
+            # For finished sequences, force PAD token
+            next_tokens[finished] = 256  # PAD token
+
            decoder_input_ids = torch.cat([decoder_input_ids, next_tokens], dim=-1)
-
-            # Stop at EOS
-            if (next_tokens == 258).all():  # EOS token
+
+            # Update finished status
+            finished = finished | (next_tokens.squeeze(-1) == 258)  # Mark as finished if EOS
+
+            # Stop when all sequences are done
+            if finished.all():
                break
-
+
        return decoder_input_ids
 
 
-class IntelligentTokenizerModel(nn.Module):
+class IntelligentTokenizerModelV61(nn.Module):
    """
-    Complete Intelligent Tokenizer Model v6.0
-    Unified model - Encoder + Decoder + Cross-Attention
+    Complete Intelligent Tokenizer Model v6.1
+    Pure learning-based with curriculum learning
+    - No language labels during training
+    - Curriculum learning for boundaries
+    - Group-aware position encodings
    """
-
+
    def __init__(
        self,
        vocab_size: int = 260,
-        encoder_dims: List[int] = [384, 384, 512, 640, 768],  # 512 added
-        decoder_hidden: int = 768,
-        num_heads: int = 8,
-        num_decoder_layers: int = 6,
+        encoder_dims: List[int] = [768, 896, 1024, 1152, 1280],  # v6.1 dimensions
+        encoder_heads: List[int] = [12, 14, 16, 18, 20],  # v6.1: Optimal heads per layer
+        decoder_hidden: int = 1280,  # Match final encoder dim
+        decoder_heads: int = 16,  # v6.1: 80 per head for decoder
+        num_decoder_layers: int = 8,  # v6.1 FINAL: 8 layers for better reconstruction
        dropout: float = 0.1,
-        max_seq_len: int = 512
+        max_seq_len: int = 64  # v6.1.2: 64-byte chunk for compression-first
    ):
        super().__init__()
-
-        # Components
+
+        # v6.1 Components with optimized head counts
        self.tokenizer = ByteTokenizer(max_seq_len)
-        self.encoder = ByteEncoder(vocab_size, encoder_dims, num_heads, dropout, max_seq_len)
-        self.decoder = TransformerDecoder(vocab_size, decoder_hidden, num_heads, num_decoder_layers, dropout, max_seq_len)
-        self.cross_attention = CrossAttention(encoder_dims[-1], num_heads, dropout)
+        self.encoder = ByteEncoderV61(vocab_size, encoder_dims, encoder_heads, dropout, max_seq_len)
+        self.decoder = TransformerDecoder(vocab_size, decoder_hidden, decoder_heads, num_decoder_layers, dropout, max_seq_len)
+        self.cross_attention = CrossAttention(encoder_dims[-1], 20, dropout)  # 20 heads for 1280d
 
    def forward(
        self,
@@ -483,6 +601,8 @@ class IntelligentTokenizerModel(nn.Module):
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
+        boundary_labels: Optional[torch.Tensor] = None,  # v6.1: for curriculum learning
+        epoch: int = 0,  # v6.1: for curriculum schedule
        use_cross_attention: bool = True
    ) -> Dict[str, torch.Tensor]:
        # Tokenize if text input
@@ -495,13 +615,24 @@ class IntelligentTokenizerModel(nn.Module):
        batch_size, seq_len = input_ids.shape
        device = input_ids.device
 
-        # Encode
-        encoder_outputs = self.encoder(input_ids, attention_mask)
-        encoder_hidden = encoder_outputs['last_hidden_state']  # [batch, seq, 768]
-
-        # Dimension check
-        assert encoder_hidden.size(-1) == 768, f"Encoder dim mismatch: {encoder_hidden.size(-1)}"
-
+        # v6.1: Encode with curriculum learning
+        encoder_outputs = self.encoder(input_ids, attention_mask, boundary_labels, epoch)
+        encoder_hidden = encoder_outputs['last_hidden_state']  # v6.1: [batch, seq, 1280]
+
+        # v6.1: Dimension check - the final encoder dim is 1280
+        assert encoder_hidden.size(-1) == 1280, f"Encoder dim mismatch: {encoder_hidden.size(-1)}"
+
+        # Prepare decoder input for teacher forcing during training
+        if decoder_input_ids is None:
+            if labels is not None:
+                # During training, use shifted labels as decoder input (teacher forcing)
+                # Add BOS at the beginning and remove the last token
+                bos_tokens = torch.full((batch_size, 1), self.tokenizer.BOS, device=labels.device, dtype=labels.dtype)
+                decoder_input_ids = torch.cat([bos_tokens, labels[:, :-1]], dim=1)
+            else:
+                # For inference/test, start with BOS token
+                decoder_input_ids = torch.full((batch_size, 1), self.tokenizer.BOS, device=device, dtype=torch.long)
+
        # Decode
        decoder_outputs = self.decoder(
            encoder_hidden,
@@ -542,25 +673,47 @@ class IntelligentTokenizerModel(nn.Module):
                decoder_outputs['logits'].reshape(-1, decoder_outputs['logits'].size(-1)),
                labels.reshape(-1)
            )
-
+
+            # Boundary loss (if boundary labels provided)
+            boundary_loss = 0
+            if boundary_labels is not None and encoder_outputs.get('eojeol_boundaries') is not None:
+                # Eojeol boundary loss
+                eojeol_boundaries = encoder_outputs['eojeol_boundaries']  # [batch, seq, 4]
+                if eojeol_boundaries.size(1) == boundary_labels.size(1):
+                    # Ensure boundary labels are in the valid range (0-3)
+                    # Clamp to the valid range to prevent CUDA errors
+                    boundary_labels_clamped = torch.clamp(boundary_labels, min=0, max=3)
+
+                    boundary_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)  # Use -1 for padding
+                    boundary_loss = boundary_loss_fct(
+                        eojeol_boundaries.reshape(-1, 4),
+                        boundary_labels_clamped.reshape(-1)
+                    ) * 0.5  # Weight for boundary loss
+
            # Relation loss (if cross-attention used)
            relation_loss = 0
            if relation_logits is not None:
                # The self-relation should be identity (class 0)
                batch_identity = torch.zeros(batch_size, dtype=torch.long, device=device)
                relation_loss = F.cross_entropy(relation_logits, batch_identity) * 0.1
-
-            loss = recon_loss + relation_loss
+
+            loss = recon_loss + boundary_loss + relation_loss
 
        return {
            'loss': loss,
            'logits': decoder_outputs['logits'],
+            'decoder_logits': decoder_outputs['logits'],  # Added for compatibility
            'encoder_hidden_states': encoder_hidden,
            'decoder_hidden_states': decoder_hidden,
            'pooled_output': encoder_outputs['pooled_output'],
            'cross_attention': cross_attn_outputs['cross_attention'] if cross_attn_outputs else None,
            'relation_logits': relation_logits,
-            'all_encoder_states': encoder_outputs.get('all_hidden_states', None)
+            'all_encoder_states': encoder_outputs.get('all_hidden_states', None),
+            # Boundary predictions exposed for visualization
+            'char_boundaries': encoder_outputs.get('char_boundaries'),
+            'eojeol_boundaries': encoder_outputs.get('eojeol_boundaries'),
+            'phrase_boundaries': encoder_outputs.get('phrase_boundaries'),
+            'discovered_patterns': encoder_outputs.get('discovered_patterns')
        }
 
    def encode_text(self, text: str) -> torch.Tensor:
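The training path added in this commit derives the decoder input from the labels (teacher forcing) and sums three terms: the reconstruction cross-entropy, the eojeol boundary loss weighted by 0.5, and the relation loss weighted by 0.1. A standalone sketch of the shift performed for teacher forcing (byte values are illustrative; BOS = 257 and EOS = 258 as in ByteTokenizer):

import torch

labels = torch.tensor([[72, 105, 33, 258]])                  # "Hi!" bytes + EOS
bos = torch.full((labels.size(0), 1), 257, dtype=labels.dtype)
decoder_input_ids = torch.cat([bos, labels[:, :-1]], dim=1)  # drop last target, prepend BOS
print(decoder_input_ids)                                     # tensor([[257,  72, 105,  33]])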