ggunio committed on
Commit
c65503b
·
verified ·
1 Parent(s): 7aabfdc

Upload core/boundary_aware_model.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. core/boundary_aware_model.py +574 -0
core/boundary_aware_model.py ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Boundary-Aware Intelligent Tokenizer Model
3
+ ๋ฐ”์ดํŠธ-๋ฌธ์ž ๊ด€๊ณ„๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ํ•™์Šตํ•˜๋Š” ๋ชจ๋ธ
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from typing import Dict, List, Optional, Tuple
10
+ import math
11
+
12
+ # Import necessary components from unified_model
13
+ from .unified_model import ByteEncoder, TransformerDecoder, CrossAttention, PositionalEncoding
14
+
15
+
16
class BoundaryAwareEncoder(nn.Module):
    """Byte-level encoder conditioned on explicit byte/character structure.

    Each input byte is embedded together with four structural signals
    (boundary label, character type, byte count, position inside the
    character).  The sequence then runs through a stack of Transformer
    encoder layers; after every layer adjacent positions are merged
    (mean-pooled) into groups, so the sequence gets progressively shorter.

    Layer 0 merges UTF-8 continuation bytes into the preceding position;
    later layers merge where the learned ``merge_gate`` probability exceeds
    a per-layer threshold.
    """

    def __init__(
        self,
        vocab_size: int = 260,
        hidden_dims: Optional[List[int]] = None,
        num_heads: int = 8,
        dropout: float = 0.1,
        max_seq_len: int = 512,
    ):
        """Build the embeddings, the layer stack, and the prediction heads.

        Args:
            vocab_size: Size of the byte embedding table.
            hidden_dims: Model width of each Transformer layer, in order.
                ``None`` selects ``[512, 512, 640, 768, 768]``.
            num_heads: Attention heads per Transformer layer (the merge
                attention uses ``num_heads // 2``).
            dropout: Dropout probability used throughout.
            max_seq_len: Passed to the positional encoding.
        """
        super().__init__()

        # A ``None`` default avoids sharing one mutable list between calls.
        if hidden_dims is None:
            hidden_dims = [512, 512, 640, 768, 768]
        else:
            hidden_dims = list(hidden_dims)

        first_dim = hidden_dims[0]
        struct_dim = 128  # fixed width of every structural embedding

        # 1. Byte embedding.
        self.byte_embedding = nn.Embedding(vocab_size, first_dim)
        # 2. Boundary embedding (START, CONT, END, SPECIAL).
        self.boundary_embedding = nn.Embedding(4, struct_dim)
        # 3. Character-type embedding (ASCII, Korean, Chinese, ...).
        self.char_type_embedding = nn.Embedding(14, struct_dim)
        # 4. Byte-count embedding (indices 0-4).
        self.byte_count_embedding = nn.Embedding(5, struct_dim)
        # 5. Position of the byte inside its character (0-3).
        self.char_position_embedding = nn.Embedding(4, struct_dim)

        # Fuse the byte embedding with the four structural embeddings back
        # down to the width of the first layer.
        self.input_projection = nn.Linear(first_dim + struct_dim * 4, first_dim)

        # PositionalEncoding is imported from the project's unified_model.
        self.pos_encoding = PositionalEncoding(first_dim, max_seq_len, dropout)

        # Transformer stack; a linear projection is inserted wherever the
        # width changes between consecutive layers.
        self.layers = nn.ModuleList()
        previous_dim = first_dim
        for dim in hidden_dims:
            proj = nn.Linear(previous_dim, dim) if previous_dim != dim else None
            layer = nn.TransformerEncoderLayer(
                d_model=dim,
                nhead=num_heads,
                dim_feedforward=dim * 4,
                dropout=dropout,
                activation='gelu',
                batch_first=True,
                norm_first=True,
            )
            self.layers.append(nn.ModuleDict({
                'projection': proj,
                'transformer': layer,
                'norm': nn.LayerNorm(dim),
            }))
            previous_dim = dim

        # One merging module per layer.  'boundary_detector' and 'merge_proj'
        # are not used by forward() but are kept so the parameter set (and
        # therefore any saved state dict) stays the same.
        self.merging_modules = nn.ModuleList()
        for dim in hidden_dims:
            self.merging_modules.append(nn.ModuleDict({
                'boundary_detector': nn.Linear(dim, 3),  # START, CONT, END
                'merge_attention': nn.MultiheadAttention(
                    dim, num_heads // 2, dropout, batch_first=True
                ),
                # Scores each adjacent pair; sigmoid gives merge probability.
                'merge_gate': nn.Sequential(
                    nn.Linear(dim * 2, dim),
                    nn.ReLU(),
                    nn.Linear(dim, 1),
                ),
                'merge_proj': nn.Linear(dim * 2, dim),
            }))

        # Prediction heads applied to the final hidden states.
        self.boundary_predictor = nn.Linear(hidden_dims[-1], 4)
        self.char_type_predictor = nn.Linear(hidden_dims[-1], 14)

    # ------------------------------------------------------------------
    # Helpers (pure tensor functions, no module state)
    # ------------------------------------------------------------------
    @staticmethod
    def _estimate_boundaries(input_ids: torch.Tensor) -> torch.Tensor:
        """Guess boundary labels from raw byte values when none are given.

        UTF-8 continuation bytes (128-191) get label 0; every other id
        (ASCII, multi-byte lead bytes, and ids >= 256) gets label 1.

        NOTE(review): the embedding comment lists the classes as
        START, CONT, END, SPECIAL, which would suggest START=0; confirm this
        0/1 assignment against the label encoding used by the data pipeline.
        """
        is_continuation = (input_ids >= 128) & (input_ids < 192)
        return (~is_continuation).to(input_ids.dtype)

    @staticmethod
    def _estimate_byte_counts(input_ids: torch.Tensor) -> torch.Tensor:
        """Guess the character byte length from UTF-8 lead-byte ranges.

        Ids below 192 map to 1, 192-223 to 2, 224-239 to 3 and everything
        from 240 upwards (which includes ids >= 256) to 4.
        """
        dtype = input_ids.dtype
        return (
            1
            + (input_ids >= 192).to(dtype)
            + (input_ids >= 224).to(dtype)
            + (input_ids >= 240).to(dtype)
        )

    @staticmethod
    def _relative_char_positions(char_indices: torch.Tensor) -> torch.Tensor:
        """Return each element's offset inside its run of equal indices.

        The offset restarts at 0 whenever the value differs from the previous
        element and is capped at 3.  The scan starts with a virtual previous
        value of -1, so a row that begins with -1 starts counting at 1.
        """
        seq_len = char_indices.size(1)
        if seq_len == 0:
            return torch.zeros_like(char_indices)

        idx = torch.arange(seq_len, device=char_indices.device)
        idx = idx.unsqueeze(0).expand_as(char_indices)

        run_starts = torch.ones_like(char_indices, dtype=torch.bool)
        run_starts[:, 0] = char_indices[:, 0] != -1
        run_starts[:, 1:] = char_indices[:, 1:] != char_indices[:, :-1]

        # Index of the most recent run start (-1 when none seen yet).
        start_idx = torch.where(run_starts, idx, torch.full_like(idx, -1))
        last_start = start_idx.cummax(dim=1).values
        return (idx - last_start).clamp(max=3).to(char_indices.dtype)

    @staticmethod
    def _utf8_merge_decisions(input_ids: torch.Tensor) -> torch.Tensor:
        """Return a [B, S-1] float mask: 1 where position i merges with i+1.

        A pair is merged exactly when the following byte is a UTF-8
        continuation byte (128-191); everything else stays separate.
        """
        following = input_ids[:, 1:]
        return ((following >= 128) & (following < 192)).to(torch.float32)

    @staticmethod
    def _key_padding_mask(
        attention_mask: Optional[torch.Tensor], x: torch.Tensor
    ) -> Optional[torch.Tensor]:
        """Convert an attention mask (non-zero = keep) to a bool padding mask.

        If the mask length differs from the current sequence length, the
        overlapping prefix is copied and remaining positions are unmasked.
        """
        if attention_mask is None:
            return None
        current_len = x.size(1)
        if attention_mask.size(1) == current_len:
            return attention_mask == 0
        padding = torch.zeros(
            x.size(0), current_len, dtype=torch.bool, device=x.device
        )
        overlap = min(attention_mask.size(1), current_len)
        padding[:, :overlap] = attention_mask[:, :overlap] == 0
        return padding

    @staticmethod
    def _merge_groups(
        x: torch.Tensor,
        merge_decisions: torch.Tensor,
        token_valid: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Mean-pool adjacent positions into groups.

        Args:
            x: Hidden states, [B, S, D] with S >= 1.
            merge_decisions: [B, S-1]; 1 means position i+1 joins the group
                of position i, 0 means it starts a new group.
            token_valid: Optional [B, S] bool mask of real (non-padding)
                positions.  ``None`` treats every position as valid.

        Returns:
            ``(merged, valid_mask, group_ids)`` where ``merged`` is
            [B, G, D] (zero-padded to the largest group count G),
            ``valid_mask`` is a float [B, G] mask that is 1 for groups
            containing at least one valid token, and ``group_ids`` is the
            [B, S] long tensor mapping each input position to its group.
        """
        batch_size, seq_len, hidden_dim = x.shape

        # A position starts a new group when it does NOT merge with its
        # predecessor; a running sum then yields contiguous group ids.
        new_group = torch.zeros(batch_size, seq_len, device=x.device)
        new_group[:, 1:] = 1 - merge_decisions
        group_ids = new_group.cumsum(dim=1).long()

        num_groups = int(group_ids.max().item()) + 1

        # Sum members of each group in x's own dtype so that half/bf16
        # activations work; counts stay in float32 to remain exact.
        gather_index = group_ids.unsqueeze(-1).expand(-1, -1, hidden_dim)
        summed = x.new_zeros(batch_size, num_groups, hidden_dim).scatter_add(
            1, gather_index, x
        )
        ones = torch.ones(batch_size, seq_len, device=x.device)
        counts = torch.zeros(batch_size, num_groups, device=x.device).scatter_add(
            1, group_ids, ones
        )
        merged = summed / counts.clamp(min=1).unsqueeze(-1).to(x.dtype)

        # A group is valid if it holds at least one valid token.  Without a
        # token mask every non-empty group is valid.
        if token_valid is None:
            valid_counts = counts
        else:
            valid_counts = torch.zeros(
                batch_size, num_groups, device=x.device
            ).scatter_add(1, group_ids, token_valid.to(counts.dtype))
        valid_mask = (valid_counts > 0).to(counts.dtype)

        return merged, valid_mask, group_ids

    # ------------------------------------------------------------------
    # Forward
    # ------------------------------------------------------------------
    def forward(
        self,
        input_ids: torch.Tensor,
        boundary_labels: Optional[torch.Tensor] = None,
        char_types: Optional[torch.Tensor] = None,
        byte_counts: Optional[torch.Tensor] = None,
        char_indices: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None
    ) -> Dict[str, torch.Tensor]:
        """Encode a batch of byte ids.

        Args:
            input_ids: 2-D tensor of byte ids, [B, S].
            boundary_labels: Optional boundary class per byte; estimated from
                the byte values when omitted.
            char_types: Optional character-type class per byte; class 0 is
                used everywhere when omitted.
            byte_counts: Optional byte count per byte (clamped to 0-4);
                estimated from UTF-8 lead bytes when omitted.
            char_indices: Optional character index per byte, used to derive
                the position inside each character; position 0 is used
                everywhere when omitted.
            attention_mask: Optional mask where 0 marks padding.

        Returns:
            Dict with 'last_hidden_state', 'pooled_output',
            'all_hidden_states' (pre-merge state of every layer),
            'boundary_predictions' / 'char_type_predictions' (single-element
            lists holding the final-layer logits), 'boundary_logits',
            'char_type_logits', 'merge_info' (one record per merge) and the
            updated 'attention_mask' that matches the merged length.
        """
        # 1. Byte embedding.
        byte_emb = self.byte_embedding(input_ids)

        # 2. Boundary embedding; fall back to a UTF-8 based estimate.
        if boundary_labels is None:
            boundary_labels = self._estimate_boundaries(input_ids)
        boundary_emb = self.boundary_embedding(boundary_labels)

        # 3. Character-type embedding; default class 0.
        if char_types is None:
            char_types = torch.zeros_like(input_ids)
        char_type_emb = self.char_type_embedding(char_types)

        # 4. Byte-count embedding.
        if byte_counts is None:
            byte_counts = self._estimate_byte_counts(input_ids)
        else:
            byte_counts = torch.clamp(byte_counts, 0, 4)
        byte_count_emb = self.byte_count_embedding(byte_counts)

        # 5. Position-inside-character embedding.
        if char_indices is None:
            char_positions = torch.zeros_like(input_ids)
        else:
            char_positions = self._relative_char_positions(char_indices)
        char_pos_emb = self.char_position_embedding(char_positions)

        # 6. Concatenate byte + structural embeddings and project down.
        combined_emb = torch.cat(
            [byte_emb, boundary_emb, char_type_emb, byte_count_emb, char_pos_emb],
            dim=-1,
        )
        x = self.input_projection(combined_emb)
        x = self.pos_encoding(x)

        all_hidden_states = []
        merge_info = []
        num_merge_modules = len(self.merging_modules)

        for i, layer_dict in enumerate(self.layers):
            projection = layer_dict['projection']
            if projection is not None:
                x = projection(x)

            key_padding_mask = self._key_padding_mask(attention_mask, x)
            if key_padding_mask is None:
                x = layer_dict['transformer'](x)
            else:
                x = layer_dict['transformer'](
                    x, src_key_padding_mask=key_padding_mask
                )
            x = layer_dict['norm'](x)

            # Keep the pre-merge state of every layer.
            all_hidden_states.append(x.clone())

            # Hierarchical progressive merging; skipped once the sequence is
            # already shorter than four positions.
            seq_len = x.size(1)
            if i >= num_merge_modules or seq_len < 4:
                continue
            merge_module = self.merging_modules[i]

            # Learned merge probability of every adjacent pair.  At layer 0
            # it is only reported in merge_info, not used for the decision.
            x_pairs = torch.cat([x[:, :-1], x[:, 1:]], dim=-1)
            merge_probs = torch.sigmoid(
                merge_module['merge_gate'](x_pairs).squeeze(-1)
            )

            if i == 0:
                # Layer 0: deterministic byte -> character merge.
                merge_decisions = self._utf8_merge_decisions(input_ids).to(x.device)
                layer_merge_threshold = 0.5  # logged only
            else:
                # Later layers: threshold rises with depth.
                layer_merge_threshold = 0.7 + (i / num_merge_modules) * 0.2
                merge_decisions = (merge_probs > layer_merge_threshold).float()

            # Global context from self-attention over the unmerged sequence.
            attn_output, _ = merge_module['merge_attention'](x, x, x)

            # Carry padding information through the merge so that groups
            # made only of padding stay masked in later layers.
            token_valid = None if key_padding_mask is None else ~key_padding_mask
            new_x, valid_mask, group_ids = self._merge_groups(
                x, merge_decisions, token_valid
            )
            actual_len = new_x.size(1)

            # Add a small share of the sequence-averaged attention output to
            # every merged position.
            context = attn_output.mean(dim=1, keepdim=True)
            new_x = new_x + context.expand(-1, actual_len, -1) * 0.1

            # Positional encoding is intentionally not re-applied here.
            x = new_x
            attention_mask = valid_mask

            merge_info.append({
                'layer': i,
                'original_len': seq_len,
                'merged_len': actual_len,
                'compression_ratio': seq_len / max(actual_len, 1),
                'merge_threshold': layer_merge_threshold,
                'avg_merge_prob': merge_probs.mean().item(),
                # Original position -> merged group mapping.
                'merge_mapping': {
                    'original_positions': torch.arange(seq_len, device=x.device),
                    'merged_groups': group_ids,
                    'group_sizes': None,
                },
            })

        # Final-layer predictions, computed once and exposed under both the
        # list-valued and the '*_logits' keys.
        boundary_logits = self.boundary_predictor(x)
        char_type_logits = self.char_type_predictor(x)

        # Masked mean pooling (plain mean without a mask).  The mask is cast
        # to x's dtype so integer/bool masks divide correctly.
        if attention_mask is not None:
            mask = attention_mask.unsqueeze(-1).to(x.dtype)
            pooled = (x * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        else:
            pooled = x.mean(dim=1)

        return {
            'last_hidden_state': x,
            'pooled_output': pooled,
            'all_hidden_states': all_hidden_states,
            'boundary_predictions': [boundary_logits],
            'char_type_predictions': [char_type_logits],
            'boundary_logits': boundary_logits,
            'char_type_logits': char_type_logits,
            'merge_info': merge_info,
            'attention_mask': attention_mask,
        }
409
+
410
+
411
class BoundaryAwareTokenizerModel(nn.Module):
    """Encoder-decoder model that explicitly learns byte/character structure.

    Combines the boundary-aware encoder with the project's
    ``TransformerDecoder`` and ``CrossAttention`` modules and, when labels
    are supplied, a reconstruction loss plus auxiliary boundary and
    character-type losses.
    """

    def __init__(
        self,
        vocab_size: int = 260,
        encoder_dims: Optional[List[int]] = None,
        decoder_hidden: int = 768,
        num_heads: int = 8,
        num_decoder_layers: int = 6,
        dropout: float = 0.1,
        max_seq_len: int = 512,
    ):
        """Create the encoder, decoder and cross-attention modules.

        Args:
            vocab_size: Vocabulary size shared by encoder and decoder.
            encoder_dims: Width of each encoder layer; ``None`` selects
                ``[512, 512, 640, 768, 768]``.
            decoder_hidden: Hidden size passed to the decoder.
            num_heads: Attention heads for all three modules.
            num_decoder_layers: Number of decoder layers.
            dropout: Dropout probability.
            max_seq_len: Maximum sequence length passed to encoder/decoder.
        """
        super().__init__()

        # A ``None`` default avoids sharing one mutable list between calls.
        if encoder_dims is None:
            encoder_dims = [512, 512, 640, 768, 768]
        else:
            encoder_dims = list(encoder_dims)

        # Boundary-aware encoder.
        self.encoder = BoundaryAwareEncoder(
            vocab_size, encoder_dims, num_heads, dropout, max_seq_len
        )

        # Standard decoder (reused from unified_model).
        self.decoder = TransformerDecoder(
            vocab_size, decoder_hidden, num_heads, num_decoder_layers, dropout, max_seq_len
        )

        # Cross-attention (reused from unified_model).
        self.cross_attention = CrossAttention(encoder_dims[-1], num_heads, dropout)

    @staticmethod
    def _aligned_cross_entropy(
        logits: torch.Tensor, labels: torch.Tensor, ignore_index: int
    ) -> Optional[torch.Tensor]:
        """Cross-entropy over per-position logits, or ``None`` on a size mismatch.

        The encoder may shorten the sequence by merging, in which case its
        per-position logits no longer line up with the per-byte labels and
        the loss term is skipped.

        Args:
            logits: [B, S, C] class scores.
            labels: Integer class labels; must hold exactly B * S elements.
            ignore_index: Label value excluded from the loss.
        """
        if logits.size(0) * logits.size(1) != labels.numel():
            return None
        return F.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            labels.reshape(-1),
            ignore_index=ignore_index,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        boundary_labels: Optional[torch.Tensor] = None,
        char_types: Optional[torch.Tensor] = None,
        byte_counts: Optional[torch.Tensor] = None,
        char_indices: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cross_attention: bool = True
    ) -> Dict[str, torch.Tensor]:
        """Run encoder, decoder and optional cross-attention; compute losses.

        Args:
            input_ids: Byte ids fed to the encoder.
            attention_mask: Optional encoder mask where 0 marks padding.
            boundary_labels, char_types, byte_counts, char_indices: Optional
                structural inputs forwarded to the encoder; the first two
                also serve as targets for the auxiliary losses.
            decoder_input_ids: Decoder inputs; defaults to ``input_ids``
                (teacher forcing).
            labels: Reconstruction targets; label 256 (PAD) is ignored.
                When ``None`` no loss is computed.
            use_cross_attention: Whether to refine the decoder states with
                cross-attention over the encoder states.

        Returns:
            Dict with 'loss' (``None`` without labels), 'logits',
            'encoder_hidden_states', 'decoder_hidden_states',
            'boundary_logits', 'char_type_logits', 'boundary_predictions',
            'relation_logits' and 'cross_attention'.
        """
        # 1. Boundary-aware encoding.
        encoder_outputs = self.encoder(
            input_ids=input_ids,
            boundary_labels=boundary_labels,
            char_types=char_types,
            byte_counts=byte_counts,
            char_indices=char_indices,
            attention_mask=attention_mask,
        )
        encoder_hidden = encoder_outputs['last_hidden_state']

        # The encoder may have shortened the sequence by merging, so use the
        # mask it returns (falling back to the caller's mask).
        encoder_mask = encoder_outputs.get('attention_mask', attention_mask)

        # 2. Decoding; teacher-force with input_ids when nothing is given.
        if decoder_input_ids is None and input_ids is not None:
            decoder_input_ids = input_ids

        decoder_outputs = self.decoder(
            encoder_hidden,
            decoder_input_ids,
            encoder_mask,
        )

        # 3. Cross-attention (optional).
        cross_attn_outputs = None
        relation_logits = None

        if use_cross_attention and decoder_outputs['hidden_states'] is not None:
            decoder_hidden = decoder_outputs['hidden_states']

            # The keys are the merged encoder states, so the key mask must be
            # the encoder's updated mask, not the caller's pre-merge mask.
            cross_attn_outputs = self.cross_attention(
                query=decoder_hidden,
                key=encoder_hidden,
                query_mask=None,
                key_mask=encoder_mask,
            )
            relation_logits = cross_attn_outputs['relation_logits']

            # Re-project decoder states enriched with cross-attention.
            enhanced_decoder = decoder_hidden + cross_attn_outputs['cross_attention']
            decoder_outputs['logits'] = self.decoder.output_projection(enhanced_decoder)

        # 4. Loss calculation.
        total_loss = None
        if labels is not None:
            logits = decoder_outputs['logits']

            # Reconstruction loss; 256 is the PAD id.
            total_loss = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                labels.reshape(-1),
                ignore_index=256,
            )

            # Boundary prediction loss (class 3 = special, ignored).
            if boundary_labels is not None and 'boundary_logits' in encoder_outputs:
                boundary_loss = self._aligned_cross_entropy(
                    encoder_outputs['boundary_logits'], boundary_labels, 3
                )
                if boundary_loss is not None:
                    total_loss = total_loss + boundary_loss * 0.3

            # Character-type prediction loss (class 13 = special, ignored).
            if char_types is not None and 'char_type_logits' in encoder_outputs:
                char_type_loss = self._aligned_cross_entropy(
                    encoder_outputs['char_type_logits'], char_types, 13
                )
                if char_type_loss is not None:
                    total_loss = total_loss + char_type_loss * 0.2

            # Auxiliary boundary losses from the per-layer predictions; each
            # is gated by the same size check as the main boundary loss.
            if boundary_labels is not None:
                for boundary_pred in encoder_outputs.get('boundary_predictions') or []:
                    aux_boundary_loss = self._aligned_cross_entropy(
                        boundary_pred, boundary_labels, 3
                    )
                    if aux_boundary_loss is not None:
                        total_loss = total_loss + aux_boundary_loss * 0.1

        return {
            'loss': total_loss,
            'logits': decoder_outputs['logits'],
            'encoder_hidden_states': encoder_hidden,
            'decoder_hidden_states': decoder_outputs['hidden_states'],
            'boundary_logits': encoder_outputs['boundary_logits'],
            'char_type_logits': encoder_outputs['char_type_logits'],
            'boundary_predictions': encoder_outputs.get('boundary_predictions'),
            'relation_logits': relation_logits,
            'cross_attention': cross_attn_outputs['cross_attention'] if cross_attn_outputs else None,
        }