# Source: intelligent-tokenizer-v6-demo / core/boundary_aware_model.py
# Uploaded by ggunio with huggingface_hub (commit c65503b, verified).
"""
Boundary-Aware Intelligent Tokenizer Model
๋ฐ”์ดํŠธ-๋ฌธ์ž ๊ด€๊ณ„๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ํ•™์Šตํ•˜๋Š” ๋ชจ๋ธ
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Dict, List, Optional, Tuple
import math
# Import necessary components from unified_model
from .unified_model import ByteEncoder, TransformerDecoder, CrossAttention, PositionalEncoding
class BoundaryAwareEncoder(nn.Module):
    """Byte-level encoder that explicitly models byte/character boundaries.

    Every input byte is embedded together with four structural signals
    (boundary label, character type, byte count of the character and position
    inside the character).  The sequence then runs through a stack of
    transformer layers; after each layer adjacent tokens are merged into
    groups, so the sequence gets progressively shorter:

    * layer 0 merges UTF-8 continuation bytes (``10xxxxxx``) into the token
      before them (bytes -> characters);
    * later layers merge adjacent tokens whose learned merge probability
      exceeds a per-layer threshold.

    Merging is padding-aware: padded tokens never contribute to a merged
    representation, and a merged group is reported as valid only when it
    contains at least one real token.

    Submodule names and creation order are part of the checkpoint format and
    must not change.
    """

    # Per-layer widths used when ``hidden_dims`` is not supplied.
    DEFAULT_HIDDEN_DIMS = (512, 512, 640, 768, 768)
    # Width of each of the four structural embeddings.
    STRUCTURAL_DIM = 128
    # Sequences shorter than this are not merged any further.
    MIN_MERGE_LEN = 4

    def __init__(
        self,
        vocab_size: int = 260,
        hidden_dims: Optional[List[int]] = None,
        num_heads: int = 8,
        dropout: float = 0.1,
        max_seq_len: int = 512,
    ):
        """Build embeddings, transformer layers, merge modules and heads.

        Args:
            vocab_size: Size of the byte vocabulary (bytes plus special ids).
            hidden_dims: Model width of each transformer layer; defaults to
                ``DEFAULT_HIDDEN_DIMS``.  A projection is inserted wherever
                two consecutive widths differ.
            num_heads: Attention heads per transformer layer; the merge
                attention uses ``num_heads // 2``.
            dropout: Dropout probability used throughout.
            max_seq_len: Forwarded to ``PositionalEncoding``.

        Raises:
            ValueError: If ``hidden_dims`` is empty.
        """
        super().__init__()
        hidden_dims = list(
            self.DEFAULT_HIDDEN_DIMS if hidden_dims is None else hidden_dims
        )
        if not hidden_dims:
            raise ValueError("hidden_dims must contain at least one layer width")

        first_dim = hidden_dims[0]
        struct_dim = self.STRUCTURAL_DIM

        # 1. Byte embedding.
        self.byte_embedding = nn.Embedding(vocab_size, first_dim)
        # 2. Boundary embedding (START, CONT, END, SPECIAL).
        self.boundary_embedding = nn.Embedding(4, struct_dim)
        # 3. Character-type embedding (ASCII, Korean, Chinese, ...).
        self.char_type_embedding = nn.Embedding(14, struct_dim)
        # 4. Byte-count embedding (index 0..4; UTF-8 characters use 1-4 bytes).
        self.byte_count_embedding = nn.Embedding(5, struct_dim)
        # 5. Relative position of the byte inside its character (0..3).
        self.char_position_embedding = nn.Embedding(4, struct_dim)

        # Byte embedding and the four structural embeddings are concatenated
        # and projected back to the first layer width.
        self.input_projection = nn.Linear(first_dim + 4 * struct_dim, first_dim)

        # Positional encoding (implementation lives in unified_model).
        self.pos_encoding = PositionalEncoding(first_dim, max_seq_len, dropout)

        # Transformer stack; each entry optionally re-projects to a new width.
        self.layers = nn.ModuleList()
        for i, output_dim in enumerate(hidden_dims):
            input_dim = hidden_dims[i - 1] if i > 0 else first_dim
            proj = nn.Linear(input_dim, output_dim) if input_dim != output_dim else None
            layer = nn.TransformerEncoderLayer(
                d_model=output_dim,
                nhead=num_heads,
                dim_feedforward=output_dim * 4,
                dropout=dropout,
                activation='gelu',
                batch_first=True,
                norm_first=True,
            )
            self.layers.append(nn.ModuleDict({
                'projection': proj,
                'transformer': layer,
                'norm': nn.LayerNorm(output_dim),
            }))

        # One merge module per layer.
        # NOTE(review): 'boundary_detector' and 'merge_proj' are not used by
        # forward(); they are kept so existing checkpoints still load.
        self.merging_modules = nn.ModuleList()
        for dim in hidden_dims:
            self.merging_modules.append(nn.ModuleDict({
                'boundary_detector': nn.Linear(dim, 3),  # START, CONT, END
                'merge_attention': nn.MultiheadAttention(
                    dim, num_heads // 2, dropout, batch_first=True
                ),
                # Scores every adjacent token pair for merging.
                'merge_gate': nn.Sequential(
                    nn.Linear(dim * 2, dim),
                    nn.ReLU(),
                    nn.Linear(dim, 1),
                ),
                'merge_proj': nn.Linear(dim * 2, dim),
            }))

        # Boundary prediction head.
        self.boundary_predictor = nn.Linear(hidden_dims[-1], 4)
        # Character-type prediction head.
        self.char_type_predictor = nn.Linear(hidden_dims[-1], 14)

    @staticmethod
    def _estimate_boundaries(input_ids: torch.Tensor) -> torch.Tensor:
        """Guess boundary ids from raw byte values (inference fallback).

        UTF-8 continuation bytes (128-191) map to id 0, every other id to 1.

        NOTE(review): the embedding comment lists the labels as
        (START, CONT, END, SPECIAL), which suggests START=0 / CONT=1, the
        opposite of what is assigned here.  The mapping is kept as it was;
        confirm against the code that produces ``boundary_labels``.
        """
        is_continuation = (input_ids >= 128) & (input_ids < 192)
        return (~is_continuation).to(input_ids.dtype)

    @staticmethod
    def _estimate_byte_counts(input_ids: torch.Tensor) -> torch.Tensor:
        """Guess the character byte length from the UTF-8 lead byte.

        Ids 192-223 give 2, 224-239 give 3, ids >= 240 give 4 and everything
        else 1.  Ids above 255 therefore also map to 4.
        """
        counts = torch.ones_like(input_ids)
        for lower_bound, length in ((192, 2), (224, 3), (240, 4)):
            counts = torch.where(
                input_ids >= lower_bound, torch.full_like(input_ids, length), counts
            )
        return counts

    @staticmethod
    def _char_positions(char_indices: torch.Tensor, max_position: int = 3) -> torch.Tensor:
        """Return each byte's offset inside its run of equal character indices.

        Vectorised form of "reset to 0 whenever the character index changes,
        otherwise add 1", capped at ``max_position``.  The run before the
        first token is treated as character ``-1``, so a sequence that starts
        with index ``-1`` begins at offset 1.
        """
        batch, length = char_indices.shape
        previous = torch.cat(
            [char_indices.new_full((batch, 1), -1), char_indices[:, :-1]], dim=1
        )
        index = torch.arange(length, device=char_indices.device)
        index = index.unsqueeze(0).expand(batch, -1)
        # Index of the most recent run start at or before each position
        # (-1 when no run has started yet).
        run_start = torch.where(
            char_indices != previous, index, torch.full_like(index, -1)
        )
        last_start = torch.cummax(run_start, dim=1).values
        return (index - last_start).clamp(max=max_position)

    @staticmethod
    def _key_padding_mask(
        attention_mask: Optional[torch.Tensor], x: torch.Tensor
    ) -> Optional[torch.Tensor]:
        """Convert a 1/0 attention mask to a boolean pad mask shaped like ``x``.

        Returns ``None`` when there is no mask.  If the mask length differs
        from the sequence length of ``x``, the overlapping prefix is copied
        and any extra positions are treated as real tokens.
        """
        if attention_mask is None:
            return None
        length = x.size(1)
        if attention_mask.size(1) == length:
            return attention_mask == 0
        pad = torch.zeros(x.size(0), length, dtype=torch.bool, device=x.device)
        keep = min(attention_mask.size(1), length)
        pad[:, :keep] = attention_mask[:, :keep] == 0
        return pad

    def _merge_tokens(
        self,
        layer_idx: int,
        x: torch.Tensor,
        input_ids: Optional[torch.Tensor],
        pad_mask: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, object]]:
        """Merge adjacent tokens of ``x`` into groups and average each group.

        Args:
            layer_idx: Index of the layer whose merge module is used.
            x: Hidden states ``[batch, seq, dim]`` after the layer norm.
            input_ids: Original byte ids; only read at layer 0.
            pad_mask: Boolean ``[batch, seq]`` mask, True at padding, or None.

        Returns:
            ``(merged, valid_mask, info)``: the merged hidden states
            ``[batch, groups, dim]``, a float mask that is 1 for groups
            holding at least one real token, and the merge bookkeeping dict.
        """
        merge_module = self.merging_modules[layer_idx]
        batch_size, seq_len, hidden_dim = x.shape

        # Learned merge probability for every adjacent pair: [B, S-1].
        pairs = torch.cat([x[:, :-1], x[:, 1:]], dim=-1)
        merge_probs = torch.sigmoid(merge_module['merge_gate'](pairs).squeeze(-1))

        if layer_idx == 0 and input_ids is not None:
            # Layer 0: deterministic UTF-8 rule.  A token merges with its
            # successor when the successor is a continuation byte.  The
            # learned probabilities are only logged.
            next_bytes = input_ids[:, 1:]
            merge = (next_bytes >= 128) & (next_bytes < 192)
            layer_merge_threshold = 0.5  # not used for the decision, only logged
        else:
            # Other layers: merge when the learned probability clears a
            # threshold that rises from 0.7 towards 0.9 with depth.
            layer_merge_threshold = 0.7 + (layer_idx / len(self.merging_modules)) * 0.2
            merge = merge_probs > layer_merge_threshold

        # A token opens a new group unless it merges with its predecessor;
        # the running sum of "opens a group" flags is the group id.
        opens_group = torch.cat(
            [merge.new_zeros(batch_size, 1, dtype=torch.long), (~merge).long()],
            dim=1,
        )
        group_ids = opens_group.cumsum(dim=1)  # [B, S]
        merged_len = int(group_ids[:, -1].max()) + 1

        # Per-token weights: 1 for real tokens, 0 for padding.
        if pad_mask is None:
            weights = x.new_ones(batch_size, seq_len)
            tokens = x
        else:
            weights = (~pad_mask).to(x.dtype)
            tokens = x.masked_fill(pad_mask.unsqueeze(-1), 0.0)

        # Mean over the real tokens of every group.  Buffers inherit the
        # dtype of x so half-precision models work as well.
        scatter_index = group_ids.unsqueeze(-1).expand(-1, -1, hidden_dim)
        summed = x.new_zeros(batch_size, merged_len, hidden_dim).scatter_add(
            1, scatter_index, tokens
        )
        counts = x.new_zeros(batch_size, merged_len).scatter_add(1, group_ids, weights)
        merged = summed / counts.clamp(min=1).unsqueeze(-1)
        valid_mask = (counts > 0).float()

        # Global context: self-attention over the unmerged sequence, averaged
        # over real tokens and added to every merged position at 0.1 weight.
        attn_output, _ = merge_module['merge_attention'](
            x, x, x, key_padding_mask=pad_mask, need_weights=False
        )
        if pad_mask is not None:
            attn_output = attn_output.masked_fill(pad_mask.unsqueeze(-1), 0.0)
        token_count = weights.sum(dim=1).clamp(min=1).view(-1, 1, 1)
        context = attn_output.sum(dim=1, keepdim=True) / token_count
        merged = merged + context * 0.1

        # Positional encoding is deliberately not re-applied after merging.
        info = {
            'layer': layer_idx,
            'original_len': seq_len,
            'merged_len': merged_len,
            'compression_ratio': seq_len / max(merged_len, 1),
            'merge_threshold': layer_merge_threshold,
            'avg_merge_prob': merge_probs.mean().item(),
            # Mapping from pre-merge positions to merged groups.
            'merge_mapping': {
                'original_positions': torch.arange(seq_len, device=x.device),
                'merged_groups': group_ids,
                'group_sizes': None,  # counts are no longer exported
            },
        }
        return merged, valid_mask, info

    def forward(
        self,
        input_ids: torch.Tensor,
        boundary_labels: Optional[torch.Tensor] = None,
        char_types: Optional[torch.Tensor] = None,
        byte_counts: Optional[torch.Tensor] = None,
        char_indices: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None
    ) -> Dict[str, torch.Tensor]:
        """Encode a batch of byte ids.

        Args:
            input_ids: Byte ids ``[batch, seq]``.
            boundary_labels: Boundary ids per byte; estimated from the byte
                values when omitted.
            char_types: Character-type ids per byte; id 0 is used when omitted.
            byte_counts: Byte length of each byte's character (clamped to
                0..4); estimated from the UTF-8 lead byte when omitted.
            char_indices: Index of the character each byte belongs to; offset
                0 is used for every byte when omitted.
            attention_mask: 1 for real tokens, 0 for padding.

        Returns:
            Dict with ``last_hidden_state``, ``pooled_output``,
            ``all_hidden_states`` (pre-merge state of each layer),
            ``boundary_predictions`` / ``char_type_predictions`` (one entry,
            from the last layer), ``boundary_logits`` / ``char_type_logits``,
            ``merge_info`` and the post-merge ``attention_mask``.
        """
        # 1. Byte embedding.
        byte_emb = self.byte_embedding(input_ids)

        # 2. Boundary embedding (labels when given, else a UTF-8 estimate).
        if boundary_labels is None:
            boundary_labels = self._estimate_boundaries(input_ids)
        boundary_emb = self.boundary_embedding(boundary_labels)

        # 3. Character-type embedding (id 0 when unknown).
        if char_types is None:
            char_types = torch.zeros_like(input_ids)
        char_type_emb = self.char_type_embedding(char_types)

        # 4. Byte-count embedding.
        if byte_counts is not None:
            counts = torch.clamp(byte_counts, 0, 4)
        else:
            counts = self._estimate_byte_counts(input_ids)
        byte_count_emb = self.byte_count_embedding(counts)

        # 5. Position of the byte within its character.
        if char_indices is not None:
            char_positions = self._char_positions(char_indices)
        else:
            char_positions = torch.zeros_like(input_ids)
        char_pos_emb = self.char_position_embedding(char_positions)

        # 6. Concatenate byte + structural embeddings and project back.
        combined_emb = torch.cat(
            [byte_emb, boundary_emb, char_type_emb, byte_count_emb, char_pos_emb],
            dim=-1,
        )
        x = self.pos_encoding(self.input_projection(combined_emb))

        all_hidden_states = []
        boundary_predictions = []
        char_type_predictions = []
        merge_info = []
        last_layer = len(self.layers) - 1

        for i, layer_dict in enumerate(self.layers):
            projection = layer_dict['projection']
            if projection is not None:
                x = projection(x)

            pad_mask = self._key_padding_mask(attention_mask, x)
            if pad_mask is not None:
                x = layer_dict['transformer'](x, src_key_padding_mask=pad_mask)
            else:
                x = layer_dict['transformer'](x)
            x = layer_dict['norm'](x)

            # Keep the hidden state as it was before merging.
            all_hidden_states.append(x.clone())

            # Hierarchical merging; skipped once the sequence is very short.
            if i < len(self.merging_modules) and x.size(1) >= self.MIN_MERGE_LEN:
                x, attention_mask, info = self._merge_tokens(i, x, input_ids, pad_mask)
                merge_info.append(info)

            # Auxiliary predictions are produced at the last layer only, and
            # also when merging was skipped for a short sequence.
            if i == last_layer:
                boundary_predictions.append(self.boundary_predictor(x))
                char_type_predictions.append(self.char_type_predictor(x))

        # Mean-pool over real tokens for a sequence-level representation.
        final_pad = self._key_padding_mask(attention_mask, x)
        if final_pad is not None:
            mask = (~final_pad).to(x.dtype).unsqueeze(-1)
            pooled = (x * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        else:
            pooled = x.mean(dim=1)

        return {
            'last_hidden_state': x,
            'pooled_output': pooled,
            'all_hidden_states': all_hidden_states,
            'boundary_predictions': boundary_predictions,
            'char_type_predictions': char_type_predictions,
            # Same heads applied to the same final state as the lists above.
            'boundary_logits': boundary_predictions[-1],
            'char_type_logits': char_type_predictions[-1],
            'merge_info': merge_info,
            'attention_mask': attention_mask,  # mask after merging
        }
class BoundaryAwareTokenizerModel(nn.Module):
    """Encoder/decoder model that explicitly learns byte-character structure.

    Combines the boundary-aware encoder with the decoder and cross-attention
    modules imported from ``unified_model``.  When ``labels`` are given the
    forward pass also returns a training loss: reconstruction cross-entropy
    plus weighted boundary and character-type losses.  Those two are only
    added when the encoder output still has one position per label.
    """

    # Encoder layer widths used when ``encoder_dims`` is not supplied.
    DEFAULT_ENCODER_DIMS = (512, 512, 640, 768, 768)

    def __init__(
        self,
        vocab_size: int = 260,
        encoder_dims: Optional[List[int]] = None,
        decoder_hidden: int = 768,
        num_heads: int = 8,
        num_decoder_layers: int = 6,
        dropout: float = 0.1,
        max_seq_len: int = 512
    ):
        """Create the encoder, decoder and cross-attention submodules.

        Args:
            vocab_size: Size of the byte vocabulary (bytes plus special ids).
            encoder_dims: Encoder layer widths; defaults to
                ``DEFAULT_ENCODER_DIMS``.
            decoder_hidden: Hidden width handed to the decoder.
            num_heads: Number of attention heads.
            num_decoder_layers: Number of decoder layers.
            dropout: Dropout probability.
            max_seq_len: Maximum sequence length handed to encoder and decoder.
        """
        super().__init__()
        if encoder_dims is None:
            encoder_dims = list(self.DEFAULT_ENCODER_DIMS)

        # Boundary-aware encoder.
        self.encoder = BoundaryAwareEncoder(
            vocab_size, encoder_dims, num_heads, dropout, max_seq_len
        )
        # Standard decoder (reused from unified_model).
        self.decoder = TransformerDecoder(
            vocab_size, decoder_hidden, num_heads, num_decoder_layers, dropout, max_seq_len
        )
        # Cross-attention (reused from unified_model).
        self.cross_attention = CrossAttention(encoder_dims[-1], num_heads, dropout)

    @staticmethod
    def _aligned_cross_entropy(
        logits: torch.Tensor, targets: torch.Tensor, ignore_index: int
    ) -> Optional[torch.Tensor]:
        """Cross-entropy of per-position logits, or ``None`` if misaligned.

        ``logits`` is ``[batch, positions, classes]``.  When the number of
        positions differs from the number of targets (the encoder shortened
        the sequence by merging) there is no per-token alignment, so no loss
        is produced.
        """
        if logits.size(0) * logits.size(1) != targets.numel():
            return None
        return F.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            targets.reshape(-1),
            ignore_index=ignore_index,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        boundary_labels: Optional[torch.Tensor] = None,
        char_types: Optional[torch.Tensor] = None,
        byte_counts: Optional[torch.Tensor] = None,
        char_indices: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cross_attention: bool = True
    ) -> Dict[str, torch.Tensor]:
        """Encode, decode and optionally compute the training loss.

        Args:
            input_ids: Byte ids for the encoder.
            attention_mask: 1 for real tokens, 0 for padding (encoder input).
            boundary_labels: Boundary ids per byte (id 3 is ignored in the loss).
            char_types: Character-type ids per byte (id 13 is ignored in the loss).
            byte_counts: Byte length of each byte's character.
            char_indices: Character index of each byte.
            decoder_input_ids: Decoder inputs; ``input_ids`` is used for
                teacher forcing when omitted.
            labels: Reconstruction targets (id 256 is ignored as padding).
            use_cross_attention: Whether to refine the decoder states with
                cross-attention over the encoder output.

        Returns:
            Dict with ``loss`` (``None`` without ``labels``), ``logits``,
            encoder/decoder hidden states, the encoder's boundary and
            character-type outputs, ``relation_logits`` and
            ``cross_attention`` (both ``None`` when cross-attention is off).
        """
        # 1. Boundary-aware encoding.
        encoder_outputs = self.encoder(
            input_ids=input_ids,
            boundary_labels=boundary_labels,
            char_types=char_types,
            byte_counts=byte_counts,
            char_indices=char_indices,
            attention_mask=attention_mask
        )
        encoder_hidden = encoder_outputs['last_hidden_state']
        # The encoder may shorten the sequence, so its returned mask (not the
        # caller's) is the one aligned with ``encoder_hidden``.
        encoder_mask = encoder_outputs.get('attention_mask', attention_mask)

        # 2. Decoding (teacher forcing on the input when nothing else is given).
        if decoder_input_ids is None and input_ids is not None:
            decoder_input_ids = input_ids
        decoder_outputs = self.decoder(
            encoder_hidden,
            decoder_input_ids,
            encoder_mask
        )

        # 3. Cross-attention (optional).
        cross_attn_outputs = None
        relation_logits = None
        decoder_hidden = decoder_outputs['hidden_states']
        if use_cross_attention and decoder_hidden is not None:
            # Keys are the merged encoder states, so the key mask has to be
            # the post-merge mask, the same one the decoder receives above.
            cross_attn_outputs = self.cross_attention(
                query=decoder_hidden,
                key=encoder_hidden,
                query_mask=None,
                key_mask=encoder_mask
            )
            relation_logits = cross_attn_outputs['relation_logits']
            # Re-project the cross-attention-enhanced decoder states.
            enhanced_decoder = decoder_hidden + cross_attn_outputs['cross_attention']
            decoder_outputs['logits'] = self.decoder.output_projection(enhanced_decoder)

        # 4. Loss calculation.
        total_loss = None
        if labels is not None:
            logits = decoder_outputs['logits']
            # Reconstruction loss; id 256 is padding.
            total_loss = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                labels.reshape(-1),
                ignore_index=256,
            )

            # Boundary prediction loss; id 3 marks special tokens.
            main_boundary_loss = None
            if boundary_labels is not None and 'boundary_logits' in encoder_outputs:
                main_boundary_loss = self._aligned_cross_entropy(
                    encoder_outputs['boundary_logits'], boundary_labels, ignore_index=3
                )
                if main_boundary_loss is not None:
                    total_loss = total_loss + main_boundary_loss * 0.3

            # Character-type prediction loss; id 13 marks special tokens.
            if char_types is not None and 'char_type_logits' in encoder_outputs:
                char_type_loss = self._aligned_cross_entropy(
                    encoder_outputs['char_type_logits'], char_types, ignore_index=13
                )
                if char_type_loss is not None:
                    total_loss = total_loss + char_type_loss * 0.2

            # Auxiliary boundary losses, added only when the main boundary
            # loss was computable; misaligned predictions are skipped.
            if main_boundary_loss is not None and encoder_outputs.get('boundary_predictions'):
                for boundary_pred in encoder_outputs['boundary_predictions']:
                    aux_boundary_loss = self._aligned_cross_entropy(
                        boundary_pred, boundary_labels, ignore_index=3
                    )
                    if aux_boundary_loss is not None:
                        total_loss = total_loss + aux_boundary_loss * 0.1

        return {
            'loss': total_loss,
            'logits': decoder_outputs['logits'],
            'encoder_hidden_states': encoder_hidden,
            'decoder_hidden_states': decoder_outputs['hidden_states'],
            'boundary_logits': encoder_outputs['boundary_logits'],
            'char_type_logits': encoder_outputs['char_type_logits'],
            'boundary_predictions': encoder_outputs.get('boundary_predictions'),
            'relation_logits': relation_logits,
            'cross_attention': cross_attn_outputs['cross_attention'] if cross_attn_outputs else None
        }