"""
Intelligent Loss Functions for v6.2.0

Multi-objective loss for the progressive splitting tokenizer, with
GPT-5-suggested improvements (dynamic weight balancing, consistency loss).
"""

import math
from typing import Dict, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F


class IntelligentLoss(nn.Module):
    """
    Comprehensive loss function for the progressive splitting tokenizer.

    Combines multiple objectives (reconstruction, compression, boundary,
    language, consistency) with dynamic weighting.
    """

    def __init__(self, config: Optional[Dict] = None):
        super().__init__()

        self.config = config or {}

        # Special token ids beyond the 256 raw byte values (vocab size 260).
        self.PAD = 256
        self.BOS = 257
        self.EOS = 258
        self.MASK = 259

        # Individual loss components.
        self.reconstruction_loss = ReconstructionLoss(self.PAD)
        self.compression_loss = CompressionLoss()
        self.boundary_loss = BoundaryLoss()
        self.language_loss = LanguageLoss()
        self.consistency_loss = ConsistencyLoss()

        # Dynamic weighting (magnitude balancing) and a per-component history
        # of the raw loss values seen during training.
        self.use_dynamic_weights = True
        self.weight_history = {
            'reconstruction': [],
            'compression': [],
            'boundary': [],
            'language': [],
            'consistency': []
        }

    def estimate_language_difficulty(self, targets: Dict) -> float:
        """Estimate language difficulty based on input characteristics."""
        if 'input_ids' not in targets:
            return 1.0

        input_ids = targets['input_ids']
        if input_ids.numel() == 0:
            return 1.0

        # Byte diversity as a crude difficulty proxy: the more distinct
        # values per position, the harder the sequence is assumed to be.
        unique_tokens = input_ids.unique().numel()
        total_tokens = input_ids.numel()
        diversity = min(1.0, (unique_tokens / total_tokens) * 2)

        return diversity
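
    # Illustrative numbers (not executed): a 48-byte sequence with 30 distinct
    # byte values gives 30/48 * 2 = 1.25 -> clamped to 1.0 (maximally diverse),
    # while 8 distinct values gives 8/48 * 2 ~ 0.33 (repetitive, "easier").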

    def forward(self,
                outputs: Dict[str, torch.Tensor],
                targets: Dict[str, torch.Tensor],
                weights: Optional[Dict[str, float]] = None) -> Dict[str, torch.Tensor]:
        """
        Compute the combined loss over all objectives.

        Args:
            outputs: Model outputs dictionary
            targets: Target values dictionary
            weights: Optional weight overrides

        Returns:
            Dictionary with the total loss and individual components
        """
        losses = {}

        # Each objective is computed only when the tensors it needs are present.
        if 'logits' in outputs and 'input_ids' in targets:
            losses['reconstruction'] = self.reconstruction_loss(
                outputs['logits'],
                targets['input_ids'],
                targets.get('attention_mask')
            )

        if 'compression_ratio' in outputs:
            losses['compression'] = self.compression_loss(
                outputs['compression_ratio'],
                outputs.get('num_tokens')
            )

        if 'boundaries' in outputs and 'boundary_targets' in targets:
            losses['boundary'] = self.boundary_loss(
                outputs['boundaries'],
                targets['boundary_targets'],
                targets.get('boundary_mask')
            )

        if 'language_clusters' in outputs and 'language_targets' in targets:
            losses['language'] = self.language_loss(
                outputs['language_clusters'],
                targets['language_targets']
            )

        if 'encoder_hidden' in outputs and 'decoder_hidden' in outputs:
            losses['consistency'] = self.consistency_loss(
                outputs['encoder_hidden'],
                outputs['decoder_hidden']
            )

        if not losses:
            raise ValueError("No recognised outputs/targets to compute a loss from")

        # Resolve weights: explicit overrides > dynamic balancing > static defaults.
        if weights is None and self.use_dynamic_weights:
            weights = self.compute_dynamic_weights(losses)
        elif weights is None:
            weights = {
                'reconstruction': 1.0,
                'compression': 1.0,
                'boundary': 1.0,
                'language': 0.5,
                'consistency': 0.5
            }

        # Iterate over a snapshot of the items, since the '*_weighted' entries
        # are added to the same dictionary while looping.
        total_loss = torch.tensor(0.0, device=next(iter(losses.values())).device)
        for key, loss in list(losses.items()):
            weighted = weights.get(key, 1.0) * loss
            total_loss = total_loss + weighted
            losses[f'{key}_weighted'] = weighted

        losses['total'] = total_loss

        # Record the raw (unweighted) component values for later inspection.
        for key in self.weight_history:
            if key in losses:
                self.weight_history[key].append(losses[key].item())

        return losses

    def compute_dynamic_weights(self, losses: Dict[str, torch.Tensor]) -> Dict[str, float]:
        """
        Dynamically adjust weights based on loss magnitudes and progress.

        GPT-5 suggestion: balance loss magnitudes for stable training.
        """
        if not losses:
            return {}

        weights = {}
        eps = 1e-8

        # Current magnitude of each component; non-finite values fall back to 1.
        magnitudes = {}
        for k, v in losses.items():
            if torch.isnan(v) or torch.isinf(v):
                magnitudes[k] = 1.0
            else:
                magnitudes[k] = v.item()

        avg_magnitude = max(eps, sum(magnitudes.values()) / len(magnitudes))

        # Inverse-magnitude weighting: larger losses get smaller weights so
        # every component contributes at a comparable scale.
        for key, magnitude in magnitudes.items():
            weights[key] = avg_magnitude / max(eps, magnitude)

        # Counteract the inverse weighting for the reconstruction/compression
        # pair: keep the focus on reconstruction while it is still much larger
        # than compression, and boost compression when the opposite holds.
        if 'reconstruction' in magnitudes and 'compression' in magnitudes:
            recon_loss = magnitudes['reconstruction']
            comp_loss = magnitudes['compression']

            if recon_loss > comp_loss * 10:
                weights['compression'] *= 0.1
                weights['reconstruction'] *= 5.0
            elif recon_loss > comp_loss * 5:
                weights['compression'] *= 0.5
                weights['reconstruction'] *= 2.0
            elif recon_loss < comp_loss * 0.5:
                weights['compression'] *= 2.0
                weights['reconstruction'] *= 0.5

        # Normalise so the weights average to roughly 1, capped at 10.
        total_weight = sum(weights.values())
        if total_weight > 0:
            weights = {k: min(10.0, v / total_weight * len(weights)) for k, v in weights.items()}

        return weights
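
    # Worked example (illustrative numbers only): raw losses of
    # {reconstruction: 5.54, compression: 0.17, boundary: 0.72} average ~2.14,
    # so the inverse-magnitude weights are roughly {0.39, 12.6, 3.0}. Since
    # reconstruction > 10x compression, the rebalancing step turns these into
    # {1.93, 1.26, 3.0}, and normalising to a mean of ~1 (capped at 10)
    # yields approximately {0.94, 0.61, 1.45}.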


class ReconstructionLoss(nn.Module):
    """
    Cross-entropy loss for sequence reconstruction,
    with label smoothing and an optional focal-loss mode.
    """

    def __init__(self, pad_token: int = 256, label_smoothing: float = 0.1):
        super().__init__()
        self.pad_token = pad_token
        self.label_smoothing = label_smoothing
        self.focal_alpha = 0.25
        self.focal_gamma = 2.0
        self.use_focal = False

    def forward(self,
                logits: torch.Tensor,
                targets: torch.Tensor,
                mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Compute the reconstruction loss.

        Args:
            logits: [batch, seq_len, vocab_size]
            targets: [batch, seq_len]
            mask: [batch, seq_len] attention mask
        """
        vocab_size = logits.size(-1)

        # Flatten to [batch * seq_len, ...] for token-level cross-entropy.
        logits_flat = logits.reshape(-1, vocab_size)
        targets_flat = targets.reshape(-1)

        if self.use_focal:
            # Focal loss: alpha * (1 - p_t)^gamma * CE, down-weighting tokens
            # the model already predicts confidently.
            ce_loss = F.cross_entropy(logits_flat, targets_flat, reduction='none')
            pt = torch.exp(-ce_loss)
            focal_loss = self.focal_alpha * (1 - pt) ** self.focal_gamma * ce_loss

            if mask is not None:
                mask_flat = mask.reshape(-1)
                focal_loss = focal_loss * mask_flat
                loss = focal_loss.sum() / mask_flat.sum().clamp(min=1)
            else:
                loss = focal_loss.mean()
        else:
            # Standard label-smoothed cross-entropy, ignoring PAD targets.
            if mask is not None:
                mask_flat = mask.reshape(-1).bool()
                loss = F.cross_entropy(
                    logits_flat[mask_flat],
                    targets_flat[mask_flat],
                    ignore_index=self.pad_token,
                    label_smoothing=self.label_smoothing
                )
            else:
                loss = F.cross_entropy(
                    logits_flat,
                    targets_flat,
                    ignore_index=self.pad_token,
                    label_smoothing=self.label_smoothing
                )

        return loss
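
# Illustrative focal-loss arithmetic with the defaults (alpha=0.25, gamma=2):
# an easy token with p_t = 0.9 has CE = -ln(0.9) ~ 0.105 and a focal factor of
# 0.25 * (1 - 0.9)^2 = 0.0025, giving ~0.0003; a hard token with p_t = 0.1 has
# CE ~ 2.303 and a focal factor of 0.25 * 0.81 ~ 0.20, giving ~0.47. Hard
# tokens therefore dominate the gradient far more than under plain cross-entropy.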


class CompressionLoss(nn.Module):
    """
    Aggressive compression loss: push for high compression.

    Must beat existing tokenizers (roughly 4 bytes/token, i.e. 4:1).
    """

    def __init__(self):
        super().__init__()

        # Acceptable band of compression ratios.
        self.min_ratio = 12.0
        self.target_ratio = 24.0
        self.max_ratio = 48.0

    def forward(self,
                compression_ratio: torch.Tensor,
                num_tokens: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Compute the compression loss (GPT fix: fully vectorized).

        Args:
            compression_ratio: Current compression ratio (scalar or batch)
            num_tokens: Number of tokens used (for an additional penalty)
        """
        if not torch.is_tensor(compression_ratio):
            device = num_tokens.device if torch.is_tensor(num_tokens) else torch.device('cpu')
            compression_ratio = torch.tensor(compression_ratio, dtype=torch.float32, device=device)

        zero = torch.zeros_like(compression_ratio)

        # Penalty for falling below the minimum acceptable ratio.
        under_loss = torch.clamp(self.min_ratio - compression_ratio, min=0.0) / self.min_ratio * 0.5

        # Between min and target: push towards the target.
        # At or above the target: a small logarithmic reward (negative loss).
        target_loss = torch.where(
            compression_ratio >= self.target_ratio,
            -0.1 * torch.log(compression_ratio / self.target_ratio + 1.0),
            torch.where(
                compression_ratio >= self.min_ratio,
                (self.target_ratio - compression_ratio) / self.target_ratio * 0.5,
                zero,
            ),
        )

        # Mild penalty for over-compressing far past the maximum ratio.
        over_loss = torch.clamp(compression_ratio - self.max_ratio, min=0.0) / self.max_ratio * 0.2

        loss = under_loss + target_loss + over_loss

        # Optional quadratic penalty once more than 8 tokens are used.
        if num_tokens is not None:
            if not torch.is_tensor(num_tokens):
                num_tokens = torch.tensor(num_tokens, dtype=torch.float32, device=compression_ratio.device)
            token_penalty = 0.1 * torch.clamp(num_tokens.float() - 8, min=0.0) ** 2
            loss = loss + token_penalty

        return loss.mean() if loss.dim() > 0 else loss
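
# Illustrative values of the piecewise objective (min=12, target=24, max=48):
#   ratio  8:1 -> under-compression penalty (12 - 8) / 12 * 0.5  ~ 0.167
#   ratio 16:1 -> push-to-target term       (24 - 16) / 24 * 0.5 ~ 0.167
#   ratio 24:1 -> log reward -0.1 * ln(24/24 + 1)                ~ -0.069
#   ratio 60:1 -> log reward ~ -0.125, plus over-compression (60 - 48) / 48 * 0.2 = 0.05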


class BoundaryLoss(nn.Module):
    """
    Learn meaningful chunk boundaries.

    Combines a masked BCE term with sparsity and smoothness regularisers.
    """

    def __init__(self):
        super().__init__()
        self.bce_loss = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self,
                predicted: torch.Tensor,
                target: torch.Tensor,
                mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Compute the boundary loss.

        Args:
            predicted: [batch, seq_len, boundary_classes] predicted boundary logits
            target: [batch, seq_len, boundary_classes] target boundaries
            mask: [batch, seq_len] valid-positions mask
        """
        # Per-position binary cross-entropy on the boundary logits.
        loss = self.bce_loss(predicted, target.float())

        if mask is not None:
            # Average only over valid (unpadded) positions.
            mask_expanded = mask.unsqueeze(-1).expand_as(loss).float()
            loss = loss * mask_expanded
            loss = loss.sum() / mask_expanded.sum().clamp(min=1.0)
        else:
            loss = loss.mean()

        # Sparsity: prefer few boundaries overall.
        boundary_probs = torch.sigmoid(predicted)
        sparsity_loss = 0.01 * boundary_probs.mean()

        # Smoothness: discourage abrupt jumps between adjacent boundary logits.
        if predicted.size(1) > 1:
            diff = predicted[:, 1:] - predicted[:, :-1]
            smoothness_loss = 0.01 * (diff ** 2).mean()
        else:
            smoothness_loss = 0.0

        total_loss = loss + sparsity_loss + smoothness_loss

        return total_loss
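
# Rough scale of the auxiliary terms (illustrative): predicting boundaries
# almost everywhere (mean sigmoid ~ 1.0) adds about 0.01 of sparsity penalty,
# a sparse prediction (mean ~ 0.1) adds about 0.001, and the smoothness term
# contributes 0.01 times the mean squared jump between adjacent logits.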


class LanguageLoss(nn.Module):
    """
    Language identification/clustering loss.

    Supports both classification and clustering objectives.
    """

    def __init__(self, num_languages: int = 128, temperature: float = 0.07):
        super().__init__()
        self.num_languages = num_languages
        self.temperature = temperature

        self.ce_loss = nn.CrossEntropyLoss()

    def forward(self,
                predicted: torch.Tensor,
                target: torch.Tensor,
                mode: str = 'classification') -> torch.Tensor:
        """
        Compute the language loss.

        Args:
            predicted: [batch, seq_len, num_languages] or [batch, num_languages]
                       logits (classification), or [batch, dim] embeddings (clustering)
            target: Language labels or cluster assignments
            mode: 'classification' or 'clustering'
        """
        if mode == 'classification':
            # Flatten per-position predictions to token-level classification.
            if predicted.dim() == 3:
                predicted = predicted.reshape(-1, self.num_languages)
                target = target.reshape(-1)

            loss = self.ce_loss(predicted, target)

        elif mode == 'clustering':
            # Contrastive (InfoNCE-style) objective over the batch: each
            # normalised embedding should be most similar to itself.
            predicted = F.normalize(predicted, dim=-1)

            sim_matrix = torch.matmul(predicted, predicted.t()) / self.temperature

            batch_size = predicted.size(0)
            labels = torch.arange(batch_size, device=predicted.device)

            loss = F.cross_entropy(sim_matrix, labels)

        else:
            raise ValueError(f"Unknown mode: {mode}")

        return loss
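
# In clustering mode the positives are the diagonal of the similarity matrix:
# with temperature 0.07, a sample's self-similarity of 1.0 becomes a logit of
# 1 / 0.07 ~ 14.3, while a neighbour at cosine similarity 0.5 contributes ~7.1,
# so the cross-entropy mostly pushes distinct samples' embeddings apart.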


class ConsistencyLoss(nn.Module):
    """
    Ensure consistency between encoder and decoder representations.

    GPT-5 suggestion: helps with training stability.
    """

    def __init__(self, margin: float = 0.5):
        super().__init__()
        self.margin = margin  # not currently used

    def forward(self,
                encoder_hidden: torch.Tensor,
                decoder_hidden: torch.Tensor) -> torch.Tensor:
        """
        Compute the consistency loss between encoder and decoder states.

        Args:
            encoder_hidden: [batch, seq_len, hidden_dim]
            decoder_hidden: [batch, seq_len, hidden_dim]
        """
        # If the sequence lengths differ, compare only the shared prefix.
        if encoder_hidden.shape != decoder_hidden.shape:
            min_len = min(encoder_hidden.size(1), decoder_hidden.size(1))
            encoder_hidden = encoder_hidden[:, :min_len]
            decoder_hidden = decoder_hidden[:, :min_len]

        # Magnitude agreement.
        l2_loss = F.mse_loss(encoder_hidden, decoder_hidden)

        # Directional agreement.
        encoder_norm = F.normalize(encoder_hidden, dim=-1)
        decoder_norm = F.normalize(decoder_hidden, dim=-1)
        cosine_sim = (encoder_norm * decoder_norm).sum(dim=-1)
        cosine_loss = 1.0 - cosine_sim.mean()

        loss = l2_loss + 0.5 * cosine_loss

        return loss
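
# Intuition (illustrative): identical encoder/decoder states give zero loss;
# a decoder state equal to the encoder state scaled by 2 keeps the cosine term
# at zero but is still penalised through the MSE term (magnitude drift), and
# fully anti-aligned states add 0.5 * (1 - (-1)) = 1.0 from the cosine term alone.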


class AdaptiveLossScheduler:
    """
    Dynamically adjust loss weights during training,
    based on training progress and performance.
    """

    def __init__(self, config: Dict):
        self.config = config
        self.current_phase = 0
        self.phase_epochs = [30, 60, 100]

        # Per-phase base weights.
        self.phase_weights = [
            # Phase 1 (epochs <= 30): emphasise boundary learning.
            {
                'reconstruction': 2.0,
                'compression': 0.5,
                'boundary': 3.0,
                'language': 0.5,
                'consistency': 0.5
            },
            # Phase 2 (epochs <= 60): emphasise compression.
            {
                'reconstruction': 2.0,
                'compression': 3.0,
                'boundary': 1.0,
                'language': 1.0,
                'consistency': 1.0
            },
            # Phase 3 (epochs <= 100 and beyond): emphasise reconstruction and consistency.
            {
                'reconstruction': 3.0,
                'compression': 2.0,
                'boundary': 1.0,
                'language': 1.0,
                'consistency': 1.5
            }
        ]

    def get_weights(self, epoch: int, metrics: Optional[Dict] = None) -> Dict[str, float]:
        """
        Get the current loss weights for a training epoch.

        Args:
            epoch: Current training epoch
            metrics: Optional performance metrics for adaptive adjustment
        """
        # Pick the first phase whose end epoch has not been passed yet.
        for i, phase_end in enumerate(self.phase_epochs):
            if epoch <= phase_end:
                self.current_phase = i
                break
        else:
            # Past the last configured boundary: stay in the final phase.
            self.current_phase = len(self.phase_epochs) - 1

        weights = self.phase_weights[self.current_phase].copy()

        if metrics:
            # Boost reconstruction while accuracy is still poor.
            if metrics.get('reconstruction_accuracy', 1.0) < 0.9:
                weights['reconstruction'] *= 1.5

            # Boost compression when the ratio drifts out of the working band.
            compression_ratio = metrics.get('compression_ratio', 16.0)
            if compression_ratio < 8.0 or compression_ratio > 20.0:
                weights['compression'] *= 1.5

        return weights


if __name__ == "__main__":
    # Quick smoke test with random tensors.
    print("Testing Intelligent Loss Functions")

    loss_fn = IntelligentLoss()

    batch_size = 2
    seq_len = 48
    vocab_size = 260
    hidden_dim = 1280

    outputs = {
        'logits': torch.randn(batch_size, seq_len, vocab_size),
        'compression_ratio': torch.tensor(16.0),
        'num_tokens': torch.tensor(3),
        'boundaries': torch.randn(batch_size, seq_len, 4),
        'language_clusters': torch.randn(batch_size, 128),
        'encoder_hidden': torch.randn(batch_size, seq_len, hidden_dim),
        'decoder_hidden': torch.randn(batch_size, seq_len, hidden_dim)
    }

    targets = {
        'input_ids': torch.randint(0, 256, (batch_size, seq_len)),
        'attention_mask': torch.ones(batch_size, seq_len),
        'boundary_targets': torch.zeros(batch_size, seq_len, 4),
        'language_targets': torch.randint(0, 128, (batch_size,))
    }

    losses = loss_fn(outputs, targets)

    print("\nLoss components:")
    for key, value in losses.items():
        if isinstance(value, torch.Tensor):
            print(f"  {key}: {value.item():.4f}")

    scheduler = AdaptiveLossScheduler({})

    print("\nPhase weights:")
    for epoch in [10, 40, 70]:
        weights = scheduler.get_weights(epoch)
        print(f"  Epoch {epoch}: {weights}")
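
    # Additional illustrative checks (assumed usage, not part of any training
    # loop): sweep the compression objective across a few ratios and exercise
    # the scheduler's metric-driven adjustment.
    comp_loss = CompressionLoss()
    print("\nCompression loss by ratio:")
    for ratio in [4.0, 8.0, 12.0, 16.0, 24.0, 48.0, 64.0]:
        value = comp_loss(torch.tensor(ratio))
        print(f"  {ratio:5.1f}:1 -> {value.item():+.4f}")

    # Poor reconstruction accuracy boosts the reconstruction weight, and an
    # out-of-band compression ratio boosts the compression weight.
    adjusted = scheduler.get_weights(40, metrics={'reconstruction_accuracy': 0.85,
                                                  'compression_ratio': 6.0})
    print(f"\nAdjusted weights at epoch 40: {adjusted}")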