"""
Intelligent Tokenizer v6.2.0 - Progressive Splitting Encoder
Incorporates GPT-5-suggested improvements
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Dict, List, Optional, Tuple
import math
class RoPEPositionalEncoding(nn.Module):
"""
Rotary Position Embedding (RoPE) - GPT-5 suggestion
Better for handling chunk boundaries and variable sequence lengths
"""
def __init__(self, dim: int, max_seq_len: int = 48, base: int = 10000):
super().__init__()
self.dim = dim
self.max_seq_len = max_seq_len
self.base = base
# Precompute sinusoidal frequencies
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
self.register_buffer('inv_freq', inv_freq)
# Precompute positional encodings
t = torch.arange(max_seq_len).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
self.register_buffer('cos_cached', freqs.cos())
self.register_buffer('sin_cached', freqs.sin())
def forward(self, x: torch.Tensor, seq_len: int = None) -> torch.Tensor:
"""
Apply RoPE to input tensor
Handles chunk boundary corrections as suggested by GPT-5
"""
if seq_len is None:
seq_len = x.shape[1]
# Get cached cos/sin values
cos = self.cos_cached[:seq_len]
sin = self.sin_cached[:seq_len]
# Apply rotary embedding
x_rot = self._apply_rotary_emb(x, cos, sin)
return x_rot
def _apply_rotary_emb(self, x, cos, sin):
"""Apply rotary embedding to input"""
x1, x2 = x[..., ::2], x[..., 1::2]
x_rot = torch.stack([
x1 * cos - x2 * sin,
x1 * sin + x2 * cos
], dim=-1).flatten(-2)
return x_rot
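# A minimal sketch of what _apply_rotary_emb does to one (even, odd) feature pair,
# assuming position index t and frequency inv_freq[j] (illustrative only, not executed):
#
#   theta      = t * inv_freq[j]
#   x_even_rot = x_even * cos(theta) - x_odd * sin(theta)
#   x_odd_rot  = x_even * sin(theta) + x_odd * cos(theta)
#
# Hypothetical usage:
#   rope = RoPEPositionalEncoding(dim=1280, max_seq_len=48)
#   h = torch.randn(2, 4, 1280)   # [batch, tokens, hidden]
#   h_rot = rope(h)               # same shape, rotary phases applied per feature pair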
class GatedCrossAttention(nn.Module):
"""
Gated Cross-Attention with MQA - GPT-5 suggestion
Monitor gate values for quality assessment
    16 query heads → 2 KV heads for an ~8x reduction in K/V memory
"""
def __init__(self, hidden_dim: int = 1280, num_heads: int = 16, kv_heads: int = 2):
super().__init__()
self.hidden_dim = hidden_dim
self.num_heads = num_heads
self.kv_heads = kv_heads # Reduced KV heads (GPT suggestion)
self.head_dim = hidden_dim // num_heads # 80
# Multi-Query Attention projections
self.q_proj = nn.Linear(hidden_dim, hidden_dim) # 16 heads
self.k_proj = nn.Linear(hidden_dim, kv_heads * self.head_dim) # 2 heads
self.v_proj = nn.Linear(hidden_dim, kv_heads * self.head_dim) # 2 heads
self.o_proj = nn.Linear(hidden_dim, hidden_dim)
# Gating mechanism (GPT-5 suggestion)
self.gate = nn.Sequential(
nn.Linear(hidden_dim * 2, hidden_dim),
nn.Sigmoid()
)
# Gate monitoring (for analysis)
self.register_buffer('gate_values', torch.zeros(1))
# Warmup factor (GPT suggestion)
self.register_buffer('warmup_alpha', torch.tensor(1.0))
def forward(self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Forward pass with gate monitoring
Returns: (output, gate_values)
"""
batch_size, seq_len = query.shape[:2]
# Multi-head attention projections
Q = self.q_proj(query).view(batch_size, seq_len, self.num_heads, self.head_dim)
K = self.k_proj(key).view(batch_size, -1, self.kv_heads, self.head_dim)
V = self.v_proj(value).view(batch_size, -1, self.kv_heads, self.head_dim)
# Transpose for attention computation
Q = Q.transpose(1, 2) # [batch, heads, seq, dim]
K = K.transpose(1, 2) # [batch, kv_heads, seq, dim]
V = V.transpose(1, 2)
# Repeat KV heads to match Q heads if necessary
if self.kv_heads < self.num_heads:
repeat_factor = self.num_heads // self.kv_heads
K = K.repeat_interleave(repeat_factor, dim=1)
V = V.repeat_interleave(repeat_factor, dim=1)
# Scaled dot-product attention
scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
if mask is not None:
scores = scores.masked_fill(mask == 0, -1e9)
attn_weights = F.softmax(scores, dim=-1)
attn_output = torch.matmul(attn_weights, V)
# Reshape back
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.view(batch_size, seq_len, self.hidden_dim)
attn_output = self.o_proj(attn_output)
# Gating mechanism
gate_input = torch.cat([query, attn_output], dim=-1)
gate_values = self.gate(gate_input)
# Store gate values for monitoring (keep tensor shape consistent)
self.gate_values[0] = gate_values.mean().detach()
# Apply gate with warmup factor (GPT suggestion)
gate_values = gate_values * self.warmup_alpha
output = gate_values * attn_output + (1 - gate_values) * query
return output, gate_values
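# Rough K/V memory arithmetic for the MQA layout above (a sketch, not a benchmark):
# the 16 query heads use the full 1280-dim q_proj, while k_proj and v_proj only
# produce kv_heads * head_dim = 2 * 80 = 160 dims each, an 8x reduction in K/V
# activations before repeat_interleave expands them back for the attention matmul.
#
# Hypothetical usage (gate values can be monitored as the docstring suggests):
#   xattn = GatedCrossAttention(hidden_dim=1280, num_heads=16, kv_heads=2)
#   q = torch.randn(2, 4, 1280)
#   out, gate = xattn(q, q, q)    # out: [2, 4, 1280], gate: [2, 4, 1280] in (0, 1)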
class ProgressiveSplittingLayer(nn.Module):
"""
    Core innovation: 48 bytes → 1 token → N tokens → M tokens
"""
def __init__(self, hidden_dim: int = 1280, config: Optional[Dict] = None):
super().__init__()
self.hidden_dim = hidden_dim
self.config = config or {}
# Dynamic splitting: 1~4 tokens for efficiency
# 48 bytes / 4 tokens = 12:1 compression (still beats BPE's 4:1)
self.min_tokens = 1 # 48:1 compression
self.max_tokens = 4 # 12:1 compression (still 3x better than BPE)
        # Initial compression: 48 bytes → 1 super token
self.byte_embed = nn.Embedding(260, 64) # Small embedding
self.initial_compressor = nn.Sequential(
nn.Linear(48 * 64, 2048),
nn.LayerNorm(2048),
nn.ReLU(),
nn.Dropout(0.1),
nn.Linear(2048, hidden_dim),
nn.LayerNorm(hidden_dim)
)
        # Language-aware splitting: 1 → N tokens (config-based)
self.language_splitter = nn.ModuleDict({
'analyzer': nn.Sequential(
nn.Linear(hidden_dim, 512),
nn.ReLU(),
nn.Linear(512, 256) # Language features
),
'split_predictor': nn.Linear(256, self.max_tokens), # Predict 1~4 tokens
# Single unified expander that can produce any number of tokens
'dynamic_expander': nn.Sequential(
nn.Linear(hidden_dim, hidden_dim * 2),
nn.LayerNorm(hidden_dim * 2),
nn.GELU(), # Better than ReLU for transformers
nn.Linear(hidden_dim * 2, hidden_dim * self.max_tokens) # Can produce up to 4 tokens
),
# Token-wise importance predictor
'importance_predictor': nn.Sequential(
nn.Linear(hidden_dim, 256),
nn.ReLU(),
nn.Linear(256, self.max_tokens), # Importance for each potential token
nn.Softmax(dim=-1)
)
})
        # Boundary refinement: N → M tokens with linguistic awareness
self.boundary_refiner = nn.ModuleDict({
'scorer': nn.Sequential(
nn.Linear(hidden_dim, 512),
nn.ReLU(),
nn.Linear(512, 1)
),
            'morpheme_detector': nn.Conv1d(256, 64, 3),   # morpheme-level boundaries
            'word_detector': nn.Conv1d(256, 64, 5),       # word-level boundaries
            'phrase_detector': nn.Conv1d(256, 64, 7),     # phrase-level boundaries
'adjuster': nn.TransformerEncoderLayer(
d_model=hidden_dim,
nhead=16,
dim_feedforward=4 * hidden_dim,
dropout=0.1,
batch_first=True
)
})
# Initialize split_predictor bias to prefer 1 token initially
# This ensures untrained model starts with maximum compression
with torch.no_grad():
self.language_splitter['split_predictor'].bias.data = torch.tensor([2.0, -1.0, -1.0, -1.0])
# High bias for 1 token, negative for others
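        # Worked example of this initialization (a sketch): softmax([2.0, -1.0, -1.0, -1.0])
        # puts roughly 0.87 probability on the 1-token split and about 0.04 on each of the
        # other three, so an untrained model defaults to the maximum 48:1 compression.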
def forward(self, input_ids: torch.Tensor, temperature: float = 1.0) -> Dict[str, torch.Tensor]:
"""
Progressive splitting forward pass
Args:
input_ids: Input byte sequence [batch, seq_len]
temperature: Gumbel-Softmax temperature for annealing
"""
batch_size = input_ids.size(0)
        # Step 1: 48 bytes → 1 super token
byte_embeddings = self.byte_embed(input_ids) # [batch, 48, 64]
flattened = byte_embeddings.view(batch_size, -1) # [batch, 3072]
super_token = self.initial_compressor(flattened) # [batch, 1280]
super_token = super_token.unsqueeze(1) # [batch, 1, 1280]
        # Step 2: Language analysis and splitting (1 → N)
        lang_features = self.language_splitter['analyzer'](super_token)
        split_logits = self.language_splitter['split_predictor'](lang_features)
        split_weights = F.softmax(split_logits, dim=-1)  # [batch, 1, 4]; recomputed below with Gumbel-Softmax
# Direct transformation from super token to initial representation
# No hardcoded splits - let the model learn everything
lang_tokens = super_token # Start with compressed representation
# TRUE Adaptive expansion - Model learns optimal split (1~4 tokens)
# Analyze content to decide how many tokens needed
expansion_features = self.language_splitter['analyzer'](lang_tokens) # [batch, 1, 256]
# Dynamic expansion: generate up to 4 tokens from super token
expanded = self.language_splitter['dynamic_expander'](lang_tokens.squeeze(1)) # [batch, hidden_dim*4]
expanded = expanded.reshape(batch_size, self.max_tokens, self.hidden_dim) # [batch, 4, hidden_dim]
# Predict how many tokens we actually need (1~4)
split_logits = self.language_splitter['split_predictor'](expansion_features.squeeze(1)) # [batch, 4]
# Clamp logits to prevent extreme values that cause NaN
split_logits = torch.clamp(split_logits, min=-10, max=10)
# Ensure minimum temperature to prevent instability
safe_temperature = max(temperature, 0.5)
split_weights = F.gumbel_softmax(split_logits, tau=safe_temperature, hard=False, dim=-1) # [batch, 4]
# Predict importance for each potential token position
importance = self.language_splitter['importance_predictor'](lang_tokens.squeeze(1)) # [batch, 4]
# Dynamic token selection with importance-weighted allocation
# Create cumulative mask for progressive token usage
# If split_weights = [0.1, 0.2, 0.6, 0.1], we mainly use 3 tokens
# Create progressive masks for 1, 2, 3, 4 tokens
masks = []
for n in range(1, self.max_tokens + 1):
mask = torch.zeros(batch_size, self.max_tokens, 1, device=expanded.device)
mask[:, :n, :] = 1.0
masks.append(mask)
# Apply importance-weighted masking
# Important parts get more tokens, less important parts get fewer
weighted_outputs = []
for i, mask in enumerate(masks):
num_tokens = i + 1
# Weight by both split decision and importance
token_weight = split_weights[:, i:i+1].unsqueeze(-1) # [batch, 1, 1]
# Apply importance modulation for asymmetric splits
if num_tokens > 1:
# Redistribute tokens based on importance
importance_adjusted = importance[:, :num_tokens].unsqueeze(-1) # [batch, n, 1]
masked = expanded[:, :num_tokens] * importance_adjusted
else:
masked = expanded[:, :num_tokens]
# Pad to max length
if num_tokens < self.max_tokens:
padding = torch.zeros(batch_size, self.max_tokens - num_tokens, self.hidden_dim,
device=expanded.device)
masked = torch.cat([masked, padding], dim=1)
weighted_outputs.append(masked * token_weight)
# Sum all weighted possibilities (differentiable selection)
lang_tokens = sum(weighted_outputs)
# Determine effective number of tokens (for monitoring)
# Weighted average of token counts
token_counts = torch.arange(1, self.max_tokens + 1, device=split_weights.device, dtype=torch.float32)
avg_tokens = (split_weights * token_counts).sum(dim=-1).mean().item()
k = lang_tokens.size(1)
        # Step 3: Boundary refinement (N → M)
# Calculate boundary scores for each token position
boundary_scores = self.boundary_refiner['scorer'](lang_tokens) # [batch, N, 1]
# Detect linguistic boundaries (morpheme, word, phrase)
# Extract features for boundary detection
        if lang_tokens.dim() == 3:  # always true here; kept as a safety check
batch_size, num_tokens, hidden_dim = lang_tokens.shape
# For boundary detection, we need to consider the original byte sequence
# But we're working with compressed tokens here
# So we detect boundaries based on learned representations
# Apply boundary adjustment with TransformerEncoderLayer
# This learns to adjust token boundaries based on context
refined_tokens = self.boundary_refiner['adjuster'](lang_tokens)
# The adjuster should learn to:
# 1. Respect UTF-8 boundaries (learned during training)
# 2. Align with word/phrase boundaries (learned from language patterns)
# 3. Maintain semantic coherence within each token
else:
refined_tokens = lang_tokens
# Determine actual number of tokens based on highest probability
# During inference, use argmax. During training, use weighted average.
if self.training:
# During training, use weighted average for differentiability
actual_num_tokens = avg_tokens
else:
# During inference, select the split with highest probability
split_decision = torch.argmax(split_weights, dim=-1) # [batch]
actual_num_tokens = (split_decision.float().mean() + 1).item() # +1 because indices are 0-3
# Calculate compression ratio based on actual tokens used
compression_ratio = 48.0 / max(1, actual_num_tokens)
return {
'tokens': refined_tokens,
'num_tokens': actual_num_tokens,
'compression_ratio': torch.tensor(compression_ratio, device=refined_tokens.device),
'gate_values': None, # Will be filled by cross-attention
'language_features': lang_features,
'split_weights': split_weights,
            'avg_tokens': avg_tokens,
            'split_distribution': split_weights.mean(dim=0)
}
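# Compression arithmetic for ProgressiveSplittingLayer (a sketch): one 48-byte window
# maps to between 1 and max_tokens = 4 latent tokens, i.e. ratios between 48:1 and 12:1.
# With the training-time weighted average, e.g. split_weights = [0.1, 0.2, 0.6, 0.1]
# gives 0.1*1 + 0.2*2 + 0.6*3 + 0.1*4 = 2.7 effective tokens, roughly a 17.8:1 ratio.
#
# Hypothetical usage:
#   splitter = ProgressiveSplittingLayer(hidden_dim=1280)
#   bytes_in = torch.randint(0, 256, (2, 48))
#   out = splitter(bytes_in, temperature=1.0)
#   print(out['tokens'].shape, float(out['compression_ratio']))  # [2, 4, 1280], ~48/avg_tokens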
class EncoderV62(nn.Module):
"""
4-Layer Progressive Splitting Encoder with Cross-Attention
All layers: 1280 dimensions
"""
def __init__(self, config: Optional[Dict] = None):
super().__init__()
# Store config for later use
self.config = config or {}
# Configuration
self.hidden_dim = 1280
self.num_heads = 16
self.num_layers = 4
self.max_seq_len = 48
self.dropout = 0.1
# RoPE positional encoding (GPT-5 suggestion)
self.rope = RoPEPositionalEncoding(self.hidden_dim, self.max_seq_len)
# Layer 0: Progressive Splitting (48→1→N→M) - Pass config
self.progressive_splitter = ProgressiveSplittingLayer(self.hidden_dim, config)
# Layers 1-3: Transformer encoders with cross-attention
self.encoder_layers = nn.ModuleList([
nn.TransformerEncoderLayer(
d_model=self.hidden_dim,
nhead=self.num_heads,
dim_feedforward=4 * self.hidden_dim, # 5120
dropout=self.dropout,
batch_first=True
) for _ in range(3)
])
# Cross-attention between layers with MQA (GPT-5 suggestion)
self.cross_attentions = nn.ModuleList([
GatedCrossAttention(self.hidden_dim, self.num_heads, kv_heads=2) # 8x memory reduction
for _ in range(3)
])
# Output heads for different tasks
self.boundary_head = nn.Linear(self.hidden_dim, 4)
self.language_head = nn.Linear(self.hidden_dim, 128) # Reduced from 512 (GPT suggestion)
self.compression_head = nn.Linear(self.hidden_dim, self.hidden_dim)
# Monitoring metrics (GPT-5 suggestion)
self.register_buffer('compression_ratios', torch.zeros(1))
self.register_buffer('gate_averages', torch.zeros(3))
def forward(self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
temperature: float = 1.0) -> Dict[str, torch.Tensor]:
"""
Forward pass through the encoder
Args:
input_ids: Input byte sequence
attention_mask: Optional attention mask
temperature: Gumbel-Softmax temperature for annealing
"""
# Layer 0: Progressive splitting with temperature
split_output = self.progressive_splitter(input_ids, temperature)
x = split_output['tokens'] # [batch, M, 1280]
# Apply RoPE
x = self.rope(x, x.size(1))
# Store all hidden states for decoder
all_hidden_states = [x]
gate_values_list = []
# Layers 1-3 with cross-attention
for i, (encoder_layer, cross_attn) in enumerate(
zip(self.encoder_layers, self.cross_attentions)
):
# Self-attention through transformer layer
            # Per GPT-5 review: no attention mask is passed here, because progressive
            # splitting already changed the sequence length and removed the padding.
            x = encoder_layer(x)
            # Cross-attention with the previous layer's hidden states (skipped for the first layer)
            if i > 0:
x, gate_values = cross_attn(
query=x,
key=all_hidden_states[-1],
value=all_hidden_states[-1],
mask=None # Mask not applicable after compression
)
gate_values_list.append(gate_values)
                # Keep the buffer shape fixed; cross-attention runs for i in {1, 2},
                # so slots 0 and 1 are updated and slot 2 keeps its initial value.
                self.gate_averages[i - 1] = gate_values.mean().detach().item()
all_hidden_states.append(x)
# Output projections
boundaries = self.boundary_head(x)
language_clusters = self.language_head(x)
compressed = self.compression_head(x)
# Update monitoring metrics
# Ensure tensor is 1-dimensional for buffer assignment
compression_ratio = split_output['compression_ratio']
if compression_ratio.dim() == 0: # Scalar tensor
self.compression_ratios[0] = compression_ratio
else:
self.compression_ratios = compression_ratio
return {
'last_hidden_state': x,
'all_hidden_states': all_hidden_states,
'boundaries': boundaries,
'language_clusters': language_clusters,
'compressed': compressed,
'compression_ratio': split_output['compression_ratio'],
'num_tokens': split_output['num_tokens'],
'splitting_probs': split_output.get('split_weights', None), # Add for diagnostics
'gate_values': gate_values_list,
'gate_averages': self.gate_averages,
'split_info': {
'language_features': split_output['language_features'],
'split_weights': split_output['split_weights']
}
}
def get_monitoring_stats(self) -> Dict[str, float]:
"""
Get monitoring statistics (GPT-5 suggestion)
"""
return {
'avg_compression_ratio': self.compression_ratios.item(),
'gate_layer1': self.gate_averages[0].item(),
'gate_layer2': self.gate_averages[1].item(),
'gate_layer3': self.gate_averages[2].item(),
}
def set_warmup_step(self, step: int, total_warmup: int = 1000):
"""
Set warmup alpha for all gates (GPT suggestion)
Gradually increase gate influence from 0 to 1
"""
alpha = min(1.0, step / total_warmup)
for cross_attn in self.cross_attentions:
cross_attn.warmup_alpha = torch.tensor(alpha, device=cross_attn.warmup_alpha.device)
def adaptive_compression_control(self, reconstruction_loss: float):
"""
Adaptive compression based on reconstruction quality
No fixed phases - model learns optimal compression
"""
# If reconstruction is poor, model will learn to use more tokens
# This happens automatically through gradient descent
# No manual phase control needed
pass # Let gradients handle it
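# Sketch of how the annealing hooks above could be driven from a training loop;
# the schedule values here are illustrative assumptions, not part of this module:
#
#   encoder = EncoderV62()
#   for step, batch in enumerate(loader):
#       temperature = max(0.5, 1.0 - step / 10_000)       # Gumbel-Softmax annealing
#       encoder.set_warmup_step(step, total_warmup=1000)  # gate influence ramps 0 -> 1
#       out = encoder(batch['input_ids'], temperature=temperature)
#       stats = encoder.get_monitoring_stats()            # compression ratio + gate means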
class DualSlidingWindowEncoder(EncoderV62):
"""
Extension with dual sliding window system
Handles both chunk-level and token-level boundaries
"""
def __init__(self, config: Optional[Dict] = None):
super().__init__(config)
        # Chunk-level sliding window (defined for byte-level overlap; not used in process_long_sequence)
self.chunk_window = nn.Conv1d(
in_channels=1,
out_channels=1,
kernel_size=8, # 8-byte overlap
stride=40, # 48-8=40 stride
padding=4
)
# Token-level sliding window
self.token_window = nn.MultiheadAttention(
embed_dim=self.hidden_dim,
num_heads=self.num_heads,
batch_first=True
)
    def process_long_sequence(self, input_ids: torch.Tensor) -> Dict[str, torch.Tensor]:
"""
Handle sequences longer than 48 bytes with sliding windows
"""
batch_size, seq_len = input_ids.shape
if seq_len <= 48:
return super().forward(input_ids)
        # Process in 48-byte windows with an 8-byte overlap (stride 40).
        # Note: any trailing bytes that do not fill a complete window are skipped.
        chunks = []
        for i in range(0, seq_len - 48 + 1, 40):
chunk = input_ids[:, i:i+48]
chunk_output = super().forward(chunk)
chunks.append(chunk_output['last_hidden_state'])
# Combine chunks with attention
combined = torch.cat(chunks, dim=1)
attended, _ = self.token_window(combined, combined, combined)
return {
'last_hidden_state': attended,
'num_chunks': len(chunks),
'total_compression': seq_len / attended.size(1)
}
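# Window arithmetic for process_long_sequence (a sketch): with a 48-byte window and a
# stride of 40, consecutive windows overlap by 8 bytes, so a sequence of L bytes yields
# floor((L - 48) / 40) + 1 full windows; e.g. L = 128 gives windows at offsets 0, 40, 80.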
if __name__ == "__main__":
# Test the encoder
encoder = EncoderV62()
# Test input
batch_size = 2
input_ids = torch.randint(0, 256, (batch_size, 48))
# Forward pass
output = encoder(input_ids)
print(f"Input shape: {input_ids.shape}")
print(f"Output tokens: {output['num_tokens']}")
print(f"Compression ratio: {output['compression_ratio']:.2f}:1")
print(f"Gate averages: {output['gate_averages']}")
print(f"Monitoring stats: {encoder.get_monitoring_stats()}")