"""
Learning Rate Schedulers for v6.2.0
Advanced scheduling with warmup and phase-based adjustments
"""
import torch
import math
from typing import Optional, Dict, List, Any
class WarmupCosineScheduler:
"""
Cosine annealing with linear warmup
GPT-5 suggested: Essential for stable progressive splitting training
"""
def __init__(self,
optimizer: torch.optim.Optimizer,
warmup_steps: int,
total_steps: int,
min_lr: float = 1e-6,
max_lr: Optional[float] = None):
self.optimizer = optimizer
self.warmup_steps = warmup_steps
self.total_steps = total_steps
self.min_lr = min_lr
self.max_lr = max_lr or optimizer.param_groups[0]['lr']
self.current_step = 0
def step(self):
"""Update learning rate"""
self.current_step += 1
if self.current_step <= self.warmup_steps:
# Linear warmup
lr = self.max_lr * (self.current_step / self.warmup_steps)
else:
# Cosine annealing (GPT fix: guard against division by zero)
if self.total_steps <= self.warmup_steps:
lr = self.min_lr
else:
progress = (self.current_step - self.warmup_steps) / max(1, self.total_steps - self.warmup_steps)
progress = min(1.0, max(0.0, progress)) # Clamp to [0, 1]
lr = self.min_lr + (self.max_lr - self.min_lr) * 0.5 * (1 + math.cos(math.pi * progress))
for param_group in self.optimizer.param_groups:
param_group['lr'] = lr
return lr
def get_lr(self):
"""Get current learning rate"""
return self.optimizer.param_groups[0]['lr']
class PhaseBasedScheduler:
"""
Curriculum learning scheduler with phase transitions
Adjusts learning rate based on training phases
"""
def __init__(self,
optimizer: torch.optim.Optimizer,
phase_configs: List[Dict],
current_epoch: int = 0):
"""
Args:
optimizer: PyTorch optimizer
phase_configs: List of phase configurations
[{
'epochs': (start, end),
'lr': learning_rate,
'warmup_epochs': warmup_duration
}, ...]
"""
self.optimizer = optimizer
self.phase_configs = phase_configs
self.current_epoch = current_epoch
self.current_phase = 0
self.base_lr = optimizer.param_groups[0]['lr']
def step(self, epoch: Optional[int] = None):
"""Update learning rate based on current phase"""
if epoch is not None:
self.current_epoch = epoch
# Find current phase
for i, phase in enumerate(self.phase_configs):
start_epoch, end_epoch = phase['epochs']
if start_epoch <= self.current_epoch <= end_epoch:
self.current_phase = i
break
phase = self.phase_configs[self.current_phase]
target_lr = phase['lr']
warmup_epochs = phase.get('warmup_epochs', 0)
start_epoch = phase['epochs'][0]
# Apply warmup if in warmup period
if self.current_epoch - start_epoch < warmup_epochs:
warmup_progress = (self.current_epoch - start_epoch + 1) / warmup_epochs
lr = target_lr * warmup_progress
else:
lr = target_lr
# Update optimizer
for param_group in self.optimizer.param_groups:
param_group['lr'] = lr
return lr
class AdaptiveScheduler:
"""
Adaptive learning rate based on validation metrics
Reduces LR when metrics plateau
"""
def __init__(self,
optimizer: torch.optim.Optimizer,
mode: str = 'min',
factor: float = 0.5,
patience: int = 10,
threshold: float = 1e-4,
min_lr: float = 1e-7):
"""
Args:
optimizer: PyTorch optimizer
            mode: 'min' or 'max' - reduce LR when the metric stops decreasing ('min') or stops increasing ('max')
factor: Factor to reduce LR by
patience: Number of epochs with no improvement to wait
threshold: Minimum change to qualify as improvement
min_lr: Minimum learning rate
"""
self.optimizer = optimizer
self.mode = mode
self.factor = factor
self.patience = patience
self.threshold = threshold
self.min_lr = min_lr
self.best_score = None
self.num_bad_epochs = 0
self.last_reduction = 0
def step(self, metric: float, epoch: int = 0):
"""Update learning rate based on metric"""
current_lr = self.optimizer.param_groups[0]['lr']
if self.best_score is None:
self.best_score = metric
else:
if self.mode == 'min':
improved = metric < self.best_score - self.threshold
else:
improved = metric > self.best_score + self.threshold
if improved:
self.best_score = metric
self.num_bad_epochs = 0
else:
self.num_bad_epochs += 1
# Reduce LR if patience exceeded
if self.num_bad_epochs >= self.patience:
new_lr = max(current_lr * self.factor, self.min_lr)
if new_lr < current_lr:
print(f"Reducing learning rate from {current_lr:.2e} to {new_lr:.2e}")
for param_group in self.optimizer.param_groups:
param_group['lr'] = new_lr
self.num_bad_epochs = 0
self.last_reduction = epoch
        # Return the LR actually in effect after any reduction
        return self.optimizer.param_groups[0]['lr']
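
# Illustrative usage sketch for AdaptiveScheduler (not part of the training
# pipeline): feeds a synthetic, plateauing validation loss so the patience-based
# LR reduction can be observed. The demo function name is our own addition.
def _demo_adaptive_scheduler():
    model = torch.nn.Linear(4, 4)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = AdaptiveScheduler(optimizer, mode='min', factor=0.5, patience=3)
    # Synthetic losses: improve briefly, then plateau to trigger a reduction
    losses = [1.0, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7]
    for epoch, loss in enumerate(losses):
        lr = scheduler.step(loss, epoch)
        print(f"  Epoch {epoch}: loss={loss:.2f}, LR={lr:.2e}")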
class ProgressiveSplittingScheduler:
"""
Adaptive scheduler for progressive splitting
No fixed targets - adjusts based on quality feedback
"""
def __init__(self,
optimizer: torch.optim.Optimizer,
initial_lr: float = 1e-4,
min_reconstruction: float = 0.85,
ema: float = 0.98,
min_lr: float = 1e-7):
self.optimizer = optimizer
self.initial_lr = initial_lr
self.min_reconstruction = min_reconstruction # Quality threshold
self.ema = ema
self.min_lr = min_lr
# Adaptive multipliers based on performance
self.quality_multiplier = 1.0 # Adjusts with reconstruction quality
# No phases - continuous adaptation
self.current_state = 'learning'
# EMA tracking for smooth transitions
self._ema_comp = None
self._ema_recon = None
def step(self, metrics: Dict[str, float]):
"""
Update learning rate based on current metrics
GPT fix: EMA smoothing and minimum floor
Args:
metrics: Dictionary containing:
- compression_ratio: Current compression ratio
- reconstruction_acc: Reconstruction accuracy
"""
compression_ratio = float(metrics.get('compression_ratio', 0.0))
reconstruction_acc = float(metrics.get('reconstruction_acc', 0.0))
# Update EMA (GPT fix: smooth transitions)
if self._ema_comp is None:
self._ema_comp = compression_ratio
self._ema_recon = reconstruction_acc
else:
self._ema_comp = self.ema * self._ema_comp + (1 - self.ema) * compression_ratio
self._ema_recon = self.ema * self._ema_recon + (1 - self.ema) * reconstruction_acc
# Adaptive adjustment based on reconstruction quality only
# No fixed compression target - emerges from quality
if self._ema_recon < self.min_reconstruction:
# Poor reconstruction - reduce LR for careful learning
self.quality_multiplier = max(0.5, self._ema_recon)
else:
# Good reconstruction - normal learning
self.quality_multiplier = 1.0
# Smooth LR changes
reconstruction_factor = max(0.1, self._ema_recon)
# Combined learning rate (adaptive, no phase multiplier)
lr = self.initial_lr * self.quality_multiplier * reconstruction_factor
lr = max(lr, self.min_lr) # Ensure minimum LR
# Update optimizer
for param_group in self.optimizer.param_groups:
param_group['lr'] = lr
return lr
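
# Illustrative sketch of ProgressiveSplittingScheduler driven by synthetic metrics
# (in real training the metrics come from the training loop; the values below are
# made up for demonstration).
def _demo_progressive_splitting_scheduler():
    model = torch.nn.Linear(4, 4)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = ProgressiveSplittingScheduler(optimizer, initial_lr=1e-4, min_reconstruction=0.85)
    for step, recon in enumerate([0.60, 0.75, 0.85, 0.92, 0.97]):
        lr = scheduler.step({'compression_ratio': 8.0 + step, 'reconstruction_acc': recon})
        print(f"  Step {step}: recon={recon:.2f}, LR={lr:.2e}")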
class GumbelTemperatureScheduler:
"""
Temperature annealing for Gumbel-Softmax
GPT-5 suggestion: Critical for progressive splitting
"""
def __init__(self,
initial_temp: float = 1.0,
final_temp: float = 0.1,
anneal_rate: float = 0.99995,
anneal_steps: Optional[int] = None):
self.initial_temp = initial_temp
self.final_temp = final_temp
self.anneal_rate = anneal_rate
self.anneal_steps = anneal_steps
self.current_step = 0
self.current_temp = initial_temp
def step(self):
"""Update temperature"""
self.current_step += 1
if self.anneal_steps:
# Linear annealing
progress = min(1.0, self.current_step / self.anneal_steps)
self.current_temp = self.initial_temp + (self.final_temp - self.initial_temp) * progress
else:
# Exponential annealing
self.current_temp = max(
self.final_temp,
self.initial_temp * (self.anneal_rate ** self.current_step)
)
return self.current_temp
def get_temperature(self):
"""Get current temperature"""
return self.current_temp
class CompressionRatioScheduler:
"""
Schedule target compression ratio during training
Gradually increase compression requirements
"""
def __init__(self,
initial_ratio: float = 8.0,
target_ratio: float = 24.0,
warmup_epochs: int = 10,
total_epochs: int = 100):
self.initial_ratio = initial_ratio
self.target_ratio = target_ratio
self.warmup_epochs = warmup_epochs
self.total_epochs = total_epochs
self.current_epoch = 0
def step(self, epoch: Optional[int] = None):
"""Update target compression ratio"""
if epoch is not None:
self.current_epoch = epoch
else:
self.current_epoch += 1
if self.current_epoch < self.warmup_epochs:
# Start with lower compression requirement
ratio = self.initial_ratio
else:
# Gradually increase to target
            progress = (self.current_epoch - self.warmup_epochs) / max(1, self.total_epochs - self.warmup_epochs)
progress = min(1.0, progress)
ratio = self.initial_ratio + (self.target_ratio - self.initial_ratio) * progress
return ratio
class MultiScheduler:
"""
Combine multiple schedulers for comprehensive training control
"""
def __init__(self, schedulers: Dict):
"""
Args:
schedulers: Dictionary of schedulers
{
'lr': learning_rate_scheduler,
'gumbel': gumbel_temperature_scheduler,
'compression': compression_ratio_scheduler,
...
}
"""
self.schedulers = schedulers
def step(self, **kwargs):
"""
Step all schedulers
GPT fix: unified input convention
Returns:
Dictionary with all scheduler outputs
"""
results = {}
for name, scheduler in self.schedulers.items():
try:
                # Dispatch on the scheduler type and pass the matching arguments
                class_name = scheduler.__class__.__name__
                if class_name == 'AdaptiveScheduler' and 'metric' in kwargs:
                    results[name] = scheduler.step(kwargs['metric'], kwargs.get('epoch', 0))
                elif class_name == 'PhaseBasedScheduler' and 'epoch' in kwargs:
                    results[name] = scheduler.step(kwargs['epoch'])
                elif class_name == 'CompressionRatioScheduler' and 'epoch' in kwargs:
                    results[name] = scheduler.step(kwargs['epoch'])
                elif class_name == 'ProgressiveSplittingScheduler' and 'metrics' in kwargs:
                    results[name] = scheduler.step(kwargs['metrics'])
                elif hasattr(scheduler, 'step'):
                    # Generic step (no arguments)
                    results[name] = scheduler.step()
except Exception as e:
print(f"Warning: Scheduler '{name}' step failed: {e}")
results[name] = None
return results
def get_current_values(self):
"""Get current values from all schedulers"""
values = {}
for name, scheduler in self.schedulers.items():
if hasattr(scheduler, 'get_lr'):
values[name] = scheduler.get_lr()
elif hasattr(scheduler, 'get_temperature'):
values[name] = scheduler.get_temperature()
elif hasattr(scheduler, 'current_temp'):
values[name] = scheduler.current_temp
elif hasattr(scheduler, 'current_epoch'):
values[name] = scheduler.current_epoch
return values
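
# A minimal sketch of combining several schedulers under MultiScheduler. The keyword
# names ('metric', 'epoch', 'metrics') follow the dispatch logic in MultiScheduler.step;
# the concrete schedulers and values here are only illustrative.
def _demo_multi_scheduler():
    model = torch.nn.Linear(4, 4)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    multi = MultiScheduler({
        'lr': WarmupCosineScheduler(optimizer, warmup_steps=10, total_steps=100),
        'gumbel': GumbelTemperatureScheduler(),
        'compression': CompressionRatioScheduler(total_epochs=50),
    })
    results = multi.step(epoch=5, metric=0.5, metrics={'reconstruction_acc': 0.9})
    print(f"  MultiScheduler outputs: {results}")
    print(f"  Current values: {multi.get_current_values()}")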
class GateWarmupScheduler:
"""๊ฒŒ์ดํŠธ ํŒŒ๋ผ๋ฏธํ„ฐ ์›œ์—… ์Šค์ผ€์ค„๋Ÿฌ
์ดˆ๊ธฐ: ๋ชจ๋“  ๋ ˆ์ด์–ด ๋™๋“ฑ ์‚ฌ์šฉ (gate=1.0)
์›œ์—…: ์ ์ง„์  ๊ฒŒ์ดํŠธ ํ•™์Šต ์‹œ์ž‘
ํ›„๊ธฐ: ์ตœ์  ๊ฒŒ์ดํŠธ ๊ฐ’์œผ๋กœ ์ˆ˜๋ ด
"""
def __init__(
self,
optimizer: torch.optim.Optimizer,
warmup_steps: int = 1000,
gate_param_group_name: str = 'gates',
importance_param_group_name: str = 'importance'
):
"""
Args:
            optimizer: PyTorch optimizer
            warmup_steps: Number of warmup steps
            gate_param_group_name: Name of the gate parameter group
            importance_param_group_name: Name of the importance parameter group
"""
self.optimizer = optimizer
self.warmup_steps = warmup_steps
self.gate_group_name = gate_param_group_name
self.importance_group_name = importance_param_group_name
        # Store the initial learning rates
self.base_lrs = {}
for group in optimizer.param_groups:
if 'name' in group:
self.base_lrs[group['name']] = group['lr']
def get_gate_factor(self, step: int) -> float:
"""๊ฒŒ์ดํŠธ ํ•™์Šต๋ฅ  ๊ณ„์ˆ˜ ๊ณ„์‚ฐ
์›œ์—… ๊ธฐ๊ฐ„ ๋™์•ˆ์€ ๋‚ฎ์€ ํ•™์Šต๋ฅ ,
์ดํ›„ ์ •์ƒ ํ•™์Šต๋ฅ ๋กœ ์ „ํ™˜
"""
if step < self.warmup_steps:
            # Warmup period: linear increase
return step / self.warmup_steps
else:
            # Normal training
return 1.0
def get_importance_factor(self, step: int) -> float:
"""์ค‘์š”๋„ ํ•™์Šต๋ฅ  ๊ณ„์ˆ˜ ๊ณ„์‚ฐ
๊ฒŒ์ดํŠธ๋ณด๋‹ค ๋А๋ฆฌ๊ฒŒ ํ•™์Šต ์‹œ์ž‘
"""
delayed_warmup = self.warmup_steps * 1.5
if step < delayed_warmup:
return step / delayed_warmup * 0.5
else:
return 1.0
def step(self, current_step: int):
"""์Šค์ผ€์ค„๋Ÿฌ ์Šคํ…
Args:
current_step: ํ˜„์žฌ ๊ธ€๋กœ๋ฒŒ ์Šคํ…
"""
# ๊ฒŒ์ดํŠธ ํŒŒ๋ผ๋ฏธํ„ฐ ๊ทธ๋ฃน ํ•™์Šต๋ฅ  ์กฐ์ •
gate_factor = self.get_gate_factor(current_step)
importance_factor = self.get_importance_factor(current_step)
for group in self.optimizer.param_groups:
if 'name' not in group:
continue
if group['name'] == self.gate_group_name:
# ๊ฒŒ์ดํŠธ ํ•™์Šต๋ฅ  ์กฐ์ •
group['lr'] = self.base_lrs[self.gate_group_name] * gate_factor
elif group['name'] == self.importance_group_name:
# ์ค‘์š”๋„ ํ•™์Šต๋ฅ  ์กฐ์ •
group['lr'] = self.base_lrs[self.importance_group_name] * importance_factor
def get_lr(self) -> Dict[str, float]:
"""ํ˜„์žฌ ํ•™์Šต๋ฅ  ๋ฐ˜ํ™˜"""
lrs = {}
for group in self.optimizer.param_groups:
if 'name' in group:
lrs[group['name']] = group['lr']
return lrs
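
# A sketch of GateWarmupScheduler usage, assuming the optimizer was built with named
# parameter groups ('gates', 'importance') as the scheduler expects; the tiny
# parameters below stand in for the real gate/importance tensors.
def _demo_gate_warmup_scheduler():
    gates = torch.nn.Parameter(torch.ones(5))
    importance = torch.nn.Parameter(torch.zeros(5))
    optimizer = torch.optim.Adam([
        {'params': [gates], 'lr': 1e-2, 'name': 'gates'},
        {'params': [importance], 'lr': 1e-2, 'name': 'importance'},
    ])
    scheduler = GateWarmupScheduler(optimizer, warmup_steps=100)
    for step in [0, 50, 100, 200]:
        scheduler.step(step)
        print(f"  Step {step}: {scheduler.get_lr()}")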
class UniversalCosineScheduler:
"""Universal Cosine Annealing ์Šค์ผ€์ค„๋Ÿฌ
๋ชจ๋“  ์–ธ์–ด์— ๋Œ€ํ•ด ๋™์ผํ•œ ์Šค์ผ€์ค„ ์ ์šฉ
"""
def __init__(
self,
optimizer: torch.optim.Optimizer,
warmup_steps: int = 1000,
total_steps: int = 10000,
min_lr_ratio: float = 0.1
):
self.optimizer = optimizer
self.warmup_steps = warmup_steps
self.total_steps = total_steps
self.min_lr_ratio = min_lr_ratio
self.current_step = 0
        # Store the initial learning rates
self.base_lrs = [group['lr'] for group in optimizer.param_groups]
def step(self):
"""์Šค์ผ€์ค„๋Ÿฌ ์Šคํ…"""
self.current_step += 1
for idx, param_group in enumerate(self.optimizer.param_groups):
if self.current_step < self.warmup_steps:
                # Warmup phase
lr = self.base_lrs[idx] * (self.current_step / self.warmup_steps)
else:
# Cosine annealing
if self.total_steps <= self.warmup_steps:
                    # warmup_steps is greater than or equal to total_steps
lr = self.base_lrs[idx] * self.min_lr_ratio
else:
progress = min(1.0, (self.current_step - self.warmup_steps) / max(1, self.total_steps - self.warmup_steps))
lr = self.base_lrs[idx] * (
self.min_lr_ratio + (1 - self.min_lr_ratio) * 0.5 * (1 + math.cos(math.pi * progress))
)
param_group['lr'] = lr
def get_last_lr(self) -> List[float]:
"""๋งˆ์ง€๋ง‰ ํ•™์Šต๋ฅ  ๋ฐ˜ํ™˜"""
return [group['lr'] for group in self.optimizer.param_groups]
def state_dict(self) -> Dict[str, Any]:
"""์Šค์ผ€์ค„๋Ÿฌ ์ƒํƒœ ๋”•์…”๋„ˆ๋ฆฌ ๋ฐ˜ํ™˜ (์ฒดํฌํฌ์ธํŠธ ์ €์žฅ์šฉ)"""
return {
'current_step': self.current_step,
'warmup_steps': self.warmup_steps,
'total_steps': self.total_steps,
'min_lr_ratio': self.min_lr_ratio,
'base_lrs': self.base_lrs
}
def load_state_dict(self, state_dict: Dict[str, Any]):
"""์Šค์ผ€์ค„๋Ÿฌ ์ƒํƒœ ๋กœ๋“œ (์ฒดํฌํฌ์ธํŠธ ์žฌ์‹œ์ž‘์šฉ)"""
self.current_step = state_dict['current_step']
self.warmup_steps = state_dict['warmup_steps']
self.total_steps = state_dict['total_steps']
self.min_lr_ratio = state_dict['min_lr_ratio']
self.base_lrs = state_dict['base_lrs']
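
# A short sketch of UniversalCosineScheduler, including the state_dict round trip
# used for checkpointing (the step counts and LR values here are illustrative).
def _demo_universal_cosine_scheduler():
    model = torch.nn.Linear(4, 4)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = UniversalCosineScheduler(optimizer, warmup_steps=10, total_steps=100)
    for _ in range(25):
        scheduler.step()
    print(f"  LR after 25 steps: {scheduler.get_last_lr()}")
    # Checkpoint round trip
    state = scheduler.state_dict()
    restored = UniversalCosineScheduler(optimizer, warmup_steps=10, total_steps=100)
    restored.load_state_dict(state)
    print(f"  Restored at step {restored.current_step}")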
class AdaptiveLayerScheduler:
"""๋ ˆ์ด์–ด๋ณ„ ์ ์‘์  ์Šค์ผ€์ค„๋Ÿฌ
๊ฐ ๋ ˆ์ด์–ด์˜ ํ•™์Šต ์ง„ํ–‰๋„์— ๋”ฐ๋ผ ๋™์ ์œผ๋กœ ์กฐ์ •
"""
def __init__(
self,
layer_builder,
threshold_active: float = 0.7,
threshold_skip: float = 0.3
):
"""
Args:
            layer_builder: LayerBuilder instance
            threshold_active: Threshold above which a layer is considered active
            threshold_skip: Threshold below which a layer is considered skipped
"""
self.layer_builder = layer_builder
self.threshold_active = threshold_active
self.threshold_skip = threshold_skip
# ๋ ˆ์ด์–ด๋ณ„ ํ†ต๊ณ„
self.layer_stats = {
'usage_count': torch.zeros(5),
'contribution': torch.zeros(5)
}
def update_stats(self, batch_output):
"""๋ฐฐ์น˜ ์ถœ๋ ฅ์œผ๋กœ ํ†ต๊ณ„ ์—…๋ฐ์ดํŠธ"""
with torch.no_grad():
gates = torch.sigmoid(self.layer_builder.layer_gates)
            # Update usage counts
self.layer_stats['usage_count'] += (gates > self.threshold_skip).float()
# ๊ธฐ์—ฌ๋„ ์ถ”์ • (๊ฐ„๋‹จํ•œ ๋ฒ„์ „)
importance = torch.nn.functional.softmax(
self.layer_builder.layer_importance, dim=0
)
self.layer_stats['contribution'] += importance.detach()
def get_layer_status(self) -> Dict[int, str]:
"""๊ฐ ๋ ˆ์ด์–ด์˜ ์ƒํƒœ ๋ฐ˜ํ™˜"""
gates = torch.sigmoid(self.layer_builder.layer_gates)
status = {}
for i in range(5):
if gates[i] > self.threshold_active:
status[i] = "ACTIVE"
elif gates[i] > self.threshold_skip:
status[i] = "PARTIAL"
else:
status[i] = "SKIP"
return status
def suggest_pruning(self) -> List[int]:
"""ํ”„๋ฃจ๋‹ ๊ฐ€๋Šฅํ•œ ๋ ˆ์ด์–ด ์ œ์•ˆ"""
gates = torch.sigmoid(self.layer_builder.layer_gates)
prunable = []
for i in range(5):
if gates[i] < self.threshold_skip:
# ๋‚ฎ์€ ๊ฒŒ์ดํŠธ ๊ฐ’ + ๋‚ฎ์€ ๊ธฐ์—ฌ๋„
if self.layer_stats['contribution'][i] < 0.1:
prunable.append(i)
return prunable
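
# An illustrative sketch of AdaptiveLayerScheduler. The real LayerBuilder lives
# elsewhere in the project; this stand-in only provides the two attributes the
# scheduler reads (layer_gates, layer_importance), each sized for 5 layers.
def _demo_adaptive_layer_scheduler():
    class _FakeLayerBuilder:
        def __init__(self):
            self.layer_gates = torch.tensor([2.0, 1.0, 0.0, -1.0, -2.0])
            self.layer_importance = torch.tensor([1.0, 0.5, 0.0, -0.5, -1.0])

    scheduler = AdaptiveLayerScheduler(_FakeLayerBuilder())
    scheduler.update_stats(batch_output=None)
    print(f"  Layer status: {scheduler.get_layer_status()}")
    print(f"  Prunable layers: {scheduler.suggest_pruning()}")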
if __name__ == "__main__":
# Test schedulers
print("Testing Schedulers")
# Create dummy optimizer
model = torch.nn.Linear(10, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Test WarmupCosineScheduler
print("\n1. WarmupCosineScheduler:")
scheduler = WarmupCosineScheduler(optimizer, warmup_steps=100, total_steps=1000)
lrs = []
for step in range(200):
lr = scheduler.step()
if step % 20 == 0:
print(f" Step {step}: LR = {lr:.6f}")
lrs.append(lr)
# Test PhaseBasedScheduler
print("\n2. PhaseBasedScheduler:")
phase_configs = [
{'epochs': (0, 30), 'lr': 1e-4, 'warmup_epochs': 5},
{'epochs': (31, 60), 'lr': 5e-5, 'warmup_epochs': 2},
{'epochs': (61, 100), 'lr': 1e-5, 'warmup_epochs': 0}
]
scheduler = PhaseBasedScheduler(optimizer, phase_configs)
for epoch in [0, 5, 31, 35, 61, 80]:
lr = scheduler.step(epoch)
print(f" Epoch {epoch}: LR = {lr:.6f}")
# Test GumbelTemperatureScheduler
print("\n3. GumbelTemperatureScheduler:")
scheduler = GumbelTemperatureScheduler()
for step in [0, 100, 500, 1000, 5000]:
for _ in range(step - scheduler.current_step):
scheduler.step()
temp = scheduler.get_temperature()
print(f" Step {step}: Temperature = {temp:.4f}")
# Test CompressionRatioScheduler
print("\n4. CompressionRatioScheduler:")
scheduler = CompressionRatioScheduler()
for epoch in [0, 5, 10, 30, 50, 80, 100]:
ratio = scheduler.step(epoch)
print(f" Epoch {epoch}: Target ratio = {ratio:.1f}:1")