""" |
|
|
B2NL-IntelligentTokenizer v6.2.1 - ์ค์ ์๋ํ๋ ์ถ๋ก ์ฝ๋ |
|
|
์ด ํ์ผ์ด ๋ฉ์ธ ์ฌ์ฉ๋ฒ์
๋๋ค. |
|
|
""" |
|
|
|
|
|
import sys
from pathlib import Path

import torch

# Make the v6.2.1 package and its core modules importable.
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "intelligent-tokenizer_v6.2.1"))
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "intelligent-tokenizer_v6.2.1/core"))

from core.unified_model import IntelligentTokenizerV62
from core.tokenizer import ByteTokenizerV62


class B2NLTokenizer:
    """B2NL tokenizer wrapper (verified working)."""

    def __init__(self, checkpoint_path: str = None):
        """
        Args:
            checkpoint_path: Checkpoint path (the default below is used if omitted).
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        if checkpoint_path is None:
            checkpoint_path = "D:/intelligent-tokenizer/intelligent-tokenizer_v6.2.1/checkpoints/v62/16.0/epoch_100.pt"

        # Build the model and restore the trained weights.
        self.model = IntelligentTokenizerV62()
        checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model = self.model.to(self.device)
        self.model.eval()

        print(f"Model loaded successfully on {self.device}")
|
    def compress(self, text: str) -> dict:
        """Compress text and return compression statistics."""
        return self.model.compress(text)

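    # Hedged usage note: `compress` delegates to the model; judging from
    # test_tokenizer below, the returned dict includes at least
    # 'compression_ratio' and 'num_tokens'. For example:
    #
    #   info = tokenizer.compress("Hello, world!")
    #   print(info['compression_ratio'], info['num_tokens'])
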
    def reconstruct(self, text: str, temperature: float = 0.1) -> str:
        """
        Compress text and then reconstruct it (verified working version).

        Args:
            text: Input text.
            temperature: Generation temperature (lower is more deterministic).

        Returns:
            The reconstructed text.
        """
        # Encode the input with the model's own byte-level tokenizer.
        tokenizer = self.model.tokenizer
        encoded = tokenizer.encode(text)

        # Normalize the encoded output to batched tensors.
        if isinstance(encoded, dict):
            input_ids = encoded['input_ids'].unsqueeze(0) if encoded['input_ids'].dim() == 1 else encoded['input_ids']
            attention_mask = encoded['attention_mask'].unsqueeze(0) if encoded['attention_mask'].dim() == 1 else encoded['attention_mask']
        else:
            input_ids = encoded.unsqueeze(0) if encoded.dim() == 1 else encoded
            attention_mask = torch.ones_like(input_ids)

        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)

        # Run the encoder once to obtain the compressed representation.
        with torch.no_grad():
            encoder_outputs = self.model.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

        # The decoder consumes per-layer encoder states; if they are not
        # available, fall back to four copies of the compressed representation.
        if 'all_hidden_states' in encoder_outputs:
            encoder_all_hidden = encoder_outputs['all_hidden_states']
        else:
            compressed = encoder_outputs.get('compressed', encoder_outputs.get('hidden_states'))
            encoder_all_hidden = [compressed] * 4

        batch_size = input_ids.size(0)
        max_length = 48

        # Autoregressive decoding, starting from the BOS token.
        generated = torch.full((batch_size, 1), tokenizer.BOS, device=self.device)

        for step in range(max_length - 1):
            with torch.no_grad():
                decoder_outputs = self.model.decoder(
                    encoder_all_hidden=encoder_all_hidden,
                    decoder_input_ids=generated,
                    attention_mask=torch.ones_like(generated),
                    use_cache=False
                )

            # Temperature-scaled logits for the next position.
            logits = decoder_outputs['logits'][:, -1, :] / temperature

            # Top-k filtering: mask every logit below the k-th largest.
            top_k = 10
            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
            logits[indices_to_remove] = float('-inf')

            # Sample the next token from the filtered distribution.
            probs = torch.nn.functional.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            generated = torch.cat([generated, next_token], dim=1)

            # Stop once every sequence in the batch has emitted EOS.
            if (next_token == tokenizer.EOS).all():
                break

        # Decode the generated ids back to text.
        if generated.dim() > 1:
            text = tokenizer.decode(generated[0])
        else:
            text = tokenizer.decode(generated)

        return text


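# Hedged sketch: a deterministic alternative to the temperature/top-k sampling
# step inside `reconstruct`. `greedy_next_token` is a hypothetical helper, not
# part of the original API; it takes the argmax of the last-step logits instead
# of sampling, which makes temperature scaling and top-k masking unnecessary.
def greedy_next_token(logits: torch.Tensor) -> torch.Tensor:
    """Return the most likely next-token ids, shape (batch, 1)."""
    return logits.argmax(dim=-1, keepdim=True)

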
def test_tokenizer():
    """Test the tokenizer on a few multilingual samples."""
    print("="*60)
    print("B2NL-IntelligentTokenizer v6.2.1 test")
    print("="*60)

    tokenizer = B2NLTokenizer()

    test_texts = [
        "Hello, world!",
        "์๋
ํ์ธ์, ๋ฐ๊ฐ์ต๋๋ค.",
        "The quick brown fox jumps over the lazy dog.",
        "ไบบๅทฅๆบ่ฝๆๆฏๆญฃๅจๆนๅไธ็ใ",
    ]

    for text in test_texts:
        print(f"\nOriginal: {text}")

        # Compression statistics.
        compressed = tokenizer.compress(text)
        print(f"Compression: {compressed['compression_ratio']:.1f}:1 ({compressed['num_tokens']} tokens)")

        # Round-trip reconstruction.
        reconstructed = tokenizer.reconstruct(text, temperature=0.1)
        print(f"Reconstructed: {reconstructed}")

        # Character-level accuracy: matching positions up to the shorter
        # string, divided by the original length.
        min_len = min(len(text), len(reconstructed))
        accuracy = sum(1 for i in range(min_len) if text[i] == reconstructed[i]) / len(text) * 100
        print(f"Accuracy: {accuracy:.1f}%")

    print("\n" + "="*60)
    print("Test completed!")
    print("="*60)


def example_usage():
    """Minimal usage example."""
    tokenizer = B2NLTokenizer()

    text = "์๋
ํ์ธ์, ๋ฐ๊ฐ์ต๋๋ค!"
    compressed = tokenizer.compress(text)
    print(f"Compression: {compressed['compression_ratio']:.1f}:1")

    reconstructed = tokenizer.reconstruct(text)
    print(f"Reconstruction: {reconstructed}")

    return tokenizer


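# Hedged example: constructing the wrapper with an explicit checkpoint path
# instead of the hard-coded default in __init__. The path below is a
# placeholder; point it at your own trained checkpoint file.
def example_custom_checkpoint():
    """Load from a user-supplied checkpoint and run one round trip."""
    tokenizer = B2NLTokenizer(checkpoint_path="path/to/checkpoints/v62/16.0/epoch_100.pt")
    print(tokenizer.reconstruct("Hello, world!"))
    return tokenizer

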
if __name__ == "__main__": |
|
|
test_tokenizer() |