ggunio committed
Commit 905c972 · verified · 1 Parent(s): 4e3eeae

Upload inference.py with huggingface_hub
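For context, an upload commit like this one is typically produced with the huggingface_hub client. A minimal sketch, not the author's exact invocation; the repo_id below is a placeholder and the token is assumed to come from a prior `huggingface-cli login`:

    from huggingface_hub import HfApi

    api = HfApi()
    api.upload_file(
        path_or_fileobj="inference.py",   # local file to upload
        path_in_repo="inference.py",      # destination path inside the repo
        repo_id="ggunio/REPO_ID",         # placeholder; substitute the actual model repo id
        commit_message="Upload inference.py with huggingface_hub",
    )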

Files changed (1)
  1. inference.py +186 -297
inference.py CHANGED
@@ -1,297 +1,186 @@
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- Intelligent Tokenizer v6.0 - Inference Module
- Embedding and restoration functionality
- """
-
- import torch
- import sys
- import io
- from pathlib import Path
- from typing import Dict, List, Optional, Tuple
-
- # UTF-8 encoding setup
- if sys.stdout.encoding != 'utf-8':
-     sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
-     sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
-
- sys.path.append(str(Path(__file__).parent))
-
- from core.boundary_aware_model import BoundaryAwareTokenizerModel
- from src.core.byte_tokenizer_v6 import ByteTokenizerV6
-
-
- class IntelligentTokenizer:
-     """Intelligent Tokenizer for embedding and restoration"""
-
-     def __init__(self, checkpoint_path: str = "checkpoints/latest_checkpoint.pt", device: str = None):
-         """
-         Initialize tokenizer
-
-         Args:
-             checkpoint_path: Path to model checkpoint
-             device: Device to use ('cuda', 'cpu', or None for auto)
-         """
-         if device is None:
-             self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-         else:
-             self.device = torch.device(device)
-
-         print(f"Initializing Intelligent Tokenizer v6.0...")
-         print(f"Device: {self.device}")
-
-         # Load checkpoint
-         checkpoint_path = Path(checkpoint_path)
-         if not checkpoint_path.exists():
-             raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")
-
-         checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False)
-
-         # Initialize model
-         self.model = BoundaryAwareTokenizerModel(**checkpoint['model_config'])
-         self.model.load_state_dict(checkpoint['model_state_dict'])
-         self.model = self.model.to(self.device)
-         self.model.eval()
-
-         # Initialize tokenizer
-         self.tokenizer = ByteTokenizerV6()
-         self.max_chunk_size = 250  # Safe margin for 256-byte limit
-
-         print(f"Model loaded: Epoch {checkpoint['epoch']}, Loss {checkpoint['loss']:.4f}")
-         print(f"Ready for inference!")
-
-     def embed(self, text: str) -> torch.Tensor:
-         """
-         Convert text to embeddings
-
-         Args:
-             text: Input text
-
-         Returns:
-             Embedding tensor
-         """
-         # Handle long text by chunking
-         if len(text.encode('utf-8')) > self.max_chunk_size:
-             chunks = self._split_text_safely(text)
-             embeddings = []
-
-             for chunk in chunks:
-                 emb = self._embed_single(chunk)
-                 embeddings.append(emb)
-
-             # Concatenate embeddings
-             return torch.cat(embeddings, dim=1)
-         else:
-             return self._embed_single(text)
-
-     def _embed_single(self, text: str) -> torch.Tensor:
-         """Embed a single chunk"""
-         # Encode text
-         encoded = self.tokenizer.encode(text)
-         byte_ids = encoded['input_ids']
-         input_ids = torch.tensor([byte_ids], device=self.device)
-         attention_mask = torch.tensor([encoded['attention_mask']], device=self.device)
-
-         with torch.no_grad():
-             # Get embeddings
-             encoder_outputs = self.model.encoder(input_ids, attention_mask)
-             embeddings = encoder_outputs['last_hidden_state']
-
-         return embeddings
-
-     def restore(self, text: str) -> Tuple[str, float]:
-         """
-         Test restoration capability
-
-         Args:
-             text: Input text
-
-         Returns:
-             Tuple of (restored_text, accuracy)
-         """
-         # Handle long text
-         if len(text.encode('utf-8')) > self.max_chunk_size:
-             chunks = self._split_text_safely(text)
-             restored_chunks = []
-             accuracies = []
-
-             for chunk in chunks:
-                 restored, acc = self._restore_single(chunk)
-                 restored_chunks.append(restored)
-                 accuracies.append(acc)
-
-             return ''.join(restored_chunks), sum(accuracies) / len(accuracies)
-         else:
-             return self._restore_single(text)
-
-     def _restore_single(self, text: str) -> Tuple[str, float]:
-         """Restore a single chunk"""
-         # Encode text
-         encoded = self.tokenizer.encode(text)
-         byte_ids = encoded['input_ids']
-
-         if len(byte_ids) <= 1:
-             return text, 1.0
-
-         input_ids = torch.tensor([byte_ids], device=self.device)
-         attention_mask = torch.tensor([encoded['attention_mask']], device=self.device)
-
-         with torch.no_grad():
-             # Teacher forcing for restoration test
-             decoder_input = input_ids[:, :-1]
-             labels = input_ids[:, 1:]
-
-             outputs = self.model(
-                 input_ids=input_ids,
-                 attention_mask=attention_mask,
-                 decoder_input_ids=decoder_input,
-                 labels=labels,
-                 use_cross_attention=True
-             )
-
-             # Get predictions
-             predictions = torch.argmax(outputs['logits'], dim=-1)
-             accuracy = (predictions == labels).float().mean().item()
-
-         # Decode predictions
-         try:
-             # Remove special tokens and convert to bytes
-             pred_list = predictions[0].cpu().tolist()
-             # Add BOS at the beginning for the full sequence
-             full_sequence = [self.tokenizer.BOS] + pred_list
-
-             # Filter valid bytes
-             filtered = [b for b in full_sequence if 0 <= b < 256]
-             if filtered:
-                 restored_bytes = bytes(filtered)
-                 restored_text = restored_bytes.decode('utf-8', errors='ignore')
-             else:
-                 restored_text = ""
-         except Exception as e:
-             print(f"Restoration error: {e}")
-             restored_text = ""
-
-         return restored_text, accuracy
-
-     def compress(self, text: str) -> Dict:
-         """
-         Get compression statistics
-
-         Args:
-             text: Input text
-
-         Returns:
-             Dict with compression info
-         """
-         text_bytes = text.encode('utf-8')
-         embeddings = self.embed(text)
-
-         original_size = len(text_bytes)
-         compressed_size = embeddings.shape[1]
-         compression_ratio = original_size / compressed_size if compressed_size > 0 else 0
-
-         return {
-             'original_bytes': original_size,
-             'compressed_tokens': compressed_size,
-             'compression_ratio': compression_ratio,
-             'embedding_shape': list(embeddings.shape)
-         }
-
-     def _split_text_safely(self, text: str) -> List[str]:
-         """Split text safely at UTF-8 boundaries"""
-         chunks = []
-         text_bytes = text.encode('utf-8')
-
-         start = 0
-         while start < len(text_bytes):
-             end = min(start + self.max_chunk_size, len(text_bytes))
-
-             # Find a valid UTF-8 boundary
-             while end > start and end < len(text_bytes):
-                 try:
-                     chunk = text_bytes[start:end].decode('utf-8')
-                     break
-                 except UnicodeDecodeError:
-                     end -= 1
-
-             if end > start:
-                 chunk = text_bytes[start:end].decode('utf-8')
-                 chunks.append(chunk)
-                 start = end
-             else:
-                 break
-
-         return chunks
-
-
- def test_model():
-     """Test model functionality"""
-     print("="*70)
-     print("INTELLIGENT TOKENIZER v6.0 - FUNCTIONALITY TEST")
-     print("="*70)
-
-     # Initialize tokenizer
-     tokenizer = IntelligentTokenizer()
-
-     # Test samples
-     test_samples = [
-         ("English", "Hello, world!"),
-         ("Korean", "안녕하세요. 반갑습니다."),
-         ("Chinese", "今天天气很好"),
-         ("Japanese", "こんにちは"),
-         ("Arabic", "مرحبا بك"),
-         ("Russian", "Привет, как дела?"),
-         ("Emoji", "Hello 👋 World 🌍!"),
-     ]
-
-     print("\n" + "="*70)
-     print("EMBEDDING & RESTORATION TESTS")
-     print("="*70)
-
-     total_accuracy = 0
-     successful = 0
-
-     for lang, text in test_samples:
-         print(f"\n[{lang}]")
-         print(f"Original: {text}")
-
-         # Test embedding
-         embeddings = tokenizer.embed(text)
-         print(f"Embedding: {embeddings.shape}")
-
-         # Test compression
-         compression = tokenizer.compress(text)
-         print(f"Compression: {compression['original_bytes']} bytes → {compression['compressed_tokens']} tokens")
-         print(f"Ratio: {compression['compression_ratio']:.2f}x")
-
-         # Test restoration
-         restored, accuracy = tokenizer.restore(text)
-         print(f"Restored: {restored}")
-         print(f"Accuracy: {accuracy:.1%}")
-
-         if accuracy > 0.7:
-             successful += 1
-         total_accuracy += accuracy
-
-     # Summary
-     print("\n" + "="*70)
-     print("TEST SUMMARY")
-     print("="*70)
-     print(f"Tests passed: {successful}/{len(test_samples)}")
-     print(f"Average accuracy: {total_accuracy/len(test_samples):.1%}")
-
-     if successful == len(test_samples):
-         print("\n✅ ALL TESTS PASSED!")
-         return True
-     elif successful >= len(test_samples) * 0.7:
-         print("\n⚠️ PARTIAL SUCCESS (70%+ tests passed)")
-         return True
-     else:
-         print("\n❌ TESTS FAILED")
-         return False
-
-
- if __name__ == "__main__":
-     success = test_model()
-     sys.exit(0 if success else 1)
 
+ """
+ B2NL-IntelligentTokenizer v6.2.1 - Inference code that actually works
+ This file is the main usage reference.
+ """
+
+ import torch
+ import sys
+ from pathlib import Path
+
+ # Add module paths
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent / "intelligent-tokenizer_v6.2.1"))
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent / "intelligent-tokenizer_v6.2.1/core"))
+
+ from core.unified_model import IntelligentTokenizerV62
+ from core.tokenizer import ByteTokenizerV62
+
+
+ class B2NLTokenizer:
+     """B2NL tokenizer that actually works"""
+
+     def __init__(self, checkpoint_path: str = None):
+         """
+         Args:
+             checkpoint_path: Checkpoint path (falls back to the default if omitted)
+         """
+         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+         # Default checkpoint path
+         if checkpoint_path is None:
+             checkpoint_path = "D:/intelligent-tokenizer/intelligent-tokenizer_v6.2.1/checkpoints/v62/16.0/epoch_100.pt"
+
+         # Load the model
+         self.model = IntelligentTokenizerV62()
+         checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False)
+         self.model.load_state_dict(checkpoint['model_state_dict'])
+         self.model = self.model.to(self.device)
+         self.model.eval()
+
+         print(f"Model loaded successfully on {self.device}")
+
+     def compress(self, text: str) -> dict:
+         """Compress text"""
+         return self.model.compress(text)
+
+     def reconstruct(self, text: str, temperature: float = 0.1) -> str:
+         """
+         Compress text, then reconstruct it (the version that actually works)
+
+         Args:
+             text: Input text
+             temperature: Generation temperature (lower is more deterministic)
+
+         Returns:
+             Reconstructed text
+         """
+         # 1. Encode the text
+         tokenizer = self.model.tokenizer
+         encoded = tokenizer.encode(text)
+
+         if isinstance(encoded, dict):
+             input_ids = encoded['input_ids'].unsqueeze(0) if encoded['input_ids'].dim() == 1 else encoded['input_ids']
+             attention_mask = encoded['attention_mask'].unsqueeze(0) if encoded['attention_mask'].dim() == 1 else encoded['attention_mask']
+         else:
+             input_ids = encoded.unsqueeze(0) if encoded.dim() == 1 else encoded
+             attention_mask = torch.ones_like(input_ids)
+
+         input_ids = input_ids.to(self.device)
+         attention_mask = attention_mask.to(self.device)
+
+         # 2. Compress with the encoder
+         with torch.no_grad():
+             encoder_outputs = self.model.encoder(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask
+             )
+
+         # Prepare all hidden states
+         if 'all_hidden_states' in encoder_outputs:
+             encoder_all_hidden = encoder_outputs['all_hidden_states']
+         else:
+             compressed = encoder_outputs.get('compressed', encoder_outputs.get('hidden_states'))
+             encoder_all_hidden = [compressed] * 4
+
+         # 3. Autoregressive decoding (the approach that actually works)
+         batch_size = input_ids.size(0)
+         max_length = 48
+
+         # Start from the BOS token
+         generated = torch.full((batch_size, 1), tokenizer.BOS, device=self.device)
+
+         for step in range(max_length - 1):
+             with torch.no_grad():
+                 # Decode with the sequence generated so far
+                 decoder_outputs = self.model.decoder(
+                     encoder_all_hidden=encoder_all_hidden,
+                     decoder_input_ids=generated,
+                     attention_mask=torch.ones_like(generated),
+                     use_cache=False
+                 )
+
+                 # Predict the next token
+                 logits = decoder_outputs['logits'][:, -1, :] / temperature
+
+                 # Top-k sampling
+                 top_k = 10
+                 indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+                 logits[indices_to_remove] = float('-inf')
+
+                 # Compute probabilities and sample
+                 probs = torch.nn.functional.softmax(logits, dim=-1)
+                 next_token = torch.multinomial(probs, num_samples=1)
+
+                 # Append to the generated sequence
+                 generated = torch.cat([generated, next_token], dim=1)
+
+                 # Stop at the EOS token
+                 if (next_token == tokenizer.EOS).all():
+                     break
+
+         # 4. Decode back to text
+         if generated.dim() > 1:
+             decoded = tokenizer.decode(generated[0])
+         else:
+             decoded = tokenizer.decode(generated)
+
+         return decoded
+
+
+ def test_tokenizer():
+     """Tokenizer test"""
+     print("="*60)
+     print("B2NL-IntelligentTokenizer v6.2.1 test")
+     print("="*60)
+
+     # Initialize the tokenizer
+     tokenizer = B2NLTokenizer()
+
+     # Test texts
+     test_texts = [
+         "Hello, world!",
+         "안녕하세요, 반갑습니다.",
+         "The quick brown fox jumps over the lazy dog.",
+         "人工智能技术正在改变世界。",
+     ]
+
+     for text in test_texts:
+         print(f"\nOriginal: {text}")
+
+         # Compress
+         compressed = tokenizer.compress(text)
+         print(f"Compression ratio: {compressed['compression_ratio']:.1f}:1 ({compressed['num_tokens']} tokens)")
+
+         # Reconstruct
+         reconstructed = tokenizer.reconstruct(text, temperature=0.1)
+         print(f"Reconstructed: {reconstructed}")
+
+         # Character-level accuracy
+         min_len = min(len(text), len(reconstructed))
+         accuracy = sum(1 for i in range(min_len) if text[i] == reconstructed[i]) / len(text) * 100
+         print(f"Accuracy: {accuracy:.1f}%")
+
+     print("\n" + "="*60)
+     print("Test completed!")
+     print("="*60)
+
+
+ # Usage example
+ def example_usage():
+     """Simple usage example"""
+     # 1. Initialize the tokenizer
+     tokenizer = B2NLTokenizer()
+
+     # 2. Compress text
+     text = "안녕하세요, 반갑습니다!"
+     compressed = tokenizer.compress(text)
+     print(f"Compression result: {compressed['compression_ratio']:.1f}:1")
+
+     # 3. Reconstruct text
+     reconstructed = tokenizer.reconstruct(text)
+     print(f"Reconstruction result: {reconstructed}")
+
+     return tokenizer
+
+
+ if __name__ == "__main__":
+     test_tokenizer()
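
The simplest way to exercise the uploaded file is to run it directly (`python inference.py`), which calls test_tokenizer(). To drive the class from another script instead, a minimal sketch, assuming the v6.2.1 checkpoint has been downloaded locally (the path below is illustrative; the hardcoded default is machine-specific):

    from inference import B2NLTokenizer

    # Point at wherever epoch_100.pt was saved locally (assumed path)
    tok = B2NLTokenizer(checkpoint_path="checkpoints/v62/16.0/epoch_100.pt")

    stats = tok.compress("Hello, world!")     # dict with compression_ratio, num_tokens, ...
    print(f"{stats['compression_ratio']:.1f}:1")
    print(tok.reconstruct("Hello, world!"))   # round-trip reconstruction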