ggunio committed
Commit 318d977 · verified · 1 Parent(s): 51bc354

Upload inference.py with huggingface_hub
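
For reference, an upload like this is typically produced with the hub client's `upload_file` helper. A minimal sketch (the repo id below is illustrative, not taken from this commit):

from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="inference.py",
    path_in_repo="inference.py",
    repo_id="ggunio/intelligent-tokenizer-v6",  # illustrative repo id
    commit_message="Upload inference.py with huggingface_hub",
)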

Files changed (1)
  1. inference.py +297 -0
inference.py ADDED
@@ -0,0 +1,297 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ """
+ Intelligent Tokenizer v6.0 - Inference Module
+ Embedding and restoration functionality
+ """
+
+ import torch
+ import sys
+ import io
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple
+
+ # UTF-8 encoding setup
+ if sys.stdout.encoding != 'utf-8':
+     sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+     sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
+
+ sys.path.append(str(Path(__file__).parent))
+
+ from core.boundary_aware_model import BoundaryAwareTokenizerModel
+ from src.core.byte_tokenizer_v6 import ByteTokenizerV6
+
+
+ class IntelligentTokenizer:
+     """Intelligent Tokenizer for embedding and restoration"""
+
+     def __init__(self, checkpoint_path: str = "checkpoints/latest_checkpoint.pt", device: Optional[str] = None):
+         """
+         Initialize tokenizer
+
+         Args:
+             checkpoint_path: Path to model checkpoint
+             device: Device to use ('cuda', 'cpu', or None for auto)
+         """
+         if device is None:
+             self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         else:
+             self.device = torch.device(device)
+
+         print("Initializing Intelligent Tokenizer v6.0...")
+         print(f"Device: {self.device}")
+
+         # Load checkpoint
+         checkpoint_path = Path(checkpoint_path)
+         if not checkpoint_path.exists():
+             raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")
+
+         checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False)
+
+         # Initialize model
+         self.model = BoundaryAwareTokenizerModel(**checkpoint['model_config'])
+         self.model.load_state_dict(checkpoint['model_state_dict'])
+         self.model = self.model.to(self.device)
+         self.model.eval()
+
+         # Initialize tokenizer
+         self.tokenizer = ByteTokenizerV6()
+         self.max_chunk_size = 250  # Safe margin under the 256-byte limit
+
+         print(f"Model loaded: Epoch {checkpoint['epoch']}, Loss {checkpoint['loss']:.4f}")
+         print("Ready for inference!")
+
+     def embed(self, text: str) -> torch.Tensor:
+         """
+         Convert text to embeddings
+
+         Args:
+             text: Input text
+
+         Returns:
+             Embedding tensor
+         """
+         # Handle long text by chunking
+         if len(text.encode('utf-8')) > self.max_chunk_size:
+             chunks = self._split_text_safely(text)
+             embeddings = []
+
+             for chunk in chunks:
+                 emb = self._embed_single(chunk)
+                 embeddings.append(emb)
+
+             # Concatenate chunk embeddings along the sequence dimension
+             return torch.cat(embeddings, dim=1)
+         else:
+             return self._embed_single(text)
+
+     def _embed_single(self, text: str) -> torch.Tensor:
+         """Embed a single chunk"""
+         # Encode text to byte IDs
+         encoded = self.tokenizer.encode(text)
+         byte_ids = encoded['input_ids']
+         input_ids = torch.tensor([byte_ids], device=self.device)
+         attention_mask = torch.tensor([encoded['attention_mask']], device=self.device)
+
+         with torch.no_grad():
+             # Get embeddings from the encoder
+             encoder_outputs = self.model.encoder(input_ids, attention_mask)
+             embeddings = encoder_outputs['last_hidden_state']
+
+         return embeddings
+
+     def restore(self, text: str) -> Tuple[str, float]:
+         """
+         Test restoration capability
+
+         Args:
+             text: Input text
+
+         Returns:
+             Tuple of (restored_text, accuracy)
+         """
+         # Handle long text by chunking
+         if len(text.encode('utf-8')) > self.max_chunk_size:
+             chunks = self._split_text_safely(text)
+             restored_chunks = []
+             accuracies = []
+
+             for chunk in chunks:
+                 restored, acc = self._restore_single(chunk)
+                 restored_chunks.append(restored)
+                 accuracies.append(acc)
+
+             return ''.join(restored_chunks), sum(accuracies) / len(accuracies)
+         else:
+             return self._restore_single(text)
+
+     def _restore_single(self, text: str) -> Tuple[str, float]:
+         """Restore a single chunk"""
+         # Encode text
+         encoded = self.tokenizer.encode(text)
+         byte_ids = encoded['input_ids']
+
+         if len(byte_ids) <= 1:
+             return text, 1.0
+
+         input_ids = torch.tensor([byte_ids], device=self.device)
+         attention_mask = torch.tensor([encoded['attention_mask']], device=self.device)
+
+         with torch.no_grad():
+             # Teacher forcing for the restoration test: the decoder sees the
+             # ground-truth prefix and predicts the next byte at each position
+             decoder_input = input_ids[:, :-1]
+             labels = input_ids[:, 1:]
+
+             outputs = self.model(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask,
+                 decoder_input_ids=decoder_input,
+                 labels=labels,
+                 use_cross_attention=True
+             )
+
+             # Per-position next-byte prediction accuracy
+             predictions = torch.argmax(outputs['logits'], dim=-1)
+             accuracy = (predictions == labels).float().mean().item()
+
+         # Decode predictions
+         try:
+             pred_list = predictions[0].cpu().tolist()
+             # Prepend BOS so the sequence aligns with the full input
+             full_sequence = [self.tokenizer.BOS] + pred_list
+
+             # Keep only raw byte values; special tokens fall outside 0-255
+             filtered = [b for b in full_sequence if 0 <= b < 256]
+             if filtered:
+                 restored_bytes = bytes(filtered)
+                 restored_text = restored_bytes.decode('utf-8', errors='ignore')
+             else:
+                 restored_text = ""
+         except Exception as e:
+             print(f"Restoration error: {e}")
+             restored_text = ""
+
+         return restored_text, accuracy
+
+     def compress(self, text: str) -> Dict:
+         """
+         Get compression statistics
+
+         Args:
+             text: Input text
+
+         Returns:
+             Dict with compression info
+         """
+         text_bytes = text.encode('utf-8')
+         embeddings = self.embed(text)
+
+         original_size = len(text_bytes)
+         compressed_size = embeddings.shape[1]
+         compression_ratio = original_size / compressed_size if compressed_size > 0 else 0
+
+         return {
+             'original_bytes': original_size,
+             'compressed_tokens': compressed_size,
+             'compression_ratio': compression_ratio,
+             'embedding_shape': list(embeddings.shape)
+         }
+
+     def _split_text_safely(self, text: str) -> List[str]:
+         """Split text into chunks at valid UTF-8 boundaries"""
+         chunks = []
+         text_bytes = text.encode('utf-8')
+
+         start = 0
+         while start < len(text_bytes):
+             end = min(start + self.max_chunk_size, len(text_bytes))
+
+             # Back up until the slice ends on a valid UTF-8 boundary
+             while end > start and end < len(text_bytes):
+                 try:
+                     text_bytes[start:end].decode('utf-8')
+                     break
+                 except UnicodeDecodeError:
+                     end -= 1
+
+             if end > start:
+                 chunk = text_bytes[start:end].decode('utf-8')
+                 chunks.append(chunk)
+                 start = end
+             else:
+                 break
+
+         return chunks
+
+
+ def test_model():
+     """Test model functionality"""
+     print("="*70)
+     print("INTELLIGENT TOKENIZER v6.0 - FUNCTIONALITY TEST")
+     print("="*70)
+
+     # Initialize tokenizer
+     tokenizer = IntelligentTokenizer()
+
+     # Test samples
+     test_samples = [
+         ("English", "Hello, world!"),
+         ("Korean", "안녕하세요. 반갑습니다."),
+         ("Chinese", "今天天气很好"),
+         ("Japanese", "こんにちは"),
+         ("Arabic", "مرحبا بك"),
+         ("Russian", "Привет, как дела?"),
+         ("Emoji", "Hello 👋 World 🌍!"),
+     ]
+
+     print("\n" + "="*70)
+     print("EMBEDDING & RESTORATION TESTS")
+     print("="*70)
+
+     total_accuracy = 0
+     successful = 0
+
+     for lang, text in test_samples:
+         print(f"\n[{lang}]")
+         print(f"Original: {text}")
+
+         # Test embedding
+         embeddings = tokenizer.embed(text)
+         print(f"Embedding: {embeddings.shape}")
+
+         # Test compression
+         compression = tokenizer.compress(text)
+         print(f"Compression: {compression['original_bytes']} bytes → {compression['compressed_tokens']} tokens")
+         print(f"Ratio: {compression['compression_ratio']:.2f}x")
+
+         # Test restoration
+         restored, accuracy = tokenizer.restore(text)
+         print(f"Restored: {restored}")
+         print(f"Accuracy: {accuracy:.1%}")
+
+         if accuracy > 0.7:
+             successful += 1
+         total_accuracy += accuracy  # accumulate over all samples for the average below
+
+     # Summary
+     print("\n" + "="*70)
+     print("TEST SUMMARY")
+     print("="*70)
+     print(f"Tests passed: {successful}/{len(test_samples)}")
+     print(f"Average accuracy: {total_accuracy/len(test_samples):.1%}")
+
+     if successful == len(test_samples):
+         print("\n✅ ALL TESTS PASSED!")
+         return True
+     elif successful >= len(test_samples) * 0.7:
+         print("\n⚠️ PARTIAL SUCCESS (70%+ tests passed)")
+         return True
+     else:
+         print("\n❌ TESTS FAILED")
+         return False
+
+
+ if __name__ == "__main__":
+     success = test_model()
+     sys.exit(0 if success else 1)
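
For quick local testing, a minimal usage sketch along these lines should work, assuming the checkpoint file and the core/src.core modules are laid out as the imports above expect:

from inference import IntelligentTokenizer

tokenizer = IntelligentTokenizer()                  # loads checkpoints/latest_checkpoint.pt
emb = tokenizer.embed("Hello, world!")              # encoder hidden states, shape (1, seq_len, hidden)
stats = tokenizer.compress("Hello, world!")         # byte count vs. compressed token count
restored, acc = tokenizer.restore("Hello, world!")  # round-trip restoration check
print(stats['compression_ratio'], restored, acc)

Note that restore() scores teacher-forced next-byte prediction (the decoder is fed ground-truth inputs), so the reported accuracy does not measure free-running autoregressive restoration.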