ggunio committed · verified
Commit 7aabfdc · 1 Parent(s): 318d977

Upload demo_poc.py with huggingface_hub

Files changed (1):
demo_poc.py +266 -0
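
The commit message says the file was pushed with huggingface_hub. A minimal sketch of such an upload (the repo_id below is a placeholder, not taken from this page):

    from huggingface_hub import HfApi

    api = HfApi()  # uses the token from `huggingface-cli login` by default
    api.upload_file(
        path_or_fileobj="demo_poc.py",   # local file to push
        path_in_repo="demo_poc.py",      # destination path inside the repo
        repo_id="ggunio/<repo-name>",    # placeholder, not the actual repo id
        commit_message="Upload demo_poc.py with huggingface_hub",
    )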
demo_poc.py ADDED
@@ -0,0 +1,266 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ """
+ POC demo script - automatic chunked processing of long text.
+ """
+
+ import torch
+ import sys
+ import io
+ from pathlib import Path
+ import time
+
+ # Force UTF-8 output encoding
+ if sys.stdout.encoding != 'utf-8':
+     sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+     sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
+
+ sys.path.append(str(Path(__file__).parent))
+
+ from core.boundary_aware_model import BoundaryAwareTokenizerModel
+ from src.core.byte_tokenizer_v6 import ByteTokenizerV6
+
+ # Device
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ class IntelligentTokenizerPOC:
+     """Demo class for the POC."""
+
+     def __init__(self, checkpoint_path="checkpoints/unified/latest_checkpoint.pt"):
+         print("="*70)
+         print("INTELLIGENT TOKENIZER v6.0 - POC Demo")
+         print("="*70)
+         print(f"Device: {device}")
+         print("Loading checkpoint...")
+
+         # Load checkpoint
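+         # Note: weights_only=False lets torch.load unpickle arbitrary
+         # objects (needed here for the stored model_config dict), so the
+         # checkpoint must come from a trusted source.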
+         checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
+         self.model = BoundaryAwareTokenizerModel(**checkpoint['model_config'])
+         self.model.load_state_dict(checkpoint['model_state_dict'])
+         self.model = self.model.to(device)
+         self.model.eval()
+
+         self.tokenizer = ByteTokenizerV6()
+         self.max_chunk_size = 250  # slightly under 256 as a safety margin
+
+         print(f"Model loaded: Epoch {checkpoint['epoch']}, Loss {checkpoint['loss']:.4f}")
+         print("Current limitation: 256 bytes per chunk")
+         print("(Due to POC development constraints and limited GPU resources)")
+         print("="*70)
+         print()
+
+     def process_text(self, text: str, show_details=True):
+         """Process text, auto-splitting long inputs."""
+
+         # Convert to bytes
+         text_bytes = text.encode('utf-8')
+         total_bytes = len(text_bytes)
+
+         if show_details:
+             print(f"Input text: {text[:100]}..." if len(text) > 100 else f"Input text: {text}")
+             print(f"Total bytes: {total_bytes}")
+
+         # Auto-split when the input exceeds the chunk limit
+         if total_bytes > self.max_chunk_size:
+             chunks = self._split_text_safely(text)
+             if show_details:
+                 print(f"Auto-splitting into {len(chunks)} chunks (256 byte limit for POC)")
+                 print("Note: Production version will handle up to 4096+ bytes")
+                 print("-"*50)
+
+             results = []
+             total_compressed = 0
+
+             for i, chunk in enumerate(chunks):
+                 if show_details:
+                     print(f"\nChunk {i+1}/{len(chunks)}:")
+                 result = self._process_single_chunk(chunk, show_details)
+                 results.append(result)
+                 total_compressed += result['compressed_tokens']
+
+             # Overall statistics
+             if show_details:
+                 print("\n" + "="*50)
+                 print("OVERALL RESULTS:")
+                 print(f"Total input: {total_bytes} bytes")
+                 print(f"Total compressed: {total_compressed} tokens")
+                 print(f"Compression ratio: {total_bytes/total_compressed:.2f}x")
+                 print(f"Average accuracy: {sum(r['accuracy'] for r in results)/len(results):.1%}")
+
+             return results
+
+         else:
+             # Single-chunk path
+             return self._process_single_chunk(text, show_details)
+
+     def _split_text_safely(self, text: str):
+         """Split text safely on UTF-8 character boundaries."""
+         chunks = []
+         text_bytes = text.encode('utf-8')
+
+         start = 0
+         while start < len(text_bytes):
+             # Pick a tentative chunk end
+             end = min(start + self.max_chunk_size, len(text_bytes))
+
+             # Back up until the cut falls on a UTF-8 boundary (Hangul is 3 bytes per syllable)
+             while end > start and end < len(text_bytes):
+                 try:
+                     # Try decoding
+                     chunk = text_bytes[start:end].decode('utf-8')
+                     break
+                 except UnicodeDecodeError:
+                     # Not a valid boundary; step back one byte
+                     end -= 1
+
+             if end > start:
+                 chunk = text_bytes[start:end].decode('utf-8')
+                 chunks.append(chunk)
+                 start = end
+             else:
+                 break
+
+         return chunks
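+
+     # A fixed-offset cut can land mid-character: "한", for example, encodes
+     # to the three bytes ED 95 9C, so up to two backward steps are needed.
+     # An equivalent boundary check without try/except would skip UTF-8
+     # continuation bytes (those matching 0b10xxxxxx):
+     #
+     #     while end > start and (text_bytes[end] & 0xC0) == 0x80:
+     #         end -= 1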
+
+     def _process_single_chunk(self, text: str, show_details=True):
+         """Process a single chunk."""
+
+         # Encode to byte IDs
+         encoded = self.tokenizer.encode(text)
+         byte_ids = encoded['input_ids']
+         input_ids = torch.tensor([byte_ids], device=device)
+         attention_mask = torch.tensor([encoded['attention_mask']], device=device)
+
+         with torch.no_grad():
+             # Compress
+             start_time = time.time()
+             encoder_outputs = self.model.encoder(input_ids, attention_mask)
+             encoder_hidden = encoder_outputs['last_hidden_state']
+             compression_time = time.time() - start_time
+
+             compressed_tokens = encoder_hidden.shape[1]
+             compression_ratio = len(byte_ids) / compressed_tokens
+
+             # Reconstruction (teacher forcing)
+             if len(byte_ids) > 1:
+                 decoder_input = input_ids[:, :-1]
+                 labels = input_ids[:, 1:]
+
+                 outputs = self.model(
+                     input_ids=input_ids,
+                     attention_mask=attention_mask,
+                     decoder_input_ids=decoder_input,
+                     labels=labels,
+                     use_cross_attention=True
+                 )
+
+                 predictions = torch.argmax(outputs['logits'], dim=-1)
+                 accuracy = (predictions == labels).float().mean().item()
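+                 # Note: this accuracy is measured with teacher forcing (the
+                 # decoder is fed the ground-truth prefix at every step), so
+                 # it upper-bounds free-running reconstruction accuracy.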
+             else:
+                 accuracy = 1.0
+
+         if show_details:
+             print(f"  Input: {len(byte_ids)} bytes")
+             print(f"  Compressed: {compressed_tokens} tokens ({compression_ratio:.2f}x)")
+             print(f"  Accuracy: {accuracy:.1%}")
+             print(f"  Processing time: {compression_time*1000:.1f}ms")
+
+         return {
+             'text': text,
+             'input_bytes': len(byte_ids),
+             'compressed_tokens': compressed_tokens,
+             'compression_ratio': compression_ratio,
+             'accuracy': accuracy,
+             'time_ms': compression_time * 1000
+         }
+
+     def benchmark_languages(self):
+         """Multilingual benchmark."""
+         print("\n" + "="*70)
+         print("MULTILINGUAL BENCHMARK")
+         print("="*70)
+
+         test_samples = {
+             'English': "The quick brown fox jumps over the lazy dog",
+             'Korean': "안녕하세요. 오늘 날씨가 정말 좋네요",
+             'Chinese': "今天天气很好",
+             'Japanese': "こんにちは",
+             'Spanish': "Hola, ¿cómo estás?",
+             'Arabic': "مرحبا بك",
+             'Russian': "Привет, как дела?",
+         }
+
+         for lang, text in test_samples.items():
+             print(f"\n{lang}:")
+             self._process_single_chunk(text, show_details=True)
+
+     def explain_advantages(self):
+         """Summarize key advantages."""
+         print("\n" + "="*70)
+         print("KEY ADVANTAGES")
+         print("="*70)
+         print("""
+     1. PURE LEARNING-BASED
+        - No vocabulary files (260 fixed bytes vs 50K+ tokens)
+        - No language-specific rules
+        - Learns compression patterns from data
+
+     2. MULTILINGUAL EQUALITY
+        - All 204 languages treated equally
+        - No vocabulary bias towards English
+        - Better for low-resource languages
+
+     3. COMPRESSION CAPABILITY
+        - Current: 2-3x compression (POC stage)
+        - Target: 5-10x compression (with more training)
+        - API cost reduction: 50-80%
+
+     4. CURRENT LIMITATIONS (POC)
+        - 256 byte chunks (due to limited GPU resources)
+        - Will expand to 4096+ bytes post-POC
+        - Training on personal RTX 3060 (4 months development)
+
+     5. FUTURE ROADMAP
+        - Multimodal support (text + image + audio)
+        - Dynamic compression levels
+        - Real-time streaming mode
+     """)
+         print("="*70)
+
+ def main():
+     """Main demo."""
+     poc = IntelligentTokenizerPOC()
+
+     # 1. Short text demo
+     print("\n### SHORT TEXT DEMO ###")
+     poc.process_text("Hello, world!")
+     poc.process_text("안녕하세요. 반갑습니다.")
+
+     # 2. Long text auto-split demo
+     print("\n### LONG TEXT AUTO-SPLIT DEMO ###")
+     long_text = """
+     인공지능 기술이 빠르게 발전하고 있습니다. 특히 자연어 처리 분야에서
+     놀라운 성과를 보이고 있으며, 이는 우리의 일상생활에도 큰 영향을
+     미치고 있습니다. 앞으로 더 많은 혁신이 기대됩니다.
+
+     The development of artificial intelligence is accelerating rapidly.
+     Natural language processing, in particular, has shown remarkable progress,
+     significantly impacting our daily lives. We can expect even more innovations
+     in the near future.
+     """
+     poc.process_text(long_text)
+
+     # 3. Multilingual benchmark
+     poc.benchmark_languages()
+
+     # 4. Explain advantages
+     poc.explain_advantages()
+
+     print("\n" + "="*70)
+     print("POC DEMO COMPLETE")
+     print("Developed in 4 months by a solo developer with no prior AI experience")
+     print("Contact: [your contact info]")
+     print("="*70)
+
+ if __name__ == "__main__":
+     main()
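
To try the demo locally (assuming the repository's core/ and src/ modules and the checkpoint under checkpoints/unified/ are present, as the script expects):

    python demo_poc.py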