"""
B2NL-IntelligentTokenizer v6.2.1 - Progressive Byte-to-Natural Language Tokenizer

⚠️ IMPORTANT: Currently in AUTOREGRESSIVE MODE (Teacher Forcing Training)
- Current: ~500ms inference (accurate but slow)
- Coming Soon (November 2025): Non-autoregressive training (<50ms, 10x faster)

🚀 Purpose: Embedding Preprocessing Model for Inter-modal Communication
This model serves as a preprocessing layer that converts raw text into compressed
semantic embeddings, enabling efficient inter-modal communication between different
AI systems. By separating language understanding from task-specific inference,
it provides a universal representation layer for multi-modal AI applications.

Key Features:
- Fixed 16:1 compression ratio (48 bytes → 3 embeddings per chunk)
- Byte-level processing (no vocabulary required)
- 204 language support via FLORES-200 training
- Sliding window for texts > 48 bytes
"""

import gradio as gr
import torch
import sys
import io
import time
import math
from pathlib import Path

# Fix Windows Unicode
if sys.platform == 'win32':
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

# Add paths
sys.path.insert(0, 'core')

from unified_model import IntelligentTokenizerV62
from tokenizer import ByteTokenizerV62

class B2NLTokenizer:
    def __init__(self):
        self.model = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.load_model()

    def load_model(self):
        """Load model from HuggingFace or local"""
        try:
            # Try HuggingFace first
            from huggingface_hub import hf_hub_download
            checkpoint_path = hf_hub_download(
                repo_id="ggunio/B2NL-IntelligentTokenizer-v6.2.1",
                filename="pytorch_model.bin"
            )
            print(f"Loading from HuggingFace")
        except:
            # Try local paths
            checkpoint_paths = [
                "pytorch_model.bin",
                "checkpoints/v62/16.0/epoch_100.pt",
                "D:/intelligent-tokenizer/intelligent-tokenizer_v6.2.1/checkpoints/v62/16.0/epoch_100.pt"
            ]
            checkpoint_path = None
            for path in checkpoint_paths:
                if Path(path).exists():
                    checkpoint_path = path
                    break

            if not checkpoint_path:
                print("❌ Model not found")
                return

        # Load model
        self.model = IntelligentTokenizerV62()
        checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False)

        if 'model_state_dict' in checkpoint:
            self.model.load_state_dict(checkpoint['model_state_dict'])
        else:
            self.model.load_state_dict(checkpoint)

        self.model = self.model.to(self.device)
        self.model.eval()
        print(f"✅ Model loaded on {self.device}")

    def process_text(self, text, temperature=0.1):
        """Process text and return detailed results"""
        if not self.model or not text:
            return "Please enter text", "", ""

        try:
            start_time = time.time()

            # Calculate chunks and embeddings
            text_bytes = len(text.encode('utf-8'))

            # For texts > 48 bytes: sliding window with 8-byte overlap
            if text_bytes <= 48:
                num_chunks = 1
                num_embeddings = 3  # 1 chunk = 3 embeddings
            else:
                # Sliding window: first chunk 48 bytes, then slide by 40 bytes (8 overlap)
                num_chunks = 1 + math.ceil((text_bytes - 48) / 40)
                num_embeddings = num_chunks * 3

            # Reconstruct (full text, not truncated)
            with torch.no_grad():
                # Calculate appropriate max_length based on input
                max_gen_length = max(48, min(text_bytes + 10, 512))  # byte-level model: budget generation length in bytes, with headroom

                reconstructed = self.model.generate(text, temperature=temperature, max_length=max_gen_length)

                # Note: for texts > 48 bytes, sliding-window processing may not
                # fully reconstruct the input (current model limitation)
                full_reconstruction = reconstructed

            elapsed_time = (time.time() - start_time) * 1000

            # Calculate accuracy
            min_len = min(len(text), len(full_reconstruction))
            matches = sum(1 for i in range(min_len) if text[i] == full_reconstruction[i])
            accuracy = (matches / len(text)) * 100 if text else 0

            # Format results
            stats = f"""📊 **Compression Statistics**
• Input: {text_bytes} bytes ({len(text)} chars)
• Chunks: {num_chunks} chunk{"s" if num_chunks > 1 else ""} (48-byte chunks with 8-byte overlap for long texts)
• Embeddings generated: {num_embeddings} embedding vectors (3 per chunk)
• Compression ratio: 16:1 fixed (48 bytes → 3 embeddings)
• Processing time: {elapsed_time:.1f}ms (autoregressive mode - slow)
• Reconstruction accuracy: {accuracy:.1f}%

⚠️ **Current Mode**: Autoregressive (Teacher Forcing training only)
• Speed: ~500ms per generation
• Coming: Non-autoregressive training (10x faster)"""

            details = f"""🔤 **Original Text** ({len(text)} chars, {text_bytes} bytes):
{text}

🔄 **Reconstructed Text** ({len(full_reconstruction)} chars):
{full_reconstruction}

✅ **Match Rate**: {accuracy:.1f}% ({matches}/{len(text)} characters)

📝 **Note**: Reconstruction quality may decrease for texts > 48 bytes due to sliding window processing."""

            return stats, details, full_reconstruction

        except Exception as e:
            return f"Error: {str(e)}", "", ""

# Initialize
tokenizer = B2NLTokenizer()

# Gradio Interface
with gr.Blocks(title="B2NL v6.2.1", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 🚀 B2NL-IntelligentTokenizer v6.2.1

    ## 📖 What is this model?

    **B2NL (Byte-to-Natural Language)** is a progressive tokenizer that converts raw text into compressed semantic embeddings.
    Unlike traditional tokenizers that use fixed vocabularies, B2NL learns directly from bytes and generates dense embeddings
    that capture semantic meaning while achieving 16:1 compression.

    ### 🔬 How the 16:1 Compression Works

    ```
    Input: 48 bytes (including padding/special tokens)

    Processing: Byte-level analysis with learned boundaries

    Output: 3 embedding vectors (1280-dim each)
    ```

    **Key Innovation**: The model learns to identify **semantic boundaries** within the 48-byte window.
    Instead of splitting at arbitrary points, it discovers natural language units (words, morphemes, phrases)
    and encodes them into meaningful embeddings. This is why "Hello, world!" (13 bytes) still generates
    3 embeddings - the model pads to 48 bytes but learns which parts contain actual information.
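
    The chunk/embedding arithmetic is easy to state (a minimal sketch of the same rule the demo's
    `process_text` uses; the 40-byte stride comes from the 8-byte overlap):

    ```python
    import math

    def embedding_count(text: str) -> int:
        n = len(text.encode("utf-8"))  # byte length, not character length
        chunks = 1 if n <= 48 else 1 + math.ceil((n - 48) / 40)  # 48-byte window, 8-byte overlap
        return chunks * 3  # fixed 3 embeddings per chunk

    embedding_count("Hello, world!")  # 13 bytes -> 1 chunk -> 3 embeddings
    ```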

    ### 🎯 Why This Matters

    1. **Semantic Preservation**: Unlike byte-pair encoding (BPE) which can split words arbitrarily,
       B2NL respects semantic boundaries learned from data.

    2. **Language Agnostic**: No vocabulary needed - works equally well for all 204 languages.
       Korean "안녕하세요" and English "Hello" are processed the same way.

    3. **Predictable Costs**: Always 16:1 compression means predictable API costs for LLMs.
       48 bytes → 3 embeddings, always.

    4. **Inter-modal Bridge**: These embeddings can be used as a universal representation
       for cross-modal tasks (text→image, text→audio, etc.)

    ### 🎯 Real-World Applications

    - **LLM Cost Reduction**: vs. a typical BPE tokenizer (~4 bytes/token), 48 bytes ≈ 12 tokens vs. 3 embeddings, roughly 75% fewer units per API call
    - **Multilingual Search**: Single embedding space for 204 languages
    - **Edge AI**: Compressed representations for bandwidth-limited IoT devices
    - **Cross-modal AI**: Universal embeddings for multimodal models

    ### ⚙️ Technical Architecture

    - **Encoder**: 6 layers, progressive dimension reduction
    - **Decoder**: 6 layers with cross-attention, reconstructs from embeddings
    - **Boundary Learning**: Gumbel-Softmax for differentiable boundary detection (see the sketch below)
    - **Total Parameters**: 244.7M (137.9M encoder + 106.8M decoder)
    - **Training**: FLORES-200 (204 languages), 100 epochs, teacher forcing
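
    A minimal, hypothetical sketch of straight-through Gumbel-Softmax boundary detection
    (shapes and names are illustrative, not the model's actual modules):

    ```python
    import torch
    import torch.nn.functional as F

    # One 48-byte window, with a per-position score for "boundary here?" (2 classes)
    boundary_logits = torch.randn(1, 48, 2)

    # hard=True gives discrete one-hot decisions in the forward pass while
    # keeping soft (differentiable) gradients in the backward pass
    boundaries = F.gumbel_softmax(boundary_logits, tau=1.0, hard=True)[..., 1]  # (1, 48) of 0/1
    ```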

    ### ⚠️ Current Limitations

    - **Mode**: Autoregressive (teacher forcing only) - ~500ms per generation
    - **Long Texts**: Quality decreases for texts > 48 bytes (sliding window limitation)
    - **Coming Soon**: Non-autoregressive training (November 2025) for 10x speedup

    ---
    """)

    with gr.Tab("🔄 Reconstruction Test"):
        gr.Markdown("""
        Test how well the model compresses and reconstructs text. The model processes text in 48-byte chunks,
        generating 3 embedding vectors per chunk. For longer texts, it uses a sliding window with 8-byte overlap.
        """)

        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter any text in any of 204 languages...",
                    lines=5
                )

                gr.Examples(
                    examples=[
                        # Major languages
                        "Hello, world! How are you today?",
                        "안녕하세요, 반갑습니다. 오늘 날씨가 좋네요.",
                        "你好世界!今天天气很好。",
                        "こんにちは世界!今日はいい天気ですね。",
                        "Bonjour le monde! Comment allez-vous?",
                        "Hola mundo! ¿Cómo estás hoy?",
                        "Привет мир! Как дела?",
                        "مرحبا بالعالم! كيف حالك اليوم؟",
                        # Test different lengths
                        "Short",  # 5 bytes - 1 chunk, 3 embeddings
                        "This is exactly 48 bytes of text for one chunk!",  # 48 bytes - 1 chunk, 3 embeddings
                        "This is a longer text that exceeds 48 bytes and will need multiple chunks with sliding window processing.",  # >48 bytes - multiple chunks
                    ],
                    inputs=input_text,
                    label="Example texts (various lengths and languages)"
                )

                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.1,
                    step=0.1,
                    label="Temperature (0.1 = Most accurate, 1.0 = More creative)"
                )

                process_btn = gr.Button("🔄 Compress & Reconstruct", variant="primary", size="lg")

            with gr.Column():
                stats_output = gr.Markdown(label="Statistics")
                details_output = gr.Markdown(label="Details")

    with gr.Tab("📊 Batch Test"):
        gr.Markdown("""
        Test multiple texts at once to compare compression across different languages and lengths.
        Each text is processed independently, showing how the fixed 16:1 compression works across languages.
        """)

        batch_input = gr.Textbox(
            label="Enter multiple texts (one per line)",
            placeholder="Enter texts in different languages...\nOne text per line",
            lines=10,
            value="""The quick brown fox jumps over the lazy dog.
안녕하세요, 반갑습니다. 오늘 날씨가 정말 좋네요.
你好世界!今天天气很好,我们一起去散步吧。
こんにちは世界!今日はいい天気ですね。散歩に行きましょう。
Bonjour le monde! Comment allez-vous aujourd'hui?
مرحبا بالعالم! كيف حالك اليوم؟ الطقس جميل جداً.
Привет мир! Как дела? Погода сегодня прекрасная!
This text is exactly 48 bytes long for testing!
Short text
A much longer text that definitely exceeds 48 bytes and will require sliding window processing with 8-byte overlaps between chunks."""
        )

        batch_btn = gr.Button("🔄 Process Batch", variant="primary")
        batch_output = gr.Dataframe(
            headers=["Text", "Language", "Bytes", "Chunks", "Embeddings", "Accuracy"],
            label="Batch Results"
        )

    with gr.Tab("📖 Documentation"):
        gr.Markdown("""
        ## Understanding B2NL Tokenization

        ### 🔬 The Core Innovation: Learned Semantic Boundaries

        Traditional tokenizers use fixed rules (BPE, WordPiece) that can split words arbitrarily.
        B2NL learns to identify **semantic units** within byte sequences:

        ```
        Traditional BPE:  "안녕하세요" → "안", "녕", "하", "세", "요" (5 tokens)
        B2NL:            "안녕하세요" → [emb1, emb2, emb3] (3 embeddings capturing full meaning)
        ```

        ### 📐 The 48-Byte → 3 Embeddings Architecture

        ```
        [48 bytes input] → [Encoder] → [3 × 1280-dim embeddings] → [Decoder] → [48 bytes output]
                 ↑                              ↓
            (with padding)             (semantic compression)
        ```

        **Why 48 bytes?**
        - Optimal for GPU parallelization (divisible by 8, 16, 24)
        - Captures most words/phrases in any language
        - Allows consistent 16:1 compression ratio
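
        The divisibility claim above is easy to check:

        ```python
        all(48 % k == 0 for k in (8, 16, 24))  # True: 48 tiles evenly into 8/16/24-sized blocks
        ```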

        **Why 3 embeddings?**
        - Matches typical semantic units in 48-byte window
        - Provides redundancy for robust reconstruction
        - Optimal for transformer cross-attention

        ### 🌐 Language-Agnostic Processing

        The model treats all languages equally at the byte level:

        | Language | Sample Text | Bytes | Embeddings | Compression |
        |----------|------------|-------|------------|-------------|
        | English | "Hello" | 5 (+43 pad) | 3 | 16:1 |
        | Korean | "안녕하세요" | 15 (+33 pad) | 3 | 16:1 |
        | Chinese | "你好世界" | 12 (+36 pad) | 3 | 16:1 |
        | Arabic | "مرحبا" | 10 (+38 pad) | 3 | 16:1 |

        All get compressed to 3 embeddings, but the model learns which parts contain information.
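
        The byte counts in the table are plain UTF-8 lengths, which you can verify directly:

        ```python
        for s in ["Hello", "안녕하세요", "你好世界", "مرحبا"]:
            print(s, len(s.encode("utf-8")), "bytes")  # 5, 15, 12, 10 -> each padded to 48
        ```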

        ### 🔄 Sliding Window for Long Texts

        For texts exceeding 48 bytes:
        ```
        Text: "This is a very long sentence that exceeds 48 bytes..."

        Chunk 1: [Bytes 0-47]   → 3 embeddings
                      ↓ (8-byte overlap)
        Chunk 2: [Bytes 40-87]  → 3 embeddings
                      ↓ (8-byte overlap)
        Chunk 3: [Bytes 80-127] → 3 embeddings
        ```

        The 8-byte overlap preserves context across boundaries, preventing word splits.
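
        Equivalently, window starts advance by a stride of 40 bytes (the 48-byte window minus the
        8-byte overlap); a minimal sketch with a hypothetical helper:

        ```python
        def chunk_ranges(n_bytes: int, window: int = 48, overlap: int = 8):
            stride = window - overlap  # 40
            starts = range(0, max(n_bytes - overlap, 1), stride)
            return [(s, min(s + window, n_bytes)) for s in starts]

        chunk_ranges(128)  # [(0, 48), (40, 88), (80, 128)], matching the diagram above
        ```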

        ### Current Limitations

        1. **Speed**: ~500ms per generation (autoregressive mode)
        2. **Long Texts**: Quality decreases with multiple chunks
        3. **Training**: Only teacher forcing, no autoregressive training yet

        ### Upcoming Improvements (November 2025)

        - **Non-autoregressive training**: 10x speed improvement
        - **Better long text handling**: Improved sliding window
        - **Streaming support**: Real-time processing

        ---

        **Author**: Jinhyun Woo
        **Paper**: [Zenodo](https://zenodo.org/records/17116281)
        **GitHub**: [Woojiggun/intelligent-tokenizer](https://github.com/Woojiggun/intelligent-tokenizer)
        **Model**: [ggunio/B2NL-IntelligentTokenizer-v6.2.1](https://huggingface.co/ggunio/B2NL-IntelligentTokenizer-v6.2.1)
        """)

    # Connect functions
    process_btn.click(
        fn=lambda text, temp: tokenizer.process_text(text, temp)[:2],  # returns 3 values; only stats and details are displayed
        inputs=[input_text, temperature],
        outputs=[stats_output, details_output]
    )

    def process_batch(texts):
        if not texts:
            return []

        results = []
        for text in texts.strip().split('\n'):
            if not text.strip():
                continue

            # Process each text
            text = text.strip()
            text_bytes = len(text.encode('utf-8'))

            # Calculate chunks and embeddings
            if text_bytes <= 48:
                num_chunks = 1
                num_embeddings = 3
            else:
                num_chunks = 1 + math.ceil((text_bytes - 48) / 40)
                num_embeddings = num_chunks * 3

            # Get reconstruction
            stats, details, reconstructed = tokenizer.process_text(text, 0.1)

            # Detect language (simple Unicode-range heuristic; kana checked
            # first because Japanese text also contains CJK ideographs)
            if any(0x3040 <= ord(c) <= 0x30FF for c in text):  # hiragana + katakana
                lang = "Japanese"
            elif any(0xAC00 <= ord(c) <= 0xD7AF for c in text):  # Hangul syllables
                lang = "Korean"
            elif any(0x4E00 <= ord(c) <= 0x9FFF for c in text):  # CJK ideographs
                lang = "Chinese"
            elif any(0x0600 <= ord(c) <= 0x06FF for c in text):  # Arabic
                lang = "Arabic"
            elif any(0x0400 <= ord(c) <= 0x04FF for c in text):  # Cyrillic
                lang = "Russian"
            else:
                lang = "English/Latin"

            # Calculate accuracy
            if "Error" not in stats:
                min_len = min(len(text), len(reconstructed))
                matches = sum(1 for i in range(min_len) if text[i] == reconstructed[i])
                accuracy = (matches / len(text)) * 100 if text else 0

                results.append([
                    text[:50] + "..." if len(text) > 50 else text,
                    lang,
                    text_bytes,
                    num_chunks,
                    num_embeddings,
                    f"{accuracy:.1f}%"
                ])

        return results

    batch_btn.click(
        fn=process_batch,
        inputs=batch_input,
        outputs=batch_output
    )
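
    # Programmatic use (bypassing the UI), assuming the checkpoint loaded successfully;
    # process_text returns (stats_markdown, details_markdown, reconstruction):
    #   stats, details, reconstructed = tokenizer.process_text("Hello, world!", temperature=0.1)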

if __name__ == "__main__":
    app.launch()