"""
B2NL-IntelligentTokenizer v6.2.1 - Progressive Byte-to-Natural Language Tokenizer

⚠️ IMPORTANT: Currently in AUTOREGRESSIVE MODE (Teacher Forcing Training)
- Current: ~500ms inference (accurate but slow)
- Coming Soon (November 2025): Non-autoregressive training (<50ms, 10x faster)

🚀 Purpose: Embedding Preprocessing Model for Inter-modal Communication
This model serves as a preprocessing layer that converts raw text into compressed
semantic embeddings, enabling efficient inter-modal communication between different
AI systems. By separating language understanding from task-specific inference,
it provides a universal representation layer for multi-modal AI applications.

Key Features:
- Fixed 16:1 compression ratio (48 bytes → 3 embeddings per chunk)
- Byte-level processing (no vocabulary required)
- 204 language support via FLORES-200 training
- Sliding window for texts > 48 bytes
"""

import gradio as gr
import torch
import sys
import io
import time
import math
from pathlib import Path

# Fix Windows Unicode
if sys.platform == 'win32':
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

# Add paths
sys.path.insert(0, 'core')

from unified_model import IntelligentTokenizerV62
from tokenizer import ByteTokenizerV62

class B2NLTokenizer:
    def __init__(self):
        self.model = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.load_model()

    def load_model(self):
        """Load model from HuggingFace or local"""
        try:
            # Try HuggingFace first
            from huggingface_hub import hf_hub_download
            checkpoint_path = hf_hub_download(
                repo_id="ggunio/B2NL-IntelligentTokenizer-v6.2.1",
                filename="pytorch_model.bin"
            )
            print(f"Loading from HuggingFace")
        except:
            # Try local paths
            checkpoint_paths = [
                "pytorch_model.bin",
                "checkpoints/v62/16.0/epoch_100.pt",
                "D:/intelligent-tokenizer/intelligent-tokenizer_v6.2.1/checkpoints/v62/16.0/epoch_100.pt"
            ]
            checkpoint_path = None
            for path in checkpoint_paths:
                if Path(path).exists():
                    checkpoint_path = path
                    break

            if not checkpoint_path:
                print("❌ Model not found")
                return

        # Load model
        self.model = IntelligentTokenizerV62()
        checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False)

        if 'model_state_dict' in checkpoint:
            self.model.load_state_dict(checkpoint['model_state_dict'])
        else:
            self.model.load_state_dict(checkpoint)

        self.model = self.model.to(self.device)
        self.model.eval()
        print(f"✅ Model loaded on {self.device}")

    def process_text(self, text, temperature=0.1):
        """Process text and return detailed results"""
        if not self.model or not text:
            return "Please enter text", "", ""

        try:
            start_time = time.time()

            # Calculate chunks and embeddings
            text_bytes = len(text.encode('utf-8'))

            # For texts > 48 bytes: sliding window with 8-byte overlap
            if text_bytes <= 48:
                num_chunks = 1
                num_embeddings = 3  # 1 chunk = 3 embeddings
            else:
                # Sliding window: first chunk 48 bytes, then slide by 40 bytes (8 overlap)
                num_chunks = 1 + math.ceil((text_bytes - 48) / 40)
                num_embeddings = num_chunks * 3

            # Reconstruct (full text, not truncated)
            with torch.no_grad():
                # Calculate appropriate max_length based on input
                max_gen_length = max(48, min(text_bytes + 10, 512))  # byte-level model: budget generation length in bytes, with headroom

                reconstructed = self.model.generate(text, temperature=temperature, max_length=max_gen_length)

                # Note: for texts > 48 bytes, sliding-window processing may not
                # fully reconstruct the input (current model limitation)
                full_reconstruction = reconstructed

            elapsed_time = (time.time() - start_time) * 1000

            # Calculate accuracy
            min_len = min(len(text), len(full_reconstruction))
            matches = sum(1 for i in range(min_len) if text[i] == full_reconstruction[i])
            accuracy = (matches / len(text)) * 100 if text else 0

            # Format results
            stats = f"""📊 **Compression Statistics**
• Input: {text_bytes} bytes ({len(text)} chars)
• Chunks: {num_chunks} chunk{"s" if num_chunks > 1 else ""} (48-byte chunks with 8-byte overlap for long texts)
• Embeddings generated: {num_embeddings} embedding vectors (3 per chunk)
• Compression ratio: 16:1 fixed (48 bytes → 3 embeddings)
• Processing time: {elapsed_time:.1f}ms (autoregressive mode - slow)
• Reconstruction accuracy: {accuracy:.1f}%

⚠️ **Current Mode**: Autoregressive (Teacher Forcing training only)
• Speed: ~500ms per generation
• Coming: Non-autoregressive training (10x faster)"""

            details = f"""🔤 **Original Text** ({len(text)} chars, {text_bytes} bytes):
{text}

🔄 **Reconstructed Text** ({len(full_reconstruction)} chars):
{full_reconstruction}

✅ **Match Rate**: {accuracy:.1f}% ({matches}/{len(text)} characters)

📝 **Note**: Reconstruction quality may decrease for texts > 48 bytes due to sliding window processing."""

            return stats, details, full_reconstruction

        except Exception as e:
            return f"Error: {str(e)}", "", ""

# Initialize
tokenizer = B2NLTokenizer()

# Gradio Interface
with gr.Blocks(title="B2NL v6.2.1", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 🚀 B2NL-IntelligentTokenizer v6.2.1

    ## 📖 What is this model?

    **B2NL (Byte-to-Natural Language)** is a progressive tokenizer that converts raw text into compressed semantic embeddings.
    Unlike traditional tokenizers that use fixed vocabularies, B2NL learns directly from bytes and generates dense embeddings
    that capture semantic meaning while achieving 16:1 compression.

    ### 🔬 How the 16:1 Compression Works

    ```
    Input: 48 bytes (including padding/special tokens)

    Processing: Byte-level analysis with learned boundaries

    Output: 3 embedding vectors (1280-dim each)
    ```

    **Key Innovation**: The model learns to identify **semantic boundaries** within the 48-byte window.
    Instead of splitting at arbitrary points, it discovers natural language units (words, morphemes, phrases)
    and encodes them into meaningful embeddings. This is why "Hello, world!" (13 bytes) still generates
    3 embeddings - the model pads to 48 bytes but learns which parts contain actual information.
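
    The chunk/embedding arithmetic is easy to state (a minimal sketch of the same rule the demo's
    `process_text` uses; the 40-byte stride comes from the 8-byte overlap):

    ```python
    import math

    def embedding_count(text: str) -> int:
        n = len(text.encode("utf-8"))  # byte length, not character length
        chunks = 1 if n <= 48 else 1 + math.ceil((n - 48) / 40)  # 48-byte window, 8-byte overlap
        return chunks * 3  # fixed 3 embeddings per chunk

    embedding_count("Hello, world!")  # 13 bytes -> 1 chunk -> 3 embeddings
    ```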

    ### 🎯 Why This Matters

    1. **Semantic Preservation**: Unlike byte-pair encoding (BPE) which can split words arbitrarily,
       B2NL respects semantic boundaries learned from data.

    2. **Language Agnostic**: No vocabulary needed - works equally well for all 204 languages.
       Korean "안녕하세요" and English "Hello" are processed the same way.

    3. **Predictable Costs**: Always 16:1 compression means predictable API costs for LLMs.
       48 bytes → 3 embeddings, always.

    4. **Inter-modal Bridge**: These embeddings can be used as a universal representation
       for cross-modal tasks (text→image, text→audio, etc.)

    ### 🎯 Real-World Applications

    - **LLM Cost Reduction**: vs. a typical BPE tokenizer (~4 bytes/token), 48 bytes ≈ 12 tokens vs. 3 embeddings, roughly 75% fewer units per API call
    - **Multilingual Search**: Single embedding space for 204 languages
    - **Edge AI**: Compressed representations for bandwidth-limited IoT devices
    - **Cross-modal AI**: Universal embeddings for multimodal models

    ### ⚙️ Technical Architecture

    - **Encoder**: 6 layers, progressive dimension reduction
    - **Decoder**: 6 layers with cross-attention, reconstructs from embeddings
    - **Boundary Learning**: Gumbel-Softmax for differentiable boundary detection (see the sketch below)
    - **Total Parameters**: 244.7M (137.9M encoder + 106.8M decoder)
    - **Training**: FLORES-200 (204 languages), 100 epochs, teacher forcing
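
    A minimal, hypothetical sketch of straight-through Gumbel-Softmax boundary detection
    (shapes and names are illustrative, not the model's actual modules):

    ```python
    import torch
    import torch.nn.functional as F

    # One 48-byte window, with a per-position score for "boundary here?" (2 classes)
    boundary_logits = torch.randn(1, 48, 2)

    # hard=True gives discrete one-hot decisions in the forward pass while
    # keeping soft (differentiable) gradients in the backward pass
    boundaries = F.gumbel_softmax(boundary_logits, tau=1.0, hard=True)[..., 1]  # (1, 48) of 0/1
    ```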

    ### ⚠️ Current Limitations

    - **Mode**: Autoregressive (teacher forcing only) - ~500ms per generation
    - **Long Texts**: Quality decreases for texts > 48 bytes (sliding window limitation)
    - **Coming Soon**: Non-autoregressive training (November 2025) for 10x speedup

    ---
    """)

    with gr.Tab("🔄 Reconstruction Test"):
        gr.Markdown("""
        Test how well the model compresses and reconstructs text. The model processes text in 48-byte chunks,
        generating 3 embedding vectors per chunk. For longer texts, it uses a sliding window with 8-byte overlap.
        """)

        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter any text in any of 204 languages...",
                    lines=5
                )

                gr.Examples(
                    examples=[
                        # Major languages
                        "Hello, world! How are you today?",
                        "안녕하세요, 반갑습니다. 오늘 날씨가 좋네요.",
                        "你好世界!今天天气很好。",
                        "こんにちは世界!今日はいい天気ですね。",
                        "Bonjour le monde! Comment allez-vous?",
                        "Hola mundo! ¿Cómo estás hoy?",
                        "Привет мир! Как дела?",
                        "مرحبا بالعالم! كيف حالك اليوم؟",
                        # Test different lengths
                        "Short",  # 5 bytes - 1 chunk, 3 embeddings
                        "This is exactly 48 bytes of text for one chunk!",  # 48 bytes - 1 chunk, 3 embeddings
                        "This is a longer text that exceeds 48 bytes and will need multiple chunks with sliding window processing.",  # >48 bytes - multiple chunks
                    ],
                    inputs=input_text,
                    label="Example texts (various lengths and languages)"
                )

                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.1,
                    step=0.1,
                    label="Temperature (0.1 = Most accurate, 1.0 = More creative)"
                )

                process_btn = gr.Button("🔄 Compress & Reconstruct", variant="primary", size="lg")

            with gr.Column():
                stats_output = gr.Markdown(label="Statistics")
                details_output = gr.Markdown(label="Details")

    with gr.Tab("📊 Batch Test"):
        gr.Markdown("""
        Test multiple texts at once to compare compression across different languages and lengths.
        Each text is processed independently, showing how the fixed 16:1 compression works across languages.
        """)

        batch_input = gr.Textbox(
            label="Enter multiple texts (one per line)",
            placeholder="Enter texts in different languages...\nOne text per line",
            lines=10,
            value="""The quick brown fox jumps over the lazy dog.
안녕하세요, 반갑습니다. 오늘 날씨가 정말 좋네요.
你好世界!今天天气很好,我们一起去散步吧。
こんにちは世界!今日はいい天気ですね。散歩に行きましょう。
Bonjour le monde! Comment allez-vous aujourd'hui?
مرحبا بالعالم! كيف حالك اليوم؟ الطقس جميل جداً.
Привет мир! Как дела? Погода сегодня прекрасная!
This text is exactly 48 bytes long for testing!
Short text
A much longer text that definitely exceeds 48 bytes and will require sliding window processing with 8-byte overlaps between chunks."""
        )

        batch_btn = gr.Button("🔄 Process Batch", variant="primary")
        batch_output = gr.Dataframe(
            headers=["Text", "Language", "Bytes", "Chunks", "Embeddings", "Accuracy"],
            label="Batch Results"
        )

    with gr.Tab("📖 Documentation"):
        gr.Markdown("""
        ## Understanding B2NL Tokenization

        ### 🔬 The Core Innovation: Learned Semantic Boundaries

        Traditional tokenizers use fixed rules (BPE, WordPiece) that can split words arbitrarily.
        B2NL learns to identify **semantic units** within byte sequences:

        ```
        Traditional BPE:  "안녕하세요" → "안", "녕", "하", "세", "요" (5 tokens)
        B2NL:            "안녕하세요" → [emb1, emb2, emb3] (3 embeddings capturing full meaning)
        ```

        ### 📐 The 48-Byte → 3 Embeddings Architecture

        ```
        [48 bytes input] → [Encoder] → [3 × 1280-dim embeddings] → [Decoder] → [48 bytes output]
                 ↑                              ↓
            (with padding)             (semantic compression)
        ```

        **Why 48 bytes?**
        - Optimal for GPU parallelization (divisible by 8, 16, 24)
        - Captures most words/phrases in any language
        - Allows consistent 16:1 compression ratio
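
        The divisibility claim above is easy to check:

        ```python
        all(48 % k == 0 for k in (8, 16, 24))  # True: 48 tiles evenly into 8/16/24-sized blocks
        ```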

        **Why 3 embeddings?**
        - Matches typical semantic units in 48-byte window
        - Provides redundancy for robust reconstruction
        - Optimal for transformer cross-attention

        ### 🌐 Language-Agnostic Processing

        The model treats all languages equally at the byte level:

        | Language | Sample Text | Bytes | Embeddings | Compression |
        |----------|------------|-------|------------|-------------|
        | English | "Hello" | 5 (+43 pad) | 3 | 16:1 |
        | Korean | "안녕하세요" | 15 (+33 pad) | 3 | 16:1 |
        | Chinese | "你好世界" | 12 (+36 pad) | 3 | 16:1 |
        | Arabic | "مرحبا" | 10 (+38 pad) | 3 | 16:1 |

        All get compressed to 3 embeddings, but the model learns which parts contain information.
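
        The byte counts in the table are plain UTF-8 lengths, which you can verify directly:

        ```python
        for s in ["Hello", "안녕하세요", "你好世界", "مرحبا"]:
            print(s, len(s.encode("utf-8")), "bytes")  # 5, 15, 12, 10 -> each padded to 48
        ```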

        ### 🔄 Sliding Window for Long Texts

        For texts exceeding 48 bytes:
        ```
        Text: "This is a very long sentence that exceeds 48 bytes..."

        Chunk 1: [Bytes 0-47]   → 3 embeddings
                      ↓ (8-byte overlap)
        Chunk 2: [Bytes 40-87]  → 3 embeddings
                      ↓ (8-byte overlap)
        Chunk 3: [Bytes 80-127] → 3 embeddings
        ```

        The 8-byte overlap preserves context across boundaries, preventing word splits.
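
        Equivalently, window starts advance by a stride of 40 bytes (the 48-byte window minus the
        8-byte overlap); a minimal sketch with a hypothetical helper:

        ```python
        def chunk_ranges(n_bytes: int, window: int = 48, overlap: int = 8):
            stride = window - overlap  # 40
            starts = range(0, max(n_bytes - overlap, 1), stride)
            return [(s, min(s + window, n_bytes)) for s in starts]

        chunk_ranges(128)  # [(0, 48), (40, 88), (80, 128)], matching the diagram above
        ```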

        ### Current Limitations

        1. **Speed**: ~500ms per generation (autoregressive mode)
        2. **Long Texts**: Quality decreases with multiple chunks
        3. **Training**: Only teacher forcing, no autoregressive training yet

        ### Upcoming Improvements (November 2025)

        - **Non-autoregressive training**: 10x speed improvement
        - **Better long text handling**: Improved sliding window
        - **Streaming support**: Real-time processing

        ---

        **Author**: Jinhyun Woo
        **Paper**: [Zenodo](https://zenodo.org/records/17116281)
        **GitHub**: [Woojiggun/intelligent-tokenizer](https://github.com/Woojiggun/intelligent-tokenizer)
        **Model**: [ggunio/B2NL-IntelligentTokenizer-v6.2.1](https://huggingface.co/ggunio/B2NL-IntelligentTokenizer-v6.2.1)
        """)

    # Connect functions
    process_btn.click(
        fn=lambda text, temp: tokenizer.process_text(text, temp)[:2],  # returns 3 values; only stats and details are displayed
        inputs=[input_text, temperature],
        outputs=[stats_output, details_output]
    )

    def process_batch(texts):
        if not texts:
            return []

        results = []
        for text in texts.strip().split('\n'):
            if not text.strip():
                continue

            # Process each text
            text = text.strip()
            text_bytes = len(text.encode('utf-8'))

            # Calculate chunks and embeddings
            if text_bytes <= 48:
                num_chunks = 1
                num_embeddings = 3
            else:
                num_chunks = 1 + math.ceil((text_bytes - 48) / 40)
                num_embeddings = num_chunks * 3

            # Get reconstruction
            stats, details, reconstructed = tokenizer.process_text(text, 0.1)

            # Detect language (simple Unicode-range heuristic; kana checked
            # first because Japanese text also contains CJK ideographs)
            if any(0x3040 <= ord(c) <= 0x30FF for c in text):  # hiragana + katakana
                lang = "Japanese"
            elif any(0xAC00 <= ord(c) <= 0xD7AF for c in text):  # Hangul syllables
                lang = "Korean"
            elif any(0x4E00 <= ord(c) <= 0x9FFF for c in text):  # CJK ideographs
                lang = "Chinese"
            elif any(0x0600 <= ord(c) <= 0x06FF for c in text):  # Arabic
                lang = "Arabic"
            elif any(0x0400 <= ord(c) <= 0x04FF for c in text):  # Cyrillic
                lang = "Russian"
            else:
                lang = "English/Latin"

            # Calculate accuracy
            if "Error" not in stats:
                min_len = min(len(text), len(reconstructed))
                matches = sum(1 for i in range(min_len) if text[i] == reconstructed[i])
                accuracy = (matches / len(text)) * 100 if text else 0

                results.append([
                    text[:50] + "..." if len(text) > 50 else text,
                    lang,
                    text_bytes,
                    num_chunks,
                    num_embeddings,
                    f"{accuracy:.1f}%"
                ])

        return results

    batch_btn.click(
        fn=process_batch,
        inputs=batch_input,
        outputs=batch_output
    )
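
    # Programmatic use (bypassing the UI), assuming the checkpoint loaded successfully;
    # process_text returns (stats_markdown, details_markdown, reconstruction):
    #   stats, details, reconstructed = tokenizer.process_text("Hello, world!", temperature=0.1)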

if __name__ == "__main__":
    app.launch()