"""
B2NL-IntelligentTokenizer v6.2.1 - Progressive Byte-to-Natural Language Tokenizer
⚠️ IMPORTANT: Currently in AUTOREGRESSIVE MODE (Teacher Forcing Training)
- Current: ~500ms inference (accurate but slow)
- Coming Soon (November 2025): Non-autoregressive training (<50ms, 10x faster)
πŸš€ Purpose: Embedding Preprocessing Model for Inter-modal Communication
This model serves as a preprocessing layer that converts raw text into compressed
semantic embeddings, enabling efficient inter-modal communication between different
AI systems. By separating language understanding from task-specific inference,
it provides a universal representation layer for multi-modal AI applications.
Key Features:
- Fixed 16:1 compression ratio (48 bytes β†’ 3 embeddings per chunk)
- Byte-level processing (no vocabulary required)
- Support for 204 languages (trained on FLORES-200)
- Sliding window for texts > 48 bytes
"""
import gradio as gr
import torch
import sys
import io
import time
import math
from pathlib import Path
# Fix Windows Unicode
if sys.platform == 'win32':
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
# Add paths
sys.path.insert(0, 'core')
from unified_model import IntelligentTokenizerV62
from tokenizer import ByteTokenizerV62
class B2NLTokenizer:
def __init__(self):
self.model = None
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.load_model()
def load_model(self):
"""Load model from HuggingFace or local"""
try:
# Try HuggingFace first
from huggingface_hub import hf_hub_download
checkpoint_path = hf_hub_download(
repo_id="ggunio/B2NL-IntelligentTokenizer-v6.2.1",
filename="pytorch_model.bin"
)
print(f"Loading from HuggingFace")
        except Exception:
# Try local paths
checkpoint_paths = [
"pytorch_model.bin",
"checkpoints/v62/16.0/epoch_100.pt",
"D:/intelligent-tokenizer/intelligent-tokenizer_v6.2.1/checkpoints/v62/16.0/epoch_100.pt"
]
checkpoint_path = None
for path in checkpoint_paths:
if Path(path).exists():
checkpoint_path = path
break
if not checkpoint_path:
print("❌ Model not found")
return
# Load model
self.model = IntelligentTokenizerV62()
checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False)
if 'model_state_dict' in checkpoint:
self.model.load_state_dict(checkpoint['model_state_dict'])
else:
self.model.load_state_dict(checkpoint)
self.model = self.model.to(self.device)
self.model.eval()
print(f"βœ… Model loaded on {self.device}")
def process_text(self, text, temperature=0.1):
"""Process text and return detailed results"""
if not self.model or not text:
return "Please enter text", "", ""
try:
start_time = time.time()
# Calculate chunks and embeddings
text_bytes = len(text.encode('utf-8'))
# For texts > 48 bytes: sliding window with 8-byte overlap
if text_bytes <= 48:
num_chunks = 1
num_embeddings = 3 # 1 chunk = 3 embeddings
else:
# Sliding window: first chunk 48 bytes, then slide by 40 bytes (8 overlap)
num_chunks = 1 + math.ceil((text_bytes - 48) / 40)
num_embeddings = num_chunks * 3
# Reconstruct (full text, not truncated)
with torch.no_grad():
# Calculate appropriate max_length based on input
max_gen_length = max(48, min(len(text) + 10, 512)) # Allow some extra space
reconstructed = self.model.generate(text, temperature=temperature, max_length=max_gen_length)
            # Note: very long inputs may not reconstruct perfectly yet; this is a
            # known limitation of the current sliding-window (autoregressive) mode.
            full_reconstruction = reconstructed
elapsed_time = (time.time() - start_time) * 1000
# Calculate accuracy
min_len = min(len(text), len(full_reconstruction))
matches = sum(1 for i in range(min_len) if text[i] == full_reconstruction[i])
accuracy = (matches / len(text)) * 100 if text else 0
# Format results
stats = f"""πŸ“Š **Compression Statistics**
β€’ Input: {text_bytes} bytes ({len(text)} chars)
β€’ Chunks: {num_chunks} chunk{"s" if num_chunks > 1 else ""} (48-byte chunks with 8-byte overlap for long texts)
β€’ Embeddings generated: {num_embeddings} embedding vectors (3 per chunk)
β€’ Compression ratio: 16:1 fixed (48 bytes β†’ 3 embeddings)
β€’ Processing time: {elapsed_time:.1f}ms (autoregressive mode - slow)
β€’ Reconstruction accuracy: {accuracy:.1f}%
⚠️ **Current Mode**: Autoregressive (Teacher Forcing training only)
β€’ Speed: ~500ms per generation
β€’ Coming: Non-autoregressive training (10x faster)"""
details = f"""πŸ”€ **Original Text** ({len(text)} chars, {text_bytes} bytes):
{text}
πŸ”„ **Reconstructed Text** ({len(full_reconstruction)} chars):
{full_reconstruction}
βœ… **Match Rate**: {accuracy:.1f}% ({matches}/{len(text)} characters)
πŸ“ **Note**: Reconstruction quality may decrease for texts > 48 bytes due to sliding window processing."""
return stats, details, full_reconstruction
except Exception as e:
return f"Error: {str(e)}", "", ""
# Initialize
tokenizer = B2NLTokenizer()
# Gradio Interface
with gr.Blocks(title="B2NL v6.2.1", theme=gr.themes.Soft()) as app:
gr.Markdown("""
# πŸš€ B2NL-IntelligentTokenizer v6.2.1
## πŸ“– What is this model?
**B2NL (Byte-to-Natural Language)** is a progressive tokenizer that converts raw text into compressed semantic embeddings.
Unlike traditional tokenizers that use fixed vocabularies, B2NL learns directly from bytes and generates dense embeddings
that capture semantic meaning while achieving 16:1 compression.
### πŸ”¬ How the 16:1 Compression Works
```
Input: 48 bytes (including padding/special tokens)
↓
Processing: Byte-level analysis with learned boundaries
↓
Output: 3 embedding vectors (1280-dim each)
```
**Key Innovation**: The model learns to identify **semantic boundaries** within the 48-byte window.
Instead of splitting at arbitrary points, it discovers natural language units (words, morphemes, phrases)
and encodes them into meaningful embeddings. This is why "Hello, world!" (13 bytes) still generates
3 embeddings - the model pads to 48 bytes but learns which parts contain actual information.
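In other words, the number of embeddings depends only on the input's byte length. A minimal sketch of that bookkeeping (it mirrors the chunk arithmetic this demo uses; the model itself is not needed):
```
import math

def embeddings_for(text):
    # One 48-byte window -> 3 embeddings; each extra 40 bytes adds another chunk (8-byte overlap)
    n_bytes = len(text.encode("utf-8"))
    chunks = 1 if n_bytes <= 48 else 1 + math.ceil((n_bytes - 48) / 40)
    return chunks * 3

print(embeddings_for("Hello, world!"))  # 3 (13 bytes, padded to one 48-byte window)
```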
### 🎯 Why This Matters
1. **Semantic Preservation**: Unlike byte-pair encoding (BPE) which can split words arbitrarily,
B2NL respects semantic boundaries learned from data.
2. **Language Agnostic**: No vocabulary needed - works equally well for all 204 languages.
Korean "μ•ˆλ…•ν•˜μ„Έμš”" and English "Hello" are processed the same way.
3. **Predictable Costs**: Always 16:1 compression means predictable API costs for LLMs.
48 bytes β†’ 3 embeddings, always.
4. **Inter-modal Bridge**: These embeddings can be used as a universal representation
for cross-modal tasks (text→image, text→audio, etc.)
### 🎯 Real-World Applications
- **LLM Cost Reduction**: 75% fewer tokens = 75% cost savings on API calls
- **Multilingual Search**: Single embedding space for 204 languages
- **Edge AI**: Compressed representations for bandwidth-limited IoT devices
- **Cross-modal AI**: Universal embeddings for multimodal models
### βš™οΈ Technical Architecture
- **Encoder**: 6 layers, progressive dimension reduction
- **Decoder**: 6 layers with cross-attention, reconstructs from embeddings
- **Boundary Learning**: Gumbel-Softmax for differentiable boundary detection (see the sketch after this list)
- **Total Parameters**: 244.7M (137.9M encoder + 106.8M decoder)
- **Training**: FLORES-200 (204 languages), 100 epochs, teacher forcing
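The boundary-learning step can be illustrated in a few lines of PyTorch. This is only an illustrative sketch (the linear scorer and tensor sizes here are assumptions, not the model's actual code): Gumbel-Softmax lets the encoder make hard per-byte boundary decisions while keeping gradients for training.
```
import torch
import torch.nn.functional as F

# Illustrative sketch, not the model's real implementation:
# score each of the 48 byte positions as boundary / not-boundary,
# then draw a hard but differentiable decision via Gumbel-Softmax.
features = torch.randn(1, 48, 1280)         # hypothetical per-byte features for one window
scorer = torch.nn.Linear(1280, 2)           # assumed boundary scorer
logits = scorer(features)                   # shape (1, 48, 2)
decisions = F.gumbel_softmax(logits, tau=1.0, hard=True)
boundaries = decisions[..., 1]              # 1.0 where a semantic boundary is chosen
print(int(boundaries.sum()), "boundaries in this 48-byte window")
```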
### ⚠️ Current Limitations
- **Mode**: Autoregressive (teacher forcing only) - ~500ms per generation
- **Long Texts**: Quality decreases for texts > 48 bytes (sliding window limitation)
- **Coming Soon**: Non-autoregressive training (November 2025) for 10x speedup
---
""")
with gr.Tab("πŸ”„ Reconstruction Test"):
gr.Markdown("""
Test how well the model compresses and reconstructs text. The model processes text in 48-byte chunks,
generating 3 embedding vectors per chunk. For longer texts, it uses a sliding window with 8-byte overlap.
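Behind the button, this Space simply calls its `process_text` helper, which wraps the model's `generate` method. A simplified view of that call path (`tokenizer` is the `B2NLTokenizer` instance defined in this file):
```
# Simplified from this file: compress and reconstruct one input
stats, details, reconstructed = tokenizer.process_text(
    "Hello, world! How are you today?", temperature=0.1
)
# process_text internally calls model.generate(text, temperature=0.1, max_length=...)
print(reconstructed)
```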
""")
with gr.Row():
with gr.Column():
input_text = gr.Textbox(
label="Input Text",
placeholder="Enter any text in any of 204 languages...",
lines=5
)
gr.Examples(
examples=[
# Major languages
"Hello, world! How are you today?",
"μ•ˆλ…•ν•˜μ„Έμš”, λ°˜κ°‘μŠ΅λ‹ˆλ‹€. 였늘 날씨가 μ’‹λ„€μš”.",
"δ½ ε₯½δΈ–η•ŒοΌδ»Šε€©ε€©ζ°”εΎˆε₯½γ€‚",
"γ“γ‚“γ«γ‘γ―δΈ–η•ŒοΌδ»Šζ—₯はいい倩気ですね。",
"Bonjour le monde! Comment allez-vous?",
"Hola mundo! ΒΏCΓ³mo estΓ‘s hoy?",
"ΠŸΡ€ΠΈΠ²Π΅Ρ‚ ΠΌΠΈΡ€! Как Π΄Π΅Π»Π°?",
"Ω…Ψ±Ψ­Ψ¨Ψ§ Ψ¨Ψ§Ω„ΨΉΨ§Ω„Ω…! ΩƒΩŠΩ Ψ­Ψ§Ω„Ωƒ Ψ§Ω„ΩŠΩˆΩ…ΨŸ",
# Test different lengths
"Short", # 5 bytes - 1 chunk, 3 embeddings
"This is exactly 48 bytes of text for one chunk!", # 48 bytes - 1 chunk, 3 embeddings
"This is a longer text that exceeds 48 bytes and will need multiple chunks with sliding window processing.", # >48 bytes - multiple chunks
],
inputs=input_text,
label="Example texts (various lengths and languages)"
)
temperature = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.1,
step=0.1,
label="Temperature (0.1 = Most accurate, 1.0 = More creative)"
)
process_btn = gr.Button("πŸ”„ Compress & Reconstruct", variant="primary", size="lg")
with gr.Column():
stats_output = gr.Markdown(label="Statistics")
details_output = gr.Markdown(label="Details")
with gr.Tab("πŸ“Š Batch Test"):
gr.Markdown("""
Test multiple texts at once to compare compression across different languages and lengths.
Each text is processed independently, showing how the fixed 16:1 compression works across languages.
""")
batch_input = gr.Textbox(
label="Enter multiple texts (one per line)",
placeholder="Enter texts in different languages...\nOne text per line",
lines=10,
value="""The quick brown fox jumps over the lazy dog.
μ•ˆλ…•ν•˜μ„Έμš”, λ°˜κ°‘μŠ΅λ‹ˆλ‹€. 였늘 날씨가 정말 μ’‹λ„€μš”.
δ½ ε₯½δΈ–η•ŒοΌδ»Šε€©ε€©ζ°”εΎˆε₯½οΌŒζˆ‘δ»¬δΈ€θ΅·εŽ»ζ•£ζ­₯吧。
γ“γ‚“γ«γ‘γ―δΈ–η•ŒοΌδ»Šζ—₯γ―γ„γ„ε€©ζ°—γ§γ™γ­γ€‚ζ•£ζ­©γ«θ‘ŒγγΎγ—γ‚‡γ†γ€‚
Bonjour le monde! Comment allez-vous aujourd'hui?
Ω…Ψ±Ψ­Ψ¨Ψ§ Ψ¨Ψ§Ω„ΨΉΨ§Ω„Ω…! ΩƒΩŠΩ Ψ­Ψ§Ω„Ωƒ Ψ§Ω„ΩŠΩˆΩ…ΨŸ Ψ§Ω„Ψ·Ω‚Ψ³ Ψ¬Ω…ΩŠΩ„ Ψ¬Ψ―Ψ§Ω‹.
ΠŸΡ€ΠΈΠ²Π΅Ρ‚ ΠΌΠΈΡ€! Как Π΄Π΅Π»Π°? Погода сСгодня прСкрасная!
This text is exactly 48 bytes long for testing!!
Short text
A much longer text that definitely exceeds 48 bytes and will require sliding window processing with 8-byte overlaps between chunks."""
)
batch_btn = gr.Button("πŸ”„ Process Batch", variant="primary")
batch_output = gr.Dataframe(
headers=["Text", "Language", "Bytes", "Chunks", "Embeddings", "Accuracy"],
label="Batch Results"
)
with gr.Tab("πŸ“– Documentation"):
gr.Markdown("""
## Understanding B2NL Tokenization
### πŸ”¬ The Core Innovation: Learned Semantic Boundaries
Traditional tokenizers use fixed rules (BPE, WordPiece) that can split words arbitrarily.
B2NL learns to identify **semantic units** within byte sequences:
```
Traditional BPE: "μ•ˆλ…•ν•˜μ„Έμš”" β†’ "μ•ˆ", "λ…•", "ν•˜", "μ„Έ", "μš”" (5 tokens)
B2NL: "μ•ˆλ…•ν•˜μ„Έμš”" β†’ [emb1, emb2, emb3] (3 embeddings capturing full meaning)
```
### πŸ“ The 48-Byte β†’ 3 Embeddings Architecture
```
[48 bytes input]  →  [Encoder]  →  [3 × 1280-dim embeddings]  →  [Decoder]  →  [48 bytes output]
 (with padding)                       (semantic compression)
```
**Why 48 bytes?**
- Optimal for GPU parallelization (divisible by 8, 16, 24)
- Captures most words/phrases in any language
- Allows consistent 16:1 compression ratio
**Why 3 embeddings?**
- Matches typical semantic units in 48-byte window
- Provides redundancy for robust reconstruction
- Optimal for transformer cross-attention
### 🌐 Language-Agnostic Processing
The model treats all languages equally at the byte level:
| Language | Sample Text | Bytes | Embeddings | Compression |
|----------|------------|-------|------------|-------------|
| English | "Hello" | 5 (+43 pad) | 3 | 16:1 |
| Korean | "μ•ˆλ…•ν•˜μ„Έμš”" | 15 (+33 pad) | 3 | 16:1 |
| Chinese | "δ½ ε₯½δΈ–η•Œ" | 12 (+36 pad) | 3 | 16:1 |
| Arabic | "Ω…Ψ±Ψ­Ψ¨Ψ§" | 10 (+38 pad) | 3 | 16:1 |
All get compressed to 3 embeddings, but the model learns which parts contain information.
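The byte counts above can be verified with plain Python; only `str.encode` is needed:
```
for sample in ["Hello", "안녕하세요", "你好世界", "مرحبا"]:
    n = len(sample.encode("utf-8"))
    print(f"{sample}: {n} bytes + {48 - n} bytes padding -> 3 embeddings")
```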
### πŸ”„ Sliding Window for Long Texts
For texts exceeding 48 bytes:
```
Text: "This is a very long sentence that exceeds 48 bytes..."
Chunk 1: [Bytes 0-47] β†’ 3 embeddings
↓ (8-byte overlap)
Chunk 2: [Bytes 40-87] β†’ 3 embeddings
↓ (8-byte overlap)
Chunk 3: [Bytes 80-127] β†’ 3 embeddings
```
The 8-byte overlap preserves context across boundaries, preventing word splits.
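That layout follows from a 48-byte window and a 40-byte stride, so each new chunk re-reads the last 8 bytes of the previous one. A standard-library sketch of the same layout (end offsets are exclusive):
```
def chunk_spans(n_bytes, window=48, overlap=8):
    # Yield (start, end) byte offsets for each sliding-window chunk
    stride = window - overlap  # 40 new bytes per additional chunk
    start = 0
    while True:
        yield start, min(start + window, n_bytes)
        if start + window >= n_bytes:
            break
        start += stride

print(list(chunk_spans(128)))  # [(0, 48), (40, 88), (80, 128)]
```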
### Current Limitations
1. **Speed**: ~500ms per generation (autoregressive mode)
2. **Long Texts**: Quality decreases with multiple chunks
3. **Training**: Only teacher forcing, no autoregressive training yet
### Upcoming Improvements (November 2025)
- **Non-autoregressive training**: 10x speed improvement
- **Better long text handling**: Improved sliding window
- **Streaming support**: Real-time processing
---
**Author**: Jinhyun Woo
**Paper**: [Zenodo](https://zenodo.org/records/17116281)
**GitHub**: [Woojiggun/intelligent-tokenizer](https://github.com/Woojiggun/intelligent-tokenizer)
**Model**: [ggunio/B2NL-IntelligentTokenizer-v6.2.1](https://huggingface.co/ggunio/B2NL-IntelligentTokenizer-v6.2.1)
""")
# Connect functions
process_btn.click(
        fn=lambda text, temp: tokenizer.process_text(text, temp)[:2],  # only stats and details are displayed
inputs=[input_text, temperature],
outputs=[stats_output, details_output]
)
def process_batch(texts):
if not texts:
return []
results = []
for text in texts.strip().split('\n'):
if not text.strip():
continue
# Process each text
text = text.strip()
text_bytes = len(text.encode('utf-8'))
# Calculate chunks and embeddings
if text_bytes <= 48:
num_chunks = 1
num_embeddings = 3
else:
num_chunks = 1 + math.ceil((text_bytes - 48) / 40)
num_embeddings = num_chunks * 3
# Get reconstruction
stats, details, reconstructed = tokenizer.process_text(text, 0.1)
# Detect language (simple heuristic)
if any(ord(c) >= 0x3040 and ord(c) <= 0x309F for c in text):
lang = "Japanese"
elif any(ord(c) >= 0xAC00 and ord(c) <= 0xD7AF for c in text):
lang = "Korean"
elif any(ord(c) >= 0x4E00 and ord(c) <= 0x9FFF for c in text):
lang = "Chinese"
elif any(ord(c) >= 0x0600 and ord(c) <= 0x06FF for c in text):
lang = "Arabic"
elif any(ord(c) >= 0x0400 and ord(c) <= 0x04FF for c in text):
lang = "Russian"
else:
lang = "English/Latin"
            # Calculate accuracy (defaults to 0% if processing failed)
            accuracy = 0.0
            if "Error" not in stats:
min_len = min(len(text), len(reconstructed))
matches = sum(1 for i in range(min_len) if text[i] == reconstructed[i])
accuracy = (matches / len(text)) * 100 if text else 0
results.append([
text[:50] + "..." if len(text) > 50 else text,
lang,
text_bytes,
num_chunks,
num_embeddings,
f"{accuracy:.1f}%"
])
return results
batch_btn.click(
fn=process_batch,
inputs=batch_input,
outputs=batch_output
)
if __name__ == "__main__":
app.launch()