"""
B2NL-IntelligentTokenizer v6.2.1 - Progressive Byte-to-Natural Language Tokenizer
⚠️ IMPORTANT: Currently in AUTOREGRESSIVE MODE (Teacher Forcing Training)
- Current: ~500ms inference (accurate but slow)
- Coming Soon (November 2025): Non-autoregressive training (<50ms, 10x faster)
🚀 Purpose: Embedding Preprocessing Model for Inter-modal Communication
This model serves as a preprocessing layer that converts raw text into compressed
semantic embeddings, enabling efficient inter-modal communication between different
AI systems. By separating language understanding from task-specific inference,
it provides a universal representation layer for multi-modal AI applications.
Key Features:
- Fixed 16:1 compression ratio (48 bytes → 3 embeddings per chunk)
- Byte-level processing (no vocabulary required)
- Support for 204 languages via FLORES-200 training
- Sliding window for texts > 48 bytes
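Usage sketch (illustrative; mirrors how the Gradio app below calls the class):

    tok = B2NLTokenizer()   # loads the checkpoint on init
    stats, details, restored = tok.process_text("Hello, world!", temperature=0.1)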
"""
import gradio as gr
import torch
import sys
import io
import time
import math
from pathlib import Path
# Fix Windows Unicode
if sys.platform == 'win32':
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
# Add paths
sys.path.insert(0, 'core')
from unified_model import IntelligentTokenizerV62
from tokenizer import ByteTokenizerV62
class B2NLTokenizer:
def __init__(self):
self.model = None
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.load_model()
def load_model(self):
"""Load model from HuggingFace or local"""
try:
# Try HuggingFace first
from huggingface_hub import hf_hub_download
checkpoint_path = hf_hub_download(
repo_id="ggunio/B2NL-IntelligentTokenizer-v6.2.1",
filename="pytorch_model.bin"
)
print(f"Loading from HuggingFace")
except:
# Try local paths
checkpoint_paths = [
"pytorch_model.bin",
"checkpoints/v62/16.0/epoch_100.pt",
"D:/intelligent-tokenizer/intelligent-tokenizer_v6.2.1/checkpoints/v62/16.0/epoch_100.pt"
]
checkpoint_path = None
for path in checkpoint_paths:
if Path(path).exists():
checkpoint_path = path
break
if not checkpoint_path:
print("❌ Model not found")
return
# Load model
self.model = IntelligentTokenizerV62()
checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False)
if 'model_state_dict' in checkpoint:
self.model.load_state_dict(checkpoint['model_state_dict'])
else:
self.model.load_state_dict(checkpoint)
self.model = self.model.to(self.device)
self.model.eval()
print(f"✅ Model loaded on {self.device}")
    def process_text(self, text, temperature=0.1):
        """Process text and return compression stats, details, and the reconstruction."""
        if not self.model:
            return "Model not loaded", "", ""
        if not text:
            return "Please enter text", "", ""
try:
start_time = time.time()
# Calculate chunks and embeddings
text_bytes = len(text.encode('utf-8'))
# For texts > 48 bytes: sliding window with 8-byte overlap
if text_bytes <= 48:
num_chunks = 1
num_embeddings = 3 # 1 chunk = 3 embeddings
else:
# Sliding window: first chunk 48 bytes, then slide by 40 bytes (8 overlap)
num_chunks = 1 + math.ceil((text_bytes - 48) / 40)
num_embeddings = num_chunks * 3
# Reconstruct (full text, not truncated)
with torch.no_grad():
# Calculate appropriate max_length based on input
max_gen_length = max(48, min(len(text) + 10, 512)) # Allow some extra space
reconstructed = self.model.generate(text, temperature=temperature, max_length=max_gen_length)
            # Note: for texts > 48 bytes, the sliding window may not fully
            # reconstruct the input (current model limitation)
            full_reconstruction = reconstructed
elapsed_time = (time.time() - start_time) * 1000
# Calculate accuracy
min_len = min(len(text), len(full_reconstruction))
matches = sum(1 for i in range(min_len) if text[i] == full_reconstruction[i])
accuracy = (matches / len(text)) * 100 if text else 0
# Format results
stats = f"""📊 **Compression Statistics**
• Input: {text_bytes} bytes ({len(text)} chars)
• Chunks: {num_chunks} chunk{"s" if num_chunks > 1 else ""} (48-byte chunks with 8-byte overlap for long texts)
• Embeddings generated: {num_embeddings} embedding vectors (3 per chunk)
• Compression ratio: 16:1 fixed (48 bytes → 3 embeddings)
• Processing time: {elapsed_time:.1f}ms (autoregressive mode - slow)
• Reconstruction accuracy: {accuracy:.1f}%
⚠️ **Current Mode**: Autoregressive (Teacher Forcing training only)
• Speed: ~500ms per generation
• Coming: Non-autoregressive training (10x faster)"""
details = f"""🔤 **Original Text** ({len(text)} chars, {text_bytes} bytes):
{text}
🔄 **Reconstructed Text** ({len(full_reconstruction)} chars):
{full_reconstruction}
✅ **Match Rate**: {accuracy:.1f}% ({matches}/{len(text)} characters)
📝 **Note**: Reconstruction quality may decrease for texts > 48 bytes due to sliding window processing."""
return stats, details, full_reconstruction
except Exception as e:
return f"Error: {str(e)}", "", ""
# Initialize
tokenizer = B2NLTokenizer()
# Gradio Interface
with gr.Blocks(title="B2NL v6.2.1", theme=gr.themes.Soft()) as app:
gr.Markdown("""
# 🚀 B2NL-IntelligentTokenizer v6.2.1
## 📖 What is this model?
**B2NL (Byte-to-Natural Language)** is a progressive tokenizer that converts raw text into compressed semantic embeddings.
Unlike traditional tokenizers that use fixed vocabularies, B2NL learns directly from bytes and generates dense embeddings
that capture semantic meaning while achieving 16:1 compression.
### 🔬 How the 16:1 Compression Works
```
Input: 48 bytes (including padding/special tokens)
↓
Processing: Byte-level analysis with learned boundaries
↓
Output: 3 embedding vectors (1280-dim each)
```
**Key Innovation**: The model learns to identify **semantic boundaries** within the 48-byte window.
Instead of splitting at arbitrary points, it discovers natural language units (words, morphemes, phrases)
and encodes them into meaningful embeddings. This is why "Hello, world!" (13 bytes) still generates
3 embeddings - the model pads to 48 bytes but learns which parts contain actual information.
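The chunk/embedding arithmetic is simple enough to check by hand. A minimal sketch (it mirrors the chunk math this demo app itself uses; the model's real padding and special tokens are not shown):

```python
import math

def embedding_count(text: str) -> int:
    """Embeddings produced under the fixed 16:1 scheme (3 per 48-byte chunk)."""
    n = len(text.encode("utf-8"))
    chunks = 1 if n <= 48 else 1 + math.ceil((n - 48) / 40)  # stride 40 = 48 - 8 overlap
    return chunks * 3

print(embedding_count("Hello, world!"))  # 13 bytes -> 1 chunk -> 3 embeddings
```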
### 🎯 Why This Matters
1. **Semantic Preservation**: Unlike byte-pair encoding (BPE) which can split words arbitrarily,
B2NL respects semantic boundaries learned from data.
2. **Language Agnostic**: No vocabulary needed - works equally well for all 204 languages.
Korean "안녕하세요" and English "Hello" are processed the same way.
3. **Predictable Costs**: Always 16:1 compression means predictable API costs for LLMs.
48 bytes → 3 embeddings, always.
4. **Inter-modal Bridge**: These embeddings can be used as a universal representation
for cross-modal tasks (text→image, text→audio, etc.)
### 🎯 Real-World Applications
- **LLM Cost Reduction**: 75% fewer tokens = 75% cost savings on API calls
- **Multilingual Search**: Single embedding space for 204 languages
- **Edge AI**: Compressed representations for bandwidth-limited IoT devices
- **Cross-modal AI**: Universal embeddings for multimodal models
### ⚙️ Technical Architecture
- **Encoder**: 6 layers, progressive dimension reduction
- **Decoder**: 6 layers with cross-attention, reconstructs from embeddings
- **Boundary Learning**: Gumbel-Softmax for differentiable boundary detection
- **Total Parameters**: 244.7M (137.9M encoder + 106.8M decoder)
- **Training**: FLORES-200 (204 languages), 100 epochs, teacher forcing
### ⚠️ Current Limitations
- **Mode**: Autoregressive (teacher forcing only) - ~500ms per generation
- **Long Texts**: Quality decreases for texts > 48 bytes (sliding window limitation)
- **Coming Soon**: Non-autoregressive training (November 2025) for 10x speedup
---
""")
with gr.Tab("🔄 Reconstruction Test"):
gr.Markdown("""
Test how well the model compresses and reconstructs text. The model processes text in 48-byte chunks,
generating 3 embedding vectors per chunk. For longer texts, it uses a sliding window with 8-byte overlap.
""")
with gr.Row():
with gr.Column():
input_text = gr.Textbox(
label="Input Text",
placeholder="Enter any text in any of 204 languages...",
lines=5
)
gr.Examples(
examples=[
# Major languages
"Hello, world! How are you today?",
"안녕하세요, 반갑습니다. 오늘 날씨가 좋네요.",
"你好世界!今天天气很好。",
"こんにちは世界!今日はいい天気ですね。",
"Bonjour le monde! Comment allez-vous?",
"Hola mundo! ¿Cómo estás hoy?",
"Привет мир! Как дела?",
"مرحبا بالعالم! كيف حالك اليوم؟",
# Test different lengths
"Short", # 5 bytes - 1 chunk, 3 embeddings
"This is exactly 48 bytes of text for one chunk!", # 48 bytes - 1 chunk, 3 embeddings
"This is a longer text that exceeds 48 bytes and will need multiple chunks with sliding window processing.", # >48 bytes - multiple chunks
],
inputs=input_text,
label="Example texts (various lengths and languages)"
)
temperature = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.1,
step=0.1,
label="Temperature (0.1 = Most accurate, 1.0 = More creative)"
)
process_btn = gr.Button("🔄 Compress & Reconstruct", variant="primary", size="lg")
with gr.Column():
stats_output = gr.Markdown(label="Statistics")
details_output = gr.Markdown(label="Details")
with gr.Tab("📊 Batch Test"):
gr.Markdown("""
Test multiple texts at once to compare compression across different languages and lengths.
Each text is processed independently, showing how the fixed 16:1 compression works across languages.
""")
batch_input = gr.Textbox(
label="Enter multiple texts (one per line)",
placeholder="Enter texts in different languages...\nOne text per line",
lines=10,
value="""The quick brown fox jumps over the lazy dog.
안녕하세요, 반갑습니다. 오늘 날씨가 정말 좋네요.
你好世界!今天天气很好,我们一起去散步吧。
こんにちは世界!今日はいい天気ですね。散歩に行きましょう。
Bonjour le monde! Comment allez-vous aujourd'hui?
مرحبا بالعالم! كيف حالك اليوم؟ الطقس جميل جداً.
Привет мир! Как дела? Погода сегодня прекрасная!
This text is exactly 48 bytes long for testing!!
Short text
A much longer text that definitely exceeds 48 bytes and will require sliding window processing with 8-byte overlaps between chunks."""
)
batch_btn = gr.Button("🔄 Process Batch", variant="primary")
batch_output = gr.Dataframe(
headers=["Text", "Language", "Bytes", "Chunks", "Embeddings", "Accuracy"],
label="Batch Results"
)
with gr.Tab("📖 Documentation"):
gr.Markdown("""
## Understanding B2NL Tokenization
### 🔬 The Core Innovation: Learned Semantic Boundaries
Traditional tokenizers use fixed rules (BPE, WordPiece) that can split words arbitrarily.
B2NL learns to identify **semantic units** within byte sequences:
```
Traditional BPE: "안녕하세요" → "안", "녕", "하", "세", "요" (5 tokens)
B2NL: "안녕하세요" → [emb1, emb2, emb3] (3 embeddings capturing full meaning)
```
### 📐 The 48-Byte → 3 Embeddings Architecture
```
[48 bytes input] → [Encoder] → [3 × 1280-dim embeddings] → [Decoder] → [48 bytes output]
  (with padding)                 (semantic compression)
```
**Why 48 bytes?**
- Optimal for GPU parallelization (divisible by 8, 16, 24)
- Captures most words/phrases in any language
- Allows consistent 16:1 compression ratio
**Why 3 embeddings?**
- Matches typical semantic units in 48-byte window
- Provides redundancy for robust reconstruction
- Optimal for transformer cross-attention
### 🌐 Language-Agnostic Processing
The model treats all languages equally at the byte level:
| Language | Sample Text | Bytes | Embeddings | Compression |
|----------|------------|-------|------------|-------------|
| English | "Hello" | 5 (+43 pad) | 3 | 16:1 |
| Korean | "안녕하세요" | 15 (+33 pad) | 3 | 16:1 |
| Chinese | "你好世界" | 12 (+36 pad) | 3 | 16:1 |
| Arabic | "مرحبا" | 10 (+38 pad) | 3 | 16:1 |
All get compressed to 3 embeddings, but the model learns which parts contain information.
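You can verify the byte counts in the table directly (plain Python, no model required):

```python
for s in ["Hello", "안녕하세요", "你好世界", "مرحبا"]:
    print(f"{s!r}: {len(s.encode('utf-8'))} bytes -> padded to 48 -> 3 embeddings")
```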
### 🔄 Sliding Window for Long Texts
For texts exceeding 48 bytes:
```
Text: "This is a very long sentence that exceeds 48 bytes..."
Chunk 1: [Bytes 0-47] → 3 embeddings
↓ (8-byte overlap)
Chunk 2: [Bytes 40-87] → 3 embeddings
↓ (8-byte overlap)
Chunk 3: [Bytes 80-127] → 3 embeddings
```
The 8-byte overlap preserves context across boundaries, preventing word splits.
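A small sketch of how those byte ranges fall out (illustrative; assumes stride = window - overlap, matching the diagram above; offsets are end-exclusive):

```python
def chunk_spans(n_bytes: int, window: int = 48, overlap: int = 8):
    """Start/end byte offsets covered by each sliding-window chunk."""
    stride = window - overlap  # 40
    spans, start = [], 0
    while True:
        spans.append((start, min(start + window, n_bytes)))
        if start + window >= n_bytes:
            break
        start += stride
    return spans

print(chunk_spans(128))  # [(0, 48), (40, 88), (80, 128)] -> 3 chunks, 9 embeddings
```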
### Current Limitations
1. **Speed**: ~500ms per generation (autoregressive mode)
2. **Long Texts**: Quality decreases with multiple chunks
3. **Training**: Only teacher forcing, no autoregressive training yet
### Upcoming Improvements (November 2025)
- **Non-autoregressive training**: 10x speed improvement
- **Better long text handling**: Improved sliding window
- **Streaming support**: Real-time processing
---
**Author**: Jinhyun Woo
**Paper**: [Zenodo](https://zenodo.org/records/17116281)
**GitHub**: [Woojiggun/intelligent-tokenizer](https://github.com/Woojiggun/intelligent-tokenizer)
**Model**: [ggunio/B2NL-IntelligentTokenizer-v6.2.1](https://huggingface.co/ggunio/B2NL-IntelligentTokenizer-v6.2.1)
""")
# Connect functions
    # process_text returns (stats, details, reconstruction); this tab shows the first two
    process_btn.click(
        fn=lambda text, temp: tokenizer.process_text(text, temp)[:2],
        inputs=[input_text, temperature],
        outputs=[stats_output, details_output]
    )
def process_batch(texts):
if not texts:
return []
results = []
for text in texts.strip().split('\n'):
if not text.strip():
continue
# Process each text
text = text.strip()
text_bytes = len(text.encode('utf-8'))
# Calculate chunks and embeddings
if text_bytes <= 48:
num_chunks = 1
num_embeddings = 3
else:
num_chunks = 1 + math.ceil((text_bytes - 48) / 40)
num_embeddings = num_chunks * 3
# Get reconstruction
stats, details, reconstructed = tokenizer.process_text(text, 0.1)
            # Detect language via Unicode block heuristic (first matching block wins)
            if any(0x3040 <= ord(c) <= 0x309F for c in text):    # Hiragana (Katakana/Kanji not checked)
                lang = "Japanese"
            elif any(0xAC00 <= ord(c) <= 0xD7AF for c in text):  # Hangul Syllables
                lang = "Korean"
            elif any(0x4E00 <= ord(c) <= 0x9FFF for c in text):  # CJK Unified Ideographs
                lang = "Chinese"
            elif any(0x0600 <= ord(c) <= 0x06FF for c in text):  # Arabic
                lang = "Arabic"
            elif any(0x0400 <= ord(c) <= 0x04FF for c in text):  # Cyrillic
                lang = "Russian"
            else:
                lang = "English/Latin"
            # Calculate accuracy (0 if processing failed)
            if "Error" not in stats:
                min_len = min(len(text), len(reconstructed))
                matches = sum(1 for i in range(min_len) if text[i] == reconstructed[i])
                accuracy = (matches / len(text)) * 100 if text else 0
            else:
                accuracy = 0.0
results.append([
text[:50] + "..." if len(text) > 50 else text,
lang,
text_bytes,
num_chunks,
num_embeddings,
f"{accuracy:.1f}%"
])
return results
batch_btn.click(
fn=process_batch,
inputs=batch_input,
outputs=batch_output
)
if __name__ == "__main__":
app.launch() |