|
|
""" |
|
|
B2NL-IntelligentTokenizer v6.2.1 - Progressive Byte-to-Natural Language Tokenizer |
|
|
|
|
|
⚠️ IMPORTANT: Currently in AUTOREGRESSIVE MODE (Teacher Forcing Training)
|
|
- Current: ~500ms inference (accurate but slow) |
|
|
- Coming Soon (November 2025): Non-autoregressive training (<50ms, 10x faster) |
|
|
|
|
|
Purpose: Embedding Preprocessing Model for Inter-modal Communication
|
|
This model serves as a preprocessing layer that converts raw text into compressed |
|
|
semantic embeddings, enabling efficient inter-modal communication between different |
|
|
AI systems. By separating language understanding from task-specific inference, |
|
|
it provides a universal representation layer for multi-modal AI applications. |
|
|
|
|
|
Key Features: |
|
|
- Fixed 16:1 compression ratio (48 bytes → 3 embeddings per chunk)
|
|
- Byte-level processing (no vocabulary required) |
|
|
- 204 language support via FLORES-200 training |
|
|
- Sliding window for texts > 48 bytes |
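
Usage sketch (illustrative; assumes the checkpoint resolved by load_model() below is available):

    tokenizer = B2NLTokenizer()
    stats, details, reconstructed = tokenizer.process_text("Hello, world!", temperature=0.1)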
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import torch |
|
|
import sys |
|
|
import io |
|
|
import time |
|
|
import math |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
# Ensure stdout/stderr can emit UTF-8 on Windows consoles
if sys.platform == 'win32':
|
|
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') |
|
|
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') |
|
|
|
|
|
|
|
|
sys.path.insert(0, 'core')  # local modules providing IntelligentTokenizerV62 and ByteTokenizerV62
|
|
|
|
|
from unified_model import IntelligentTokenizerV62 |
|
|
from tokenizer import ByteTokenizerV62 |
|
|
|
|
|
class B2NLTokenizer: |
|
|
def __init__(self): |
|
|
self.model = None |
|
|
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') |
|
|
self.load_model() |
|
|
|
|
|
def load_model(self): |
|
|
"""Load model from HuggingFace or local""" |
|
|
try: |
|
|
|
|
|
from huggingface_hub import hf_hub_download |
|
|
checkpoint_path = hf_hub_download( |
|
|
repo_id="ggunio/B2NL-IntelligentTokenizer-v6.2.1", |
|
|
filename="pytorch_model.bin" |
|
|
) |
|
|
print(f"Loading from HuggingFace") |
|
|
        except Exception:
|
|
|
|
|
checkpoint_paths = [ |
|
|
"pytorch_model.bin", |
|
|
"checkpoints/v62/16.0/epoch_100.pt", |
|
|
"D:/intelligent-tokenizer/intelligent-tokenizer_v6.2.1/checkpoints/v62/16.0/epoch_100.pt" |
|
|
] |
|
|
checkpoint_path = None |
|
|
for path in checkpoint_paths: |
|
|
if Path(path).exists(): |
|
|
checkpoint_path = path |
|
|
break |
|
|
|
|
|
if not checkpoint_path: |
|
|
print("β Model not found") |
|
|
return |
|
|
|
|
|
|
|
|
self.model = IntelligentTokenizerV62() |
|
|
checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False) |
|
|
|
|
|
if 'model_state_dict' in checkpoint: |
|
|
self.model.load_state_dict(checkpoint['model_state_dict']) |
|
|
else: |
|
|
self.model.load_state_dict(checkpoint) |
|
|
|
|
|
self.model = self.model.to(self.device) |
|
|
self.model.eval() |
|
|
print(f"β
Model loaded on {self.device}") |
|
|
|
|
|
def process_text(self, text, temperature=0.1): |
|
|
"""Process text and return detailed results""" |
|
|
if not self.model or not text: |
|
|
return "Please enter text", "", "" |
|
|
|
|
|
try: |
|
|
start_time = time.time() |
|
|
|
|
|
|
|
|
text_bytes = len(text.encode('utf-8')) |
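            # Chunking scheme (mirrored in the stats shown below): the first chunk covers 48 bytes,
            # and each further chunk advances by 40 bytes (48-byte window, 8-byte overlap).
            # Worked example: 130 bytes -> 1 + ceil((130 - 48) / 40) = 4 chunks -> 12 embeddings.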
|
|
|
|
|
|
|
|
if text_bytes <= 48: |
|
|
num_chunks = 1 |
|
|
num_embeddings = 3 |
|
|
else: |
|
|
|
|
|
num_chunks = 1 + math.ceil((text_bytes - 48) / 40) |
|
|
num_embeddings = num_chunks * 3 |
|
|
|
|
|
|
|
|
with torch.no_grad(): |
|
|
|
|
|
                max_gen_length = max(48, min(len(text) + 10, 512))  # at least one full chunk, small margin over the input, hard cap at 512
|
|
|
|
|
reconstructed = self.model.generate(text, temperature=temperature, max_length=max_gen_length) |
|
|
|
|
|
|
|
|
            # Single-chunk and multi-chunk (sliding window) inputs take the same path here
            full_reconstruction = reconstructed
|
|
|
|
|
elapsed_time = (time.time() - start_time) * 1000 |
|
|
|
|
|
|
|
|
min_len = min(len(text), len(full_reconstruction)) |
|
|
matches = sum(1 for i in range(min_len) if text[i] == full_reconstruction[i]) |
|
|
accuracy = (matches / len(text)) * 100 if text else 0 |
|
|
|
|
|
|
|
|
stats = f"""π **Compression Statistics** |
|
|
β’ Input: {text_bytes} bytes ({len(text)} chars) |
|
|
β’ Chunks: {num_chunks} chunk{"s" if num_chunks > 1 else ""} (48-byte chunks with 8-byte overlap for long texts) |
|
|
β’ Embeddings generated: {num_embeddings} embedding vectors (3 per chunk) |
|
|
β’ Compression ratio: 16:1 fixed (48 bytes β 3 embeddings) |
|
|
β’ Processing time: {elapsed_time:.1f}ms (autoregressive mode - slow) |
|
|
β’ Reconstruction accuracy: {accuracy:.1f}% |
|
|
|
|
|
β οΈ **Current Mode**: Autoregressive (Teacher Forcing training only) |
|
|
β’ Speed: ~500ms per generation |
|
|
β’ Coming: Non-autoregressive training (10x faster)""" |
|
|
|
|
|
details = f"""π€ **Original Text** ({len(text)} chars, {text_bytes} bytes): |
|
|
{text} |
|
|
|
|
|
π **Reconstructed Text** ({len(full_reconstruction)} chars): |
|
|
{full_reconstruction} |
|
|
|
|
|
β
**Match Rate**: {accuracy:.1f}% ({matches}/{len(text)} characters) |
|
|
|
|
|
π **Note**: Reconstruction quality may decrease for texts > 48 bytes due to sliding window processing.""" |
|
|
|
|
|
return stats, details, full_reconstruction |
|
|
|
|
|
except Exception as e: |
|
|
return f"Error: {str(e)}", "", "" |
|
|
|
|
|
|
|
|
tokenizer = B2NLTokenizer() |
|
|
|
|
|
|
|
|
with gr.Blocks(title="B2NL v6.2.1", theme=gr.themes.Soft()) as app: |
|
|
gr.Markdown(""" |
|
|
    # B2NL-IntelligentTokenizer v6.2.1
|
|
|
|
|
    ## What is this model?
|
|
|
|
|
**B2NL (Byte-to-Natural Language)** is a progressive tokenizer that converts raw text into compressed semantic embeddings. |
|
|
Unlike traditional tokenizers that use fixed vocabularies, B2NL learns directly from bytes and generates dense embeddings |
|
|
that capture semantic meaning while achieving 16:1 compression. |
|
|
|
|
|
    ### How the 16:1 Compression Works
|
|
|
|
|
    ```
    Input: 48 bytes (including padding/special tokens)
            ↓
    Processing: Byte-level analysis with learned boundaries
            ↓
    Output: 3 embedding vectors (1280-dim each)
    ```
|
|
|
|
|
**Key Innovation**: The model learns to identify **semantic boundaries** within the 48-byte window. |
|
|
Instead of splitting at arbitrary points, it discovers natural language units (words, morphemes, phrases) |
|
|
and encodes them into meaningful embeddings. This is why "Hello, world!" (13 bytes) still generates |
|
|
3 embeddings - the model pads to 48 bytes but learns which parts contain actual information. |
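
    The chunk and embedding counts follow directly from this scheme. A minimal sketch, mirroring
    the arithmetic this demo itself uses (not the model internals):

    ```
    text_bytes = len(text.encode("utf-8"))
    chunks     = 1 if text_bytes <= 48 else 1 + math.ceil((text_bytes - 48) / 40)   # 8-byte overlap
    embeddings = chunks * 3    # "Hello, world!" -> 13 bytes -> 1 chunk -> 3 embeddings
    ```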
|
|
|
|
|
    ### Why This Matters
|
|
|
|
|
1. **Semantic Preservation**: Unlike byte-pair encoding (BPE) which can split words arbitrarily, |
|
|
B2NL respects semantic boundaries learned from data. |
|
|
|
|
|
    2. **Language Agnostic**: No vocabulary needed - works equally well for all 204 languages.
       Korean "안녕하세요" and English "Hello" are processed the same way.
|
|
|
|
|
3. **Predictable Costs**: Always 16:1 compression means predictable API costs for LLMs. |
|
|
       48 bytes → 3 embeddings, always.
|
|
|
|
|
4. **Inter-modal Bridge**: These embeddings can be used as a universal representation |
|
|
       for cross-modal tasks (text↔image, text↔audio, etc.)
|
|
|
|
|
    ### Real-World Applications
|
|
|
|
|
- **LLM Cost Reduction**: 75% fewer tokens = 75% cost savings on API calls |
|
|
- **Multilingual Search**: Single embedding space for 204 languages |
|
|
- **Edge AI**: Compressed representations for bandwidth-limited IoT devices |
|
|
- **Cross-modal AI**: Universal embeddings for multimodal models |
|
|
|
|
|
    ### Technical Architecture
|
|
|
|
|
- **Encoder**: 6 layers, progressive dimension reduction |
|
|
- **Decoder**: 6 layers with cross-attention, reconstructs from embeddings |
|
|
- **Boundary Learning**: Gumbel-Softmax for differentiable boundary detection |
|
|
- **Total Parameters**: 244.7M (137.9M encoder + 106.8M decoder) |
|
|
- **Training**: FLORES-200 (204 languages), 100 epochs, teacher forcing |
|
|
|
|
|
    ### ⚠️ Current Limitations
|
|
|
|
|
- **Mode**: Autoregressive (teacher forcing only) - ~500ms per generation |
|
|
- **Long Texts**: Quality decreases for texts > 48 bytes (sliding window limitation) |
|
|
- **Coming Soon**: Non-autoregressive training (November 2025) for 10x speedup |
|
|
|
|
|
--- |
|
|
""") |
|
|
|
|
|
with gr.Tab("π Reconstruction Test"): |
|
|
gr.Markdown(""" |
|
|
Test how well the model compresses and reconstructs text. The model processes text in 48-byte chunks, |
|
|
generating 3 embedding vectors per chunk. For longer texts, it uses a sliding window with 8-byte overlap. |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
input_text = gr.Textbox( |
|
|
label="Input Text", |
|
|
placeholder="Enter any text in any of 204 languages...", |
|
|
lines=5 |
|
|
) |
|
|
|
|
|
gr.Examples( |
|
|
examples=[ |
|
|
|
|
|
"Hello, world! How are you today?", |
|
|
"μλ
νμΈμ, λ°κ°μ΅λλ€. μ€λ λ μ¨κ° μ’λ€μ.", |
|
|
"δ½ ε₯½δΈηοΌδ»ε€©ε€©ζ°εΎε₯½γ", |
|
|
"γγγ«γ‘γ―δΈηοΌδ»ζ₯γ―γγ倩ζ°γ§γγγ", |
|
|
"Bonjour le monde! Comment allez-vous?", |
|
|
"Hola mundo! ΒΏCΓ³mo estΓ‘s hoy?", |
|
|
"ΠΡΠΈΠ²Π΅Ρ ΠΌΠΈΡ! ΠΠ°ΠΊ Π΄Π΅Π»Π°?", |
|
|
"Ω
Ψ±ΨΨ¨Ψ§ Ψ¨Ψ§ΩΨΉΨ§ΩΩ
! ΩΩΩ ΨΨ§ΩΩ Ψ§ΩΩΩΩ
Ψ", |
|
|
|
|
|
"Short", |
|
|
"This is exactly 48 bytes of text for one chunk!", |
|
|
"This is a longer text that exceeds 48 bytes and will need multiple chunks with sliding window processing.", |
|
|
], |
|
|
inputs=input_text, |
|
|
label="Example texts (various lengths and languages)" |
|
|
) |
|
|
|
|
|
temperature = gr.Slider( |
|
|
minimum=0.1, |
|
|
maximum=1.0, |
|
|
value=0.1, |
|
|
step=0.1, |
|
|
label="Temperature (0.1 = Most accurate, 1.0 = More creative)" |
|
|
) |
|
|
|
|
|
                process_btn = gr.Button("Compress & Reconstruct", variant="primary", size="lg")
|
|
|
|
|
with gr.Column(): |
|
|
stats_output = gr.Markdown(label="Statistics") |
|
|
details_output = gr.Markdown(label="Details") |
|
|
|
|
|
with gr.Tab("π Batch Test"): |
|
|
gr.Markdown(""" |
|
|
Test multiple texts at once to compare compression across different languages and lengths. |
|
|
Each text is processed independently, showing how the fixed 16:1 compression works across languages. |
|
|
""") |
|
|
|
|
|
batch_input = gr.Textbox( |
|
|
label="Enter multiple texts (one per line)", |
|
|
placeholder="Enter texts in different languages...\nOne text per line", |
|
|
lines=10, |
|
|
value="""The quick brown fox jumps over the lazy dog. |
|
|
안녕하세요, 반갑습니다. 오늘 날씨가 정말 좋네요.
|
|
你好世界！今天天气很好，我们一起去散步吧。
|
|
こんにちは世界！今日はいい天気ですね。散歩に行きましょう。
|
|
Bonjour le monde! Comment allez-vous aujourd'hui? |
|
|
مرحبا بالعالم! كيف حالك اليوم؟ الطقس جميل جداً.
|
|
Привет мир! Как дела? Погода сегодня прекрасная!
|
|
This text is exactly 48 bytes long, for testing!
|
|
Short text |
|
|
A much longer text that definitely exceeds 48 bytes and will require sliding window processing with 8-byte overlaps between chunks.""" |
|
|
) |
|
|
|
|
|
        batch_btn = gr.Button("Process Batch", variant="primary")
|
|
batch_output = gr.Dataframe( |
|
|
headers=["Text", "Language", "Bytes", "Chunks", "Embeddings", "Accuracy"], |
|
|
label="Batch Results" |
|
|
) |
|
|
|
|
|
with gr.Tab("π Documentation"): |
|
|
gr.Markdown(""" |
|
|
## Understanding B2NL Tokenization |
|
|
|
|
|
        ### The Core Innovation: Learned Semantic Boundaries
|
|
|
|
|
Traditional tokenizers use fixed rules (BPE, WordPiece) that can split words arbitrarily. |
|
|
B2NL learns to identify **semantic units** within byte sequences: |
|
|
|
|
|
        ```
        Traditional BPE: "안녕하세요" → "안", "녕", "하", "세", "요" (5 tokens)
        B2NL:            "안녕하세요" → [emb1, emb2, emb3] (3 embeddings capturing full meaning)
        ```
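
        Because the model reads raw bytes, there is no vocabulary lookup at any point - the encoder's
        input is simply the UTF-8 byte sequence (a quick illustration, independent of the model):

        ```
        list("Hello".encode("utf-8"))   # [72, 101, 108, 108, 111] - five raw byte values, no token IDs
        ```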
|
|
|
|
|
        ### The 48-Byte → 3 Embeddings Architecture
|
|
|
|
|
        ```
        [48 bytes input] → [Encoder] → [3 × 1280-dim embeddings] → [Decoder] → [48 bytes output]
               ↑                                   ↑
          (with padding)               (semantic compression)
        ```
|
|
|
|
|
**Why 48 bytes?** |
|
|
- Optimal for GPU parallelization (divisible by 8, 16, 24) |
|
|
- Captures most words/phrases in any language |
|
|
- Allows consistent 16:1 compression ratio |
|
|
|
|
|
**Why 3 embeddings?** |
|
|
- Matches typical semantic units in 48-byte window |
|
|
- Provides redundancy for robust reconstruction |
|
|
- Optimal for transformer cross-attention |
|
|
|
|
|
        ### Language-Agnostic Processing
|
|
|
|
|
The model treats all languages equally at the byte level: |
|
|
|
|
|
| Language | Sample Text | Bytes | Embeddings | Compression | |
|
|
|----------|------------|-------|------------|-------------| |
|
|
| English | "Hello" | 5 (+43 pad) | 3 | 16:1 | |
|
|
        | Korean | "안녕하세요" | 15 (+33 pad) | 3 | 16:1 |
|
|
        | Chinese | "你好世界" | 12 (+36 pad) | 3 | 16:1 |
|
|
        | Arabic | "مرحبا" | 10 (+38 pad) | 3 | 16:1 |
|
|
|
|
|
All get compressed to 3 embeddings, but the model learns which parts contain information. |
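
        The byte counts in the table are plain UTF-8 lengths, which you can verify independently of
        the model (a quick check):

        ```
        len("Hello".encode("utf-8"))        # 5
        len("안녕하세요".encode("utf-8"))    # 15
        len("你好世界".encode("utf-8"))      # 12
        len("مرحبا".encode("utf-8"))        # 10
        ```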
|
|
|
|
|
        ### Sliding Window for Long Texts
|
|
|
|
|
For texts exceeding 48 bytes: |
|
|
        ```
        Text: "This is a very long sentence that exceeds 48 bytes..."

        Chunk 1: [Bytes 0-47]   → 3 embeddings
                 ↓ (8-byte overlap)
        Chunk 2: [Bytes 40-87]  → 3 embeddings
                 ↓ (8-byte overlap)
        Chunk 3: [Bytes 80-127] → 3 embeddings
        ```
|
|
|
|
|
The 8-byte overlap preserves context across boundaries, preventing word splits. |
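
        Equivalently, chunk k (counting from 0) covers bytes [40*k, 40*k + 48). A small sketch of the
        offsets implied by the diagram above (illustrative, not the model's internal API):

        ```
        stride  = 48 - 8                                      # 40-byte advance per chunk
        offsets = [stride * k for k in range(num_chunks)]     # 0, 40, 80, ...
        ```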
|
|
|
|
|
### Current Limitations |
|
|
|
|
|
1. **Speed**: ~500ms per generation (autoregressive mode) |
|
|
2. **Long Texts**: Quality decreases with multiple chunks |
|
|
3. **Training**: Only teacher forcing, no autoregressive training yet |
|
|
|
|
|
### Upcoming Improvements (November 2025) |
|
|
|
|
|
- **Non-autoregressive training**: 10x speed improvement |
|
|
- **Better long text handling**: Improved sliding window |
|
|
- **Streaming support**: Real-time processing |
|
|
|
|
|
--- |
|
|
|
|
|
**Author**: Jinhyun Woo |
|
|
**Paper**: [Zenodo](https://zenodo.org/records/17116281) |
|
|
**GitHub**: [Woojiggun/intelligent-tokenizer](https://github.com/Woojiggun/intelligent-tokenizer) |
|
|
**Model**: [ggunio/B2NL-IntelligentTokenizer-v6.2.1](https://huggingface.co/ggunio/B2NL-IntelligentTokenizer-v6.2.1) |
|
|
""") |
|
|
|
|
|
|
|
|
    process_btn.click(
        # process_text returns (stats, details, reconstruction); only the first two are displayed
        fn=lambda text, temp: tokenizer.process_text(text, temp)[:2],
        inputs=[input_text, temperature],
        outputs=[stats_output, details_output]
    )
|
|
|
|
|
def process_batch(texts): |
|
|
if not texts: |
|
|
return [] |
|
|
|
|
|
results = [] |
|
|
for text in texts.strip().split('\n'): |
|
|
if not text.strip(): |
|
|
continue |
|
|
|
|
|
|
|
|
text = text.strip() |
|
|
text_bytes = len(text.encode('utf-8')) |
|
|
|
|
|
|
|
|
if text_bytes <= 48: |
|
|
num_chunks = 1 |
|
|
num_embeddings = 3 |
|
|
else: |
|
|
num_chunks = 1 + math.ceil((text_bytes - 48) / 40) |
|
|
num_embeddings = num_chunks * 3 |
|
|
|
|
|
|
|
|
stats, details, reconstructed = tokenizer.process_text(text, 0.1) |
|
|
|
|
|
|
|
|
            # Rough script detection from Unicode ranges (first match wins; Hiragana is checked
            # before the CJK range so Japanese text is not classified as Chinese)
            if any(0x3040 <= ord(c) <= 0x309F for c in text):        # Hiragana
                lang = "Japanese"
            elif any(0xAC00 <= ord(c) <= 0xD7AF for c in text):      # Hangul syllables
                lang = "Korean"
            elif any(0x4E00 <= ord(c) <= 0x9FFF for c in text):      # CJK Unified Ideographs
                lang = "Chinese"
            elif any(0x0600 <= ord(c) <= 0x06FF for c in text):      # Arabic
                lang = "Arabic"
            elif any(0x0400 <= ord(c) <= 0x04FF for c in text):      # Cyrillic
                lang = "Russian"
            else:
                lang = "English/Latin"
|
|
|
|
|
|
|
|
if "Error" not in stats: |
|
|
min_len = min(len(text), len(reconstructed)) |
|
|
matches = sum(1 for i in range(min_len) if text[i] == reconstructed[i]) |
|
|
accuracy = (matches / len(text)) * 100 if text else 0 |
|
|
|
|
|
results.append([ |
|
|
text[:50] + "..." if len(text) > 50 else text, |
|
|
lang, |
|
|
text_bytes, |
|
|
num_chunks, |
|
|
num_embeddings, |
|
|
f"{accuracy:.1f}%" |
|
|
]) |
|
|
|
|
|
return results |
|
|
|
|
|
batch_btn.click( |
|
|
fn=process_batch, |
|
|
inputs=batch_input, |
|
|
outputs=batch_output |
|
|
) |
|
|
|
|
|
if __name__ == "__main__": |
|
|
app.launch() |