|
|
""" |
|
|
B2NL-IntelligentTokenizer v6.2.1 - Progressive Byte-to-Natural Language Tokenizer |
|
|
|
|
|
⚠️ IMPORTANT: Currently in AUTOREGRESSIVE MODE (Teacher Forcing Training)
|
|
- Current: ~500ms inference (accurate but slow) |
|
|
- Coming Soon (November 2025): Non-autoregressive training (<50ms, 10x faster) |
|
|
|
|
|
Purpose: Embedding Preprocessing Model for Inter-modal Communication
|
|
This model serves as a preprocessing layer that converts raw text into compressed |
|
|
semantic embeddings, enabling efficient inter-modal communication between different |
|
|
AI systems. By separating language understanding from task-specific inference, |
|
|
it provides a universal representation layer for multi-modal AI applications. |
|
|
|
|
|
Key Features: |
|
|
- Fixed 16:1 compression ratio (48 bytes → 3 embeddings per chunk)
|
|
- Byte-level processing (no vocabulary required) |
|
|
- 204 language support via FLORES-200 training |
|
|
- Sliding window for texts > 48 bytes |
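
Usage sketch (illustrative; assumes the checkpoint resolved by load_model() below is available):

    tokenizer = B2NLTokenizer()
    stats, details, reconstructed = tokenizer.process_text("Hello, world!", temperature=0.1)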
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import torch |
|
|
import sys |
|
|
import io |
|
|
import time |
|
|
import math |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
# Ensure stdout/stderr can emit UTF-8 on Windows consoles
if sys.platform == 'win32':
|
|
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') |
|
|
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') |
|
|
|
|
|
|
|
|
sys.path.insert(0, 'core')  # local modules providing IntelligentTokenizerV62 and ByteTokenizerV62
|
|
|
|
|
from unified_model import IntelligentTokenizerV62 |
|
|
from tokenizer import ByteTokenizerV62 |
|
|
|
|
|
class B2NLTokenizer: |
|
|
def __init__(self): |
|
|
self.model = None |
|
|
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') |
|
|
self.load_model() |
|
|
|
|
|
def load_model(self): |
|
|
"""Load model from HuggingFace or local""" |
|
|
try: |
|
|
|
|
|
from huggingface_hub import hf_hub_download |
|
|
checkpoint_path = hf_hub_download( |
|
|
repo_id="ggunio/B2NL-IntelligentTokenizer-v6.2.1", |
|
|
filename="pytorch_model.bin" |
|
|
) |
|
|
print(f"Loading from HuggingFace") |
|
|
        except Exception:
|
|
|
|
|
checkpoint_paths = [ |
|
|
"pytorch_model.bin", |
|
|
"checkpoints/v62/16.0/epoch_100.pt", |
|
|
"D:/intelligent-tokenizer/intelligent-tokenizer_v6.2.1/checkpoints/v62/16.0/epoch_100.pt" |
|
|
] |
|
|
checkpoint_path = None |
|
|
for path in checkpoint_paths: |
|
|
if Path(path).exists(): |
|
|
checkpoint_path = path |
|
|
break |
|
|
|
|
|
if not checkpoint_path: |
|
|
print("β Model not found") |
|
|
return |
|
|
|
|
|
|
|
|
self.model = IntelligentTokenizerV62() |
|
|
checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False) |
|
|
|
|
|
if 'model_state_dict' in checkpoint: |
|
|
self.model.load_state_dict(checkpoint['model_state_dict']) |
|
|
else: |
|
|
self.model.load_state_dict(checkpoint) |
|
|
|
|
|
self.model = self.model.to(self.device) |
|
|
self.model.eval() |
|
|
print(f"β
Model loaded on {self.device}") |
|
|
|
|
|
def process_text(self, text, temperature=0.1): |
|
|
"""Process text and return detailed results""" |
|
|
if not self.model or not text: |
|
|
return "Please enter text", "", "" |
|
|
|
|
|
try: |
|
|
start_time = time.time() |
|
|
|
|
|
|
|
|
text_bytes = len(text.encode('utf-8')) |
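            # Chunking scheme (mirrored in the stats shown below): the first chunk covers 48 bytes,
            # and each further chunk advances by 40 bytes (48-byte window, 8-byte overlap).
            # Worked example: 130 bytes -> 1 + ceil((130 - 48) / 40) = 4 chunks -> 12 embeddings.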
|
|
|
|
|
|
|
|
if text_bytes <= 48: |
|
|
num_chunks = 1 |
|
|
num_embeddings = 3 |
|
|
else: |
|
|
|
|
|
num_chunks = 1 + math.ceil((text_bytes - 48) / 40) |
|
|
num_embeddings = num_chunks * 3 |
|
|
|
|
|
|
|
|
with torch.no_grad(): |
|
|
|
|
|
                max_gen_length = max(48, min(len(text) + 10, 512))  # at least one full chunk, small margin over the input, hard cap at 512
|
|
|
|
|
reconstructed = self.model.generate(text, temperature=temperature, max_length=max_gen_length) |
|
|
|
|
|
|
|
|
            # Single-chunk and multi-chunk (sliding window) inputs take the same path here
            full_reconstruction = reconstructed
|
|
|
|
|
elapsed_time = (time.time() - start_time) * 1000 |
|
|
|
|
|
|
|
|
min_len = min(len(text), len(full_reconstruction)) |
|
|
matches = sum(1 for i in range(min_len) if text[i] == full_reconstruction[i]) |
|
|
accuracy = (matches / len(text)) * 100 if text else 0 |
|
|
|
|
|
|
|
|
stats = f"""π **Compression Statistics** |
|
|
β’ Input: {text_bytes} bytes ({len(text)} chars) |
|
|
β’ Chunks: {num_chunks} chunk{"s" if num_chunks > 1 else ""} (48-byte chunks with 8-byte overlap for long texts) |
|
|
β’ Embeddings generated: {num_embeddings} embedding vectors (3 per chunk) |
|
|
β’ Compression ratio: 16:1 fixed (48 bytes β 3 embeddings) |
|
|
β’ Processing time: {elapsed_time:.1f}ms (autoregressive mode - slow) |
|
|
β’ Reconstruction accuracy: {accuracy:.1f}% |
|
|
|
|
|
β οΈ **Current Mode**: Autoregressive (Teacher Forcing training only) |
|
|
β’ Speed: ~500ms per generation |
|
|
β’ Coming: Non-autoregressive training (10x faster)""" |
|
|
|
|
|
details = f"""π€ **Original Text** ({len(text)} chars, {text_bytes} bytes): |
|
|
{text} |
|
|
|
|
|
π **Reconstructed Text** ({len(full_reconstruction)} chars): |
|
|
{full_reconstruction} |
|
|
|
|
|
β
**Match Rate**: {accuracy:.1f}% ({matches}/{len(text)} characters) |
|
|
|
|
|
π **Note**: Reconstruction quality may decrease for texts > 48 bytes due to sliding window processing.""" |
|
|
|
|
|
return stats, details, full_reconstruction |
|
|
|
|
|
except Exception as e: |
|
|
return f"Error: {str(e)}", "", "" |
|
|
|
|
|
|
|
|
tokenizer = B2NLTokenizer() |
|
|
|
|
|
|
|
|
with gr.Blocks(title="B2NL v6.2.1", theme=gr.themes.Soft()) as app: |
|
|
gr.Markdown(""" |
|
|
    # B2NL-IntelligentTokenizer v6.2.1
|
|
|
|
|
    ## What is this model?
|
|
|
|
|
**B2NL (Byte-to-Natural Language)** is a progressive tokenizer that converts raw text into compressed semantic embeddings. |
|
|
Unlike traditional tokenizers that use fixed vocabularies, B2NL learns directly from bytes and generates dense embeddings |
|
|
that capture semantic meaning while achieving 16:1 compression. |
|
|
|
|
|
    ### How the 16:1 Compression Works
|
|
|
|
|
    ```
    Input: 48 bytes (including padding/special tokens)
            ↓
    Processing: Byte-level analysis with learned boundaries
            ↓
    Output: 3 embedding vectors (1280-dim each)
    ```
|
|
|
|
|
**Key Innovation**: The model learns to identify **semantic boundaries** within the 48-byte window. |
|
|
Instead of splitting at arbitrary points, it discovers natural language units (words, morphemes, phrases) |
|
|
and encodes them into meaningful embeddings. This is why "Hello, world!" (13 bytes) still generates |
|
|
3 embeddings - the model pads to 48 bytes but learns which parts contain actual information. |
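
    The chunk and embedding counts follow directly from this scheme. A minimal sketch, mirroring
    the arithmetic this demo itself uses (not the model internals):

    ```
    text_bytes = len(text.encode("utf-8"))
    chunks     = 1 if text_bytes <= 48 else 1 + math.ceil((text_bytes - 48) / 40)   # 8-byte overlap
    embeddings = chunks * 3    # "Hello, world!" -> 13 bytes -> 1 chunk -> 3 embeddings
    ```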
|
|
|
|
|
    ### Why This Matters
|
|
|
|
|
1. **Semantic Preservation**: Unlike byte-pair encoding (BPE) which can split words arbitrarily, |
|
|
B2NL respects semantic boundaries learned from data. |
|
|
|
|
|
    2. **Language Agnostic**: No vocabulary needed - works equally well for all 204 languages.
       Korean "안녕하세요" and English "Hello" are processed the same way.
|
|
|
|
|
3. **Predictable Costs**: Always 16:1 compression means predictable API costs for LLMs. |
|
|
       48 bytes → 3 embeddings, always.
|
|
|
|
|
4. **Inter-modal Bridge**: These embeddings can be used as a universal representation |
|
|
       for cross-modal tasks (text↔image, text↔audio, etc.)
|
|
|
|
|
    ### Real-World Applications
|
|
|
|
|
- **LLM Cost Reduction**: 75% fewer tokens = 75% cost savings on API calls |
|
|
- **Multilingual Search**: Single embedding space for 204 languages |
|
|
- **Edge AI**: Compressed representations for bandwidth-limited IoT devices |
|
|
- **Cross-modal AI**: Universal embeddings for multimodal models |
|
|
|
|
|
    ### Technical Architecture
|
|
|
|
|
- **Encoder**: 6 layers, progressive dimension reduction |
|
|
- **Decoder**: 6 layers with cross-attention, reconstructs from embeddings |
|
|
- **Boundary Learning**: Gumbel-Softmax for differentiable boundary detection |
|
|
- **Total Parameters**: 244.7M (137.9M encoder + 106.8M decoder) |
|
|
- **Training**: FLORES-200 (204 languages), 100 epochs, teacher forcing |
|
|
|
|
|
    ### ⚠️ Current Limitations
|
|
|
|
|
- **Mode**: Autoregressive (teacher forcing only) - ~500ms per generation |
|
|
- **Long Texts**: Quality decreases for texts > 48 bytes (sliding window limitation) |
|
|
- **Coming Soon**: Non-autoregressive training (November 2025) for 10x speedup |
|
|
|
|
|
--- |
|
|
""") |
|
|
|
|
|
with gr.Tab("π Reconstruction Test"): |
|
|
gr.Markdown(""" |
|
|
Test how well the model compresses and reconstructs text. The model processes text in 48-byte chunks, |
|
|
generating 3 embedding vectors per chunk. For longer texts, it uses a sliding window with 8-byte overlap. |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
input_text = gr.Textbox( |
|
|
label="Input Text", |
|
|
placeholder="Enter any text in any of 204 languages...", |
|
|
lines=5 |
|
|
) |
|
|
|
|
|
gr.Examples( |
|
|
examples=[ |
|
|
|
|
|
"Hello, world! How are you today?", |
|
|
"μλ
νμΈμ, λ°κ°μ΅λλ€. μ€λ λ μ¨κ° μ’λ€μ.", |
|
|
"δ½ ε₯½δΈηοΌδ»ε€©ε€©ζ°εΎε₯½γ", |
|
|
"γγγ«γ‘γ―δΈηοΌδ»ζ₯γ―γγ倩ζ°γ§γγγ", |
|
|
"Bonjour le monde! Comment allez-vous?", |
|
|
"Hola mundo! ΒΏCΓ³mo estΓ‘s hoy?", |
|
|
"ΠΡΠΈΠ²Π΅Ρ ΠΌΠΈΡ! ΠΠ°ΠΊ Π΄Π΅Π»Π°?", |
|
|
"Ω
Ψ±ΨΨ¨Ψ§ Ψ¨Ψ§ΩΨΉΨ§ΩΩ
! ΩΩΩ ΨΨ§ΩΩ Ψ§ΩΩΩΩ
Ψ", |
|
|
|
|
|
"Short", |
|
|
"This is exactly 48 bytes of text for one chunk!", |
|
|
"This is a longer text that exceeds 48 bytes and will need multiple chunks with sliding window processing.", |
|
|
], |
|
|
inputs=input_text, |
|
|
label="Example texts (various lengths and languages)" |
|
|
) |
|
|
|
|
|
temperature = gr.Slider( |
|
|
minimum=0.1, |
|
|
maximum=1.0, |
|
|
value=0.1, |
|
|
step=0.1, |
|
|
label="Temperature (0.1 = Most accurate, 1.0 = More creative)" |
|
|
) |
|
|
|
|
|
                process_btn = gr.Button("Compress & Reconstruct", variant="primary", size="lg")
|
|
|
|
|
with gr.Column(): |
|
|
stats_output = gr.Markdown(label="Statistics") |
|
|
details_output = gr.Markdown(label="Details") |
|
|
|
|
|
with gr.Tab("π Batch Test"): |
|
|
gr.Markdown(""" |
|
|
Test multiple texts at once to compare compression across different languages and lengths. |
|
|
Each text is processed independently, showing how the fixed 16:1 compression works across languages. |
|
|
""") |
|
|
|
|
|
batch_input = gr.Textbox( |
|
|
label="Enter multiple texts (one per line)", |
|
|
placeholder="Enter texts in different languages...\nOne text per line", |
|
|
lines=10, |
|
|
value="""The quick brown fox jumps over the lazy dog. |
|
|
안녕하세요, 반갑습니다. 오늘 날씨가 정말 좋네요.
|
|
你好世界！今天天气很好，我们一起去散步吧。
|
|
こんにちは世界！今日はいい天気ですね。散歩に行きましょう。
|
|
Bonjour le monde! Comment allez-vous aujourd'hui? |
|
|
مرحبا بالعالم! كيف حالك اليوم؟ الطقس جميل جداً.
|
|
Привет мир! Как дела? Погода сегодня прекрасная!
|
|
This text is exactly 48 bytes long, for testing!
|
|
Short text |
|
|
A much longer text that definitely exceeds 48 bytes and will require sliding window processing with 8-byte overlaps between chunks.""" |
|
|
) |
|
|
|
|
|
        batch_btn = gr.Button("Process Batch", variant="primary")
|
|
batch_output = gr.Dataframe( |
|
|
headers=["Text", "Language", "Bytes", "Chunks", "Embeddings", "Accuracy"], |
|
|
label="Batch Results" |
|
|
) |
|
|
|
|
|
with gr.Tab("π Documentation"): |
|
|
gr.Markdown(""" |
|
|
## Understanding B2NL Tokenization |
|
|
|
|
|
        ### The Core Innovation: Learned Semantic Boundaries
|
|
|
|
|
Traditional tokenizers use fixed rules (BPE, WordPiece) that can split words arbitrarily. |
|
|
B2NL learns to identify **semantic units** within byte sequences: |
|
|
|
|
|
        ```
        Traditional BPE: "안녕하세요" → "안", "녕", "하", "세", "요" (5 tokens)
        B2NL:            "안녕하세요" → [emb1, emb2, emb3] (3 embeddings capturing full meaning)
        ```
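
        Because the model reads raw bytes, there is no vocabulary lookup at any point - the encoder's
        input is simply the UTF-8 byte sequence (a quick illustration, independent of the model):

        ```
        list("Hello".encode("utf-8"))   # [72, 101, 108, 108, 111] - five raw byte values, no token IDs
        ```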
|
|
|
|
|
        ### The 48-Byte → 3 Embeddings Architecture
|
|
|
|
|
        ```
        [48 bytes input] → [Encoder] → [3 × 1280-dim embeddings] → [Decoder] → [48 bytes output]
               ↑                                   ↑
          (with padding)               (semantic compression)
        ```
|
|
|
|
|
**Why 48 bytes?** |
|
|
- Optimal for GPU parallelization (divisible by 8, 16, 24) |
|
|
- Captures most words/phrases in any language |
|
|
- Allows consistent 16:1 compression ratio |
|
|
|
|
|
**Why 3 embeddings?** |
|
|
- Matches typical semantic units in 48-byte window |
|
|
- Provides redundancy for robust reconstruction |
|
|
- Optimal for transformer cross-attention |
|
|
|
|
|
        ### Language-Agnostic Processing
|
|
|
|
|
The model treats all languages equally at the byte level: |
|
|
|
|
|
| Language | Sample Text | Bytes | Embeddings | Compression | |
|
|
|----------|------------|-------|------------|-------------| |
|
|
| English | "Hello" | 5 (+43 pad) | 3 | 16:1 | |
|
|
        | Korean | "안녕하세요" | 15 (+33 pad) | 3 | 16:1 |
|
|
        | Chinese | "你好世界" | 12 (+36 pad) | 3 | 16:1 |
|
|
        | Arabic | "مرحبا" | 10 (+38 pad) | 3 | 16:1 |
|
|
|
|
|
All get compressed to 3 embeddings, but the model learns which parts contain information. |
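
        The byte counts in the table are plain UTF-8 lengths, which you can verify independently of
        the model (a quick check):

        ```
        len("Hello".encode("utf-8"))        # 5
        len("안녕하세요".encode("utf-8"))    # 15
        len("你好世界".encode("utf-8"))      # 12
        len("مرحبا".encode("utf-8"))        # 10
        ```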
|
|
|
|
|
        ### Sliding Window for Long Texts
|
|
|
|
|
For texts exceeding 48 bytes: |
|
|
        ```
        Text: "This is a very long sentence that exceeds 48 bytes..."

        Chunk 1: [Bytes 0-47]   → 3 embeddings
                 ↓ (8-byte overlap)
        Chunk 2: [Bytes 40-87]  → 3 embeddings
                 ↓ (8-byte overlap)
        Chunk 3: [Bytes 80-127] → 3 embeddings
        ```
|
|
|
|
|
The 8-byte overlap preserves context across boundaries, preventing word splits. |
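
        Equivalently, chunk k (counting from 0) covers bytes [40*k, 40*k + 48). A small sketch of the
        offsets implied by the diagram above (illustrative, not the model's internal API):

        ```
        stride  = 48 - 8                                      # 40-byte advance per chunk
        offsets = [stride * k for k in range(num_chunks)]     # 0, 40, 80, ...
        ```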
|
|
|
|
|
### Current Limitations |
|
|
|
|
|
1. **Speed**: ~500ms per generation (autoregressive mode) |
|
|
2. **Long Texts**: Quality decreases with multiple chunks |
|
|
3. **Training**: Only teacher forcing, no autoregressive training yet |
|
|
|
|
|
### Upcoming Improvements (November 2025) |
|
|
|
|
|
- **Non-autoregressive training**: 10x speed improvement |
|
|
- **Better long text handling**: Improved sliding window |
|
|
- **Streaming support**: Real-time processing |
|
|
|
|
|
--- |
|
|
|
|
|
**Author**: Jinhyun Woo |
|
|
**Paper**: [Zenodo](https://zenodo.org/records/17116281) |
|
|
**GitHub**: [Woojiggun/intelligent-tokenizer](https://github.com/Woojiggun/intelligent-tokenizer) |
|
|
**Model**: [ggunio/B2NL-IntelligentTokenizer-v6.2.1](https://huggingface.co/ggunio/B2NL-IntelligentTokenizer-v6.2.1) |
|
|
""") |
|
|
|
|
|
|
|
|
    process_btn.click(
        # process_text returns (stats, details, reconstruction); only the first two are displayed
        fn=lambda text, temp: tokenizer.process_text(text, temp)[:2],
        inputs=[input_text, temperature],
        outputs=[stats_output, details_output]
    )
|
|
|
|
|
def process_batch(texts): |
|
|
if not texts: |
|
|
return [] |
|
|
|
|
|
results = [] |
|
|
for text in texts.strip().split('\n'): |
|
|
if not text.strip(): |
|
|
continue |
|
|
|
|
|
|
|
|
text = text.strip() |
|
|
text_bytes = len(text.encode('utf-8')) |
|
|
|
|
|
|
|
|
if text_bytes <= 48: |
|
|
num_chunks = 1 |
|
|
num_embeddings = 3 |
|
|
else: |
|
|
num_chunks = 1 + math.ceil((text_bytes - 48) / 40) |
|
|
num_embeddings = num_chunks * 3 |
|
|
|
|
|
|
|
|
stats, details, reconstructed = tokenizer.process_text(text, 0.1) |
|
|
|
|
|
|
|
|
            # Rough script detection from Unicode ranges (first match wins; Hiragana is checked
            # before the CJK range so Japanese text is not classified as Chinese)
            if any(0x3040 <= ord(c) <= 0x309F for c in text):        # Hiragana
                lang = "Japanese"
            elif any(0xAC00 <= ord(c) <= 0xD7AF for c in text):      # Hangul syllables
                lang = "Korean"
            elif any(0x4E00 <= ord(c) <= 0x9FFF for c in text):      # CJK Unified Ideographs
                lang = "Chinese"
            elif any(0x0600 <= ord(c) <= 0x06FF for c in text):      # Arabic
                lang = "Arabic"
            elif any(0x0400 <= ord(c) <= 0x04FF for c in text):      # Cyrillic
                lang = "Russian"
            else:
                lang = "English/Latin"
|
|
|
|
|
|
|
|
if "Error" not in stats: |
|
|
min_len = min(len(text), len(reconstructed)) |
|
|
matches = sum(1 for i in range(min_len) if text[i] == reconstructed[i]) |
|
|
accuracy = (matches / len(text)) * 100 if text else 0 |
|
|
|
|
|
results.append([ |
|
|
text[:50] + "..." if len(text) > 50 else text, |
|
|
lang, |
|
|
text_bytes, |
|
|
num_chunks, |
|
|
num_embeddings, |
|
|
f"{accuracy:.1f}%" |
|
|
]) |
|
|
|
|
|
return results |
|
|
|
|
|
batch_btn.click( |
|
|
fn=process_batch, |
|
|
inputs=batch_input, |
|
|
outputs=batch_output |
|
|
) |
|
|
|
|
|
if __name__ == "__main__": |
|
|
app.launch() |