ggunio committed on
Commit 8ba68ce · verified · 1 Parent(s): 0cc32d2

Simplify interface: focus on reconstruction test with embedding count and compression stats

Files changed (1)
  1. app.py +218 -678
app.py CHANGED
@@ -1,730 +1,270 @@
1
  """
2
- B2NL-IntelligentTokenizer v6.2.1 - Gradio Demo
3
-
4
- ⚠️ IMPORTANT: Currently in AUTOREGRESSIVE MODE (Teacher Forcing Training)
5
- - Current: ~500ms inference (accurate but slow)
6
- - Coming Soon (November 2025): Non-autoregressive training (<50ms, 10x faster)
7
-
8
- 🚀 Progressive Byte-to-Natural Language Tokenizer with 16:1 Fixed Compression
9
- 📊 Embedding Preprocessing Model for Inter-modal Communication
10
- 🌐 Trained on FLORES-200 dataset supporting 204 languages
11
-
12
- Key Features:
13
- - Fixed 16:1 compression ratio (3 tokens per 48-byte chunk)
14
- - Autoregressive reconstruction with high accuracy
15
- - Sliding window processing for long texts
16
- - Real-time compression statistics
17
- - Multi-language support with semantic preservation
18
-
19
- Architecture:
20
- - Encoder: 4-layer transformer with progressive splitting
21
- - Decoder: 6-layer transformer with cross-attention
22
- - Total Parameters: 230.3M
23
- - Gumbel-Softmax for differentiable token selection
24
-
25
- Purpose:
26
- This model serves as a preprocessing layer that converts raw text into compressed
27
- semantic embeddings, enabling efficient inter-modal communication between different
28
- AI systems. By separating language understanding from task-specific inference,
29
- it provides a universal representation layer for multi-modal AI applications.
30
  """
31
 
32
  import gradio as gr
33
  import torch
34
- import torch.nn.functional as F
35
- import numpy as np
36
  import sys
37
  import io
38
- from pathlib import Path
39
  import time
40
- from typing import Dict, List, Tuple, Optional
41
- from difflib import SequenceMatcher
42
 
43
- # Fix Windows Unicode output
44
  if sys.platform == 'win32':
45
  sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
46
  sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
47
 
48
- # Add project paths
49
- sys.path.insert(0, str(Path(__file__).parent.parent.parent / "intelligent-tokenizer_v6.2.1"))
50
- sys.path.insert(0, str(Path(__file__).parent.parent.parent / "intelligent-tokenizer_v6.2.1/core"))
51
-
52
- try:
53
- from core.unified_model import IntelligentTokenizerV62
54
- from core.tokenizer import ByteTokenizerV62
55
- except ImportError:
56
- print("Warning: Could not import from core, trying alternative path...")
57
- from unified_model import IntelligentTokenizerV62
58
- from tokenizer import ByteTokenizerV62
59
-
60
- # Global variables
61
- model = None
62
- device = None
63
- tokenizer = None
64
-
65
- def load_model(checkpoint_path: str = None):
66
- """
67
- Load the trained B2NL-IntelligentTokenizer model
68
-
69
- This loads the checkpoint containing the trained weights from
70
- 100 epochs of training on the FLORES-200 dataset.
71
- """
72
- global model, device, tokenizer
73
-
74
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
75
- print(f"Using device: {device}")
76
-
77
- # Initialize model
78
- model = IntelligentTokenizerV62()
79
-
80
- # Load checkpoint if provided
81
- if checkpoint_path and Path(checkpoint_path).exists():
82
- print(f"Loading checkpoint from {checkpoint_path}")
83
- checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
84
- if 'model_state_dict' in checkpoint:
85
- model.load_state_dict(checkpoint['model_state_dict'])
86
- print(f"Loaded checkpoint from epoch {checkpoint.get('epoch', 'N/A')}")
87
- else:
88
- model.load_state_dict(checkpoint)
89
-
90
- model = model.to(device)
91
- model.eval()
92
-
93
- # Initialize tokenizer
94
- tokenizer = ByteTokenizerV62()
95
-
96
- # Count parameters
97
- total_params = sum(p.numel() for p in model.parameters())
98
- print(f"Model loaded successfully! Total parameters: {total_params/1e6:.1f}M")
99
-
100
- return model
101
-
102
- def autoregressive_generate(encoder_outputs, max_length=48):
103
- """
104
- Autoregressive generation from compressed embeddings
105
-
106
- This is the proper way to generate text from the compressed representation,
107
- using the decoder in autoregressive mode with teacher forcing disabled.
108
- """
109
- # Get all encoder hidden states (decoder needs all 4 layers for cross-attention)
110
- if 'all_hidden_states' in encoder_outputs:
111
- encoder_all_hidden = encoder_outputs['all_hidden_states']
112
- else:
113
- compressed = encoder_outputs.get('compressed', encoder_outputs.get('hidden_states'))
114
- encoder_all_hidden = [compressed] * 4
115
-
116
- batch_size = encoder_all_hidden[0].shape[0]
117
- device = encoder_all_hidden[0].device
118
-
119
- # Start with BOS token
120
- generated = torch.full((batch_size, 1), tokenizer.BOS, dtype=torch.long, device=device)
121
-
122
- # Generate tokens autoregressively
123
- for _ in range(max_length - 1):
124
- with torch.no_grad():
125
- gen_mask = torch.ones_like(generated, dtype=torch.bool)
126
-
127
- # Run decoder with current sequence
128
- decoder_outputs = model.decoder(
129
- encoder_all_hidden=encoder_all_hidden,
130
- decoder_input_ids=generated,
131
- attention_mask=gen_mask,
132
- use_cache=False
133
  )
134
-
135
- # Get logits for the last position
136
- logits = decoder_outputs['logits'][:, -1, :]
137
-
138
- # Sample next token (greedy decoding for best accuracy)
139
- next_token = torch.argmax(logits, dim=-1, keepdim=True)
140
-
141
- # Append to generated sequence
142
- generated = torch.cat([generated, next_token], dim=1)
143
-
144
- # Stop if EOS is generated
145
- if (next_token == tokenizer.EOS).all():
146
- break
147
-
148
- return generated
149
-
150
- def process_with_sliding_window(text: str,
151
- chunk_size: int = 46,
152
- overlap: int = 8) -> Dict:
153
- """
154
- Process long text with sliding window approach
155
-
156
- The model processes 48-byte chunks (46 content + 2 special tokens).
157
- For longer texts, we use an 8-byte overlap to maintain context.
158
-
159
- Args:
160
- text: Input text
161
- chunk_size: Size of each chunk (default 46 bytes)
162
- overlap: Overlap between chunks (default 8 bytes)
163
-
164
- Returns:
165
- Dictionary with chunks and metadata
166
- """
167
- text_bytes = text.encode('utf-8')
168
- total_bytes = len(text_bytes)
169
-
170
- chunks = []
171
- positions = []
172
-
173
- # Handle short text
174
- if total_bytes <= chunk_size:
175
- chunks.append(text)
176
- positions.append((0, total_bytes))
177
- else:
178
- # Sliding window processing
179
- pos = 0
180
- while pos < total_bytes:
181
- end_pos = min(pos + chunk_size, total_bytes)
182
-
183
- # Extract chunk with proper UTF-8 handling
184
- chunk_bytes = text_bytes[pos:end_pos]
185
-
186
- # Ensure valid UTF-8 boundary
187
- while end_pos > pos and end_pos < total_bytes:
188
- try:
189
- chunk_text = text_bytes[pos:end_pos].decode('utf-8')
190
  break
191
- except UnicodeDecodeError:
192
- end_pos -= 1
193
-
194
- chunk_text = text_bytes[pos:end_pos].decode('utf-8', errors='ignore')
195
- chunks.append(chunk_text)
196
- positions.append((pos, end_pos))
197
-
198
- # Move window with overlap
199
- pos += chunk_size - overlap
200
-
201
- # Avoid tiny final chunk
202
- if total_bytes - pos < overlap:
203
- break
204
-
205
- return {
206
- 'chunks': chunks,
207
- 'positions': positions,
208
- 'total_bytes': total_bytes,
209
- 'num_chunks': len(chunks)
210
- }
211
-
212
- def compress_text(text: str,
213
- show_details: bool = True) -> Tuple[str, Dict]:
214
- """
215
- Compress text using B2NL-IntelligentTokenizer
216
-
217
- The model achieves a fixed 16:1 compression ratio by encoding
218
- each 48-byte chunk into exactly 3 semantic tokens.
219
-
220
- Returns:
221
- (status_message, statistics_dict)
222
- """
223
- if not model:
224
- return "❌ Model not loaded! Please load the model first.", {}
225
-
226
- if not text:
227
- return "⚠️ Please enter text to compress.", {}
228
-
229
- try:
230
- # Process with sliding window
231
- window_result = process_with_sliding_window(text)
232
- chunks = window_result['chunks']
233
- total_bytes = window_result['total_bytes']
234
-
235
- # Compress each chunk
236
- all_embeddings = []
237
- chunk_details = []
238
-
239
- for i, chunk in enumerate(chunks):
240
- with torch.no_grad():
241
- # Encode chunk
242
- encoded = tokenizer.encode(chunk)
243
- if isinstance(encoded, dict):
244
- input_ids = encoded['input_ids'].unsqueeze(0).to(device)
245
- attention_mask = encoded['attention_mask'].unsqueeze(0).to(device)
246
- else:
247
- input_ids = encoded.unsqueeze(0).to(device)
248
- attention_mask = torch.ones_like(input_ids).to(device)
249
 
250
- # Get encoder output
251
- encoder_output = model.encoder(
252
- input_ids=input_ids,
253
- attention_mask=attention_mask
254
- )
255
 
256
- # Extract compressed embeddings
257
- compressed = encoder_output.get('compressed')
 
258
 
259
- # Get actual token count
260
- if 'num_tokens' in encoder_output:
261
- num_tokens = round(encoder_output['num_tokens'])
262
- elif compressed is not None:
263
- num_tokens = compressed.shape[1]
264
- else:
265
- num_tokens = 3 # Default for 16:1 ratio
266
-
267
- if compressed is not None:
268
- all_embeddings.append(compressed)
269
- chunk_details.append({
270
- 'chunk_id': i + 1,
271
- 'text': chunk[:30] + '...' if len(chunk) > 30 else chunk,
272
- 'bytes': len(chunk.encode('utf-8')),
273
- 'tokens': num_tokens
274
- })
275
-
276
- # Calculate statistics
277
- total_tokens = sum(detail['tokens'] for detail in chunk_details)
278
- compression_ratio = total_bytes / max(1, total_tokens)
279
-
280
- stats = {
281
- 'total_bytes': total_bytes,
282
- 'total_tokens': total_tokens,
283
- 'num_chunks': len(chunks),
284
- 'compression_ratio': f"{compression_ratio:.1f}:1",
285
- 'avg_tokens_per_chunk': total_tokens / max(1, len(chunks))
286
- }
287
-
288
- # Build detailed message
289
- if show_details:
290
- details = f"✅ **Compression Complete!**\n\n"
291
- details += f"📊 **Input Statistics:**\n"
292
- details += f"- Total bytes: {total_bytes}\n"
293
- details += f"- Number of chunks: {len(chunks)}\n\n"
294
- details += f"🗜️ **Compression Results:**\n"
295
- details += f"- Total tokens generated: {total_tokens}\n"
296
- details += f"- **Compression ratio: {compression_ratio:.1f}:1**\n"
297
- details += f"- Average tokens per chunk: {stats['avg_tokens_per_chunk']:.1f}\n\n"
298
-
299
- if len(chunk_details) <= 5:
300
- details += "📝 **Chunk Details:**\n"
301
- for detail in chunk_details:
302
- details += f" • Chunk {detail['chunk_id']}: {detail['bytes']} bytes → {detail['tokens']} tokens\n"
303
-
304
- details += f"\n💡 **Note:** Fixed 16:1 compression means each 48-byte chunk "
305
- details += f"is compressed to exactly 3 tokens, preserving semantic meaning."
306
-
307
- return details, stats
308
  else:
309
- return f"Compressed: {total_bytes} bytes → {total_tokens} tokens ({compression_ratio:.1f}:1)", stats
310
-
311
- except Exception as e:
312
- return f"❌ Error during compression: {str(e)}", {}
313
-
314
- def reconstruct_text(text: str,
315
- temperature: float = 0.1,
316
- top_k: int = 10,
317
- streaming: bool = True) -> str:
318
- """
319
- Reconstruct text from compressed representation using autoregressive generation
320
-
321
- This function compresses the input text and then reconstructs it using
322
- the decoder in autoregressive mode. We use low temperature and Top-K
323
- sampling for maximum reconstruction accuracy.
324
 
325
- Args:
326
- text: Original text to compress and reconstruct
327
- temperature: Generation temperature (0.1 = very deterministic)
328
- top_k: Number of top tokens to sample from (10 = highly constrained)
329
- streaming: Whether to simulate streaming output
330
 
331
- Returns:
332
- Detailed reconstruction results with accuracy metrics
333
- """
334
- if not model:
335
- return "❌ Model not loaded! Please load the model first."
336
 
337
- if not text:
338
- return "⚠️ Please enter text to reconstruct."
339
 
340
- try:
341
- # Process with sliding window
342
- window_result = process_with_sliding_window(text)
343
- chunks = window_result['chunks']
 
344
 
345
- reconstructed_chunks = []
346
-
347
- for chunk in chunks:
348
  with torch.no_grad():
349
- # Encode chunk
350
- encoded = tokenizer.encode(chunk)
351
- if isinstance(encoded, dict):
352
- input_ids = encoded['input_ids'].unsqueeze(0).to(device)
353
- attention_mask = encoded['attention_mask'].unsqueeze(0).to(device)
354
- else:
355
- input_ids = encoded.unsqueeze(0).to(device)
356
- attention_mask = torch.ones_like(input_ids).to(device)
357
 
358
- # Get encoder outputs
359
- encoder_outputs = model.encoder(
360
- input_ids=input_ids,
361
- attention_mask=attention_mask
362
- )
363
 
364
- # Generate using autoregressive decoding
365
- generated_ids = autoregressive_generate(encoder_outputs, max_length=48)
 
 
366
 
367
- # Decode to text
368
- reconstructed = tokenizer.decode(generated_ids[0])
 
 
 
 
 
369
 
370
- # Trim to original chunk length
371
- chunk_len = len(chunk.encode('utf-8'))
372
- reconstructed = reconstructed[:chunk_len]
373
 
374
- reconstructed_chunks.append(reconstructed)
 
375
 
376
- if streaming:
377
- time.sleep(0.05) # Simulate streaming
378
 
379
- # Combine chunks (with overlap handling)
380
- if len(reconstructed_chunks) == 1:
381
- full_reconstruction = reconstructed_chunks[0]
382
- else:
383
- # First chunk in full
384
- full_reconstruction = reconstructed_chunks[0]
385
- # Subsequent chunks: skip overlap bytes
386
- for i in range(1, len(reconstructed_chunks)):
387
- chunk_text = reconstructed_chunks[i]
388
- # Skip approximately 8 bytes (overlap) - simplified
389
- if len(chunk_text) > 3:
390
- full_reconstruction += chunk_text[3:]
391
- else:
392
- full_reconstruction += chunk_text
393
-
394
- # Calculate accuracy using SequenceMatcher
395
- similarity = SequenceMatcher(None, text, full_reconstruction[:len(text)]).ratio()
396
-
397
- # Build result message
398
- result = f"🔄 **Reconstruction Complete!**\n\n"
399
- result += f"📝 **Original Text:**\n{text[:200]}{'...' if len(text) > 200 else ''}\n\n"
400
- result += f"🎯 **Reconstructed Text:**\n{full_reconstruction[:200]}{'...' if len(full_reconstruction) > 200 else ''}\n\n"
401
- result += f"📊 **Reconstruction Statistics:**\n"
402
- result += f"- **Accuracy: {similarity:.1%}**\n"
403
- result += f"- Original bytes: {len(text.encode('utf-8'))}\n"
404
- result += f"- Reconstructed bytes: {len(full_reconstruction.encode('utf-8'))}\n"
405
- result += f"- Chunks processed: {len(chunks)}\n\n"
406
-
407
- result += f"⚙️ **Generation Settings:**\n"
408
- result += f"- Temperature: {temperature} (Lower = More precise)\n"
409
- result += f"- Top-K: {top_k} (Lower = More deterministic)\n"
410
- result += f"- Method: Autoregressive decoding\n\n"
411
-
412
- if similarity >= 0.95:
413
- result += "✨ **Excellent reconstruction!** Near-perfect accuracy achieved."
414
- elif similarity >= 0.85:
415
- result += "✅ **Good reconstruction!** High accuracy with minor differences."
416
- elif similarity >= 0.70:
417
- result += "⚠️ **Moderate reconstruction.** Some semantic meaning preserved."
418
- else:
419
- result += "❌ **Poor reconstruction.** Consider retraining or adjusting parameters."
420
-
421
- return result
422
-
423
- except Exception as e:
424
- return f"❌ Error during reconstruction: {str(e)}"
425
-
426
- def compare_performance(text: str) -> str:
427
- """
428
- Compare B2NL tokenizer with traditional tokenizers
429
-
430
- Shows how our 16:1 fixed compression compares to BPE and SentencePiece
431
- in terms of token efficiency and potential cost savings.
432
- """
433
- if not text:
434
- return "⚠️ Please enter text for comparison."
435
-
436
- try:
437
- text_bytes = len(text.encode('utf-8'))
438
-
439
- # Traditional tokenizer estimates (empirical averages)
440
- # BPE (GPT-2/3): ~4 bytes per token
441
- # SentencePiece: ~4.5 bytes per token
442
- # WordPiece (BERT): ~3.5 bytes per token
443
- bpe_tokens = text_bytes // 4
444
- sentencepiece_tokens = text_bytes // 4.5
445
- wordpiece_tokens = text_bytes // 3.5
446
-
447
- # Our compression
448
- _, stats = compress_text(text, show_details=False)
449
- our_tokens = stats.get('total_tokens', 0)
450
-
451
- # Calculate improvements
452
- if our_tokens > 0:
453
- vs_bpe = bpe_tokens / our_tokens
454
- vs_sp = sentencepiece_tokens / our_tokens
455
- vs_wp = wordpiece_tokens / our_tokens
456
-
457
- savings_bpe = (1 - our_tokens/bpe_tokens) * 100
458
- savings_sp = (1 - our_tokens/sentencepiece_tokens) * 100
459
- savings_wp = (1 - our_tokens/wordpiece_tokens) * 100
460
- else:
461
- vs_bpe = vs_sp = vs_wp = 0
462
- savings_bpe = savings_sp = savings_wp = 0
463
-
464
- comparison = "## 📊 Tokenizer Comparison\n\n"
465
-
466
- # Table format
467
- comparison += "| Tokenizer | Tokens | Compression | Savings |\n"
468
- comparison += "|-----------|--------|-------------|----------|\n"
469
- comparison += f"| BPE (GPT-2/3) | {bpe_tokens} | Baseline | - |\n"
470
- comparison += f"| SentencePiece | {int(sentencepiece_tokens)} | {bpe_tokens/max(1,sentencepiece_tokens):.1f}x | {int(savings_sp-savings_bpe)}% |\n"
471
- comparison += f"| WordPiece (BERT) | {int(wordpiece_tokens)} | {bpe_tokens/max(1,wordpiece_tokens):.1f}x | {int(savings_wp-savings_bpe)}% |\n"
472
- comparison += f"| **B2NL v6.2.1** | **{our_tokens}** | **{vs_bpe:.1f}x** | **{int(savings_bpe)}%** |\n\n"
473
-
474
- # Summary
475
- comparison += f"### 🚀 Key Achievements:\n"
476
- comparison += f"- **{vs_bpe:.1f}x** more efficient than BPE tokenization\n"
477
- comparison += f"- **{int(savings_bpe)}%** reduction in token count\n"
478
- comparison += f"- Fixed 16:1 compression ratio (predictable costs)\n"
479
- comparison += f"- Semantic preservation across 204 languages\n\n"
480
-
481
- # Cost implications
482
- comparison += f"### 💰 Cost Implications:\n"
483
- comparison += f"For LLM APIs charging per token:\n"
484
- comparison += f"- Traditional: ${bpe_tokens * 0.002:.2f} (at $0.002/1K tokens)\n"
485
- comparison += f"- B2NL: ${our_tokens * 0.002:.2f}\n"
486
- comparison += f"- **Savings: ${(bpe_tokens - our_tokens) * 0.002:.2f} ({int(savings_bpe)}%)**\n\n"
487
-
488
- comparison += "📌 **Note:** B2NL serves as a preprocessing layer, converting text to "
489
- comparison += "compressed embeddings before feeding to inference models."
490
-
491
- return comparison
492
-
493
- except Exception as e:
494
- return f"❌ Error during comparison: {str(e)}"
495
-
496
- # Create Gradio interface
497
- def create_demo():
498
- """Create the interactive Gradio demo interface"""
499
-
500
- with gr.Blocks(title="B2NL-IntelligentTokenizer v6.2.1", theme=gr.themes.Soft()) as demo:
501
- gr.Markdown("""
502
- # 🚀 B2NL-IntelligentTokenizer v6.2.1
503
- ### Progressive Byte-to-Natural Language Tokenizer with 16:1 Fixed Compression
504
 
505
- ---
 
506
 
507
- **🎯 Purpose:** This model serves as an **embedding preprocessing layer** for inter-modal
508
- communication, converting raw text into compressed semantic representations that can be
509
- efficiently processed by downstream AI models.
510
 
511
- **🌐 Training:** Trained on the FLORES-200 dataset covering 204 languages with 100 epochs
512
- of progressive splitting optimization.
 
 
513
 
514
- **⚡ Innovation:** Achieves fixed 16:1 compression ratio (3 tokens per 48-byte chunk) while
515
- maintaining semantic integrity through Gumbel-Softmax differentiable token selection.
516
- """)
517
 
 
518
  with gr.Row():
519
- with gr.Column(scale=1):
520
- gr.Markdown("""
521
- ### 📊 Model Specifications
522
- - **Architecture:** 4L Encoder + 6L Decoder
523
- - **Parameters:** 230.3M
524
- - **Compression:** 16:1 fixed ratio
525
- - **Chunk Size:** 48 bytes (46 + BOS/EOS)
526
- - **Output:** 3 tokens per chunk
527
- - **Languages:** 204 (FLORES-200)
528
- """)
529
- with gr.Column(scale=1):
530
- gr.Markdown("""
531
- ### 🎯 Key Features
532
- - ✅ Fixed compression ratio (predictable)
533
- - ✅ Sliding window for long texts
534
- - ✅ Autoregressive reconstruction
535
- - ✅ Multi-language semantic preservation
536
- - ✅ Streaming processing support
537
- - ✅ 80%+ reconstruction accuracy
538
- """)
539
-
540
- # Load model section
541
- with gr.Row():
542
- checkpoint_path = gr.Textbox(
543
- label="📁 Checkpoint Path",
544
- placeholder="Path to epoch_100.pt checkpoint...",
545
- value="D:/intelligent-tokenizer/intelligent-tokenizer_v6.2.1/checkpoints/v62/16.0/epoch_100.pt"
546
- )
547
- load_btn = gr.Button("🔧 Load Model", variant="primary", scale=0)
548
- status = gr.Textbox(label="Status", value="⏳ Model not loaded", scale=0)
549
-
550
- # Main tabs
551
- with gr.Tabs():
552
- with gr.TabItem("🗜️ Compression Analysis"):
553
- gr.Markdown("### Analyze text compression with detailed statistics")
554
- with gr.Row():
555
- with gr.Column():
556
- input_text = gr.Textbox(
557
- label="Input Text",
558
- placeholder="Enter any text in any of 204 supported languages...",
559
- lines=10
560
- )
561
- compress_btn = gr.Button("🗜️ Compress", variant="primary")
562
-
563
- with gr.Column():
564
- compression_output = gr.Textbox(
565
- label="Compression Results",
566
- lines=10
567
- )
568
- compression_stats = gr.JSON(label="Detailed Statistics")
569
-
570
- with gr.TabItem("🔄 Reconstruction Test"):
571
- gr.Markdown("### Test compression and reconstruction accuracy")
572
- with gr.Row():
573
- with gr.Column():
574
- recon_input = gr.Textbox(
575
- label="Text to Reconstruct",
576
- placeholder="Enter text to compress and reconstruct...",
577
- lines=8
578
- )
579
- with gr.Row():
580
- temperature = gr.Slider(
581
- minimum=0.01, maximum=1.0, value=0.1, step=0.01,
582
- label="Temperature (0.1 = Precise)"
583
- )
584
- top_k = gr.Slider(
585
- minimum=1, maximum=50, value=10, step=1,
586
- label="Top-K (10 = Deterministic)"
587
- )
588
- reconstruct_btn = gr.Button("🔄 Reconstruct", variant="primary")
589
-
590
- with gr.Column():
591
- reconstruction_output = gr.Textbox(
592
- label="Reconstruction Results",
593
- lines=15
594
- )
595
-
596
- with gr.TabItem("📊 Tokenizer Comparison"):
597
- gr.Markdown("### Compare with traditional tokenizers (BPE, SentencePiece)")
598
- with gr.Row():
599
- with gr.Column():
600
- compare_input = gr.Textbox(
601
- label="Text for Comparison",
602
- placeholder="Enter text to compare tokenization efficiency...",
603
- lines=8
604
- )
605
- compare_btn = gr.Button("📊 Compare", variant="primary")
606
-
607
- with gr.Column():
608
- comparison_output = gr.Markdown()
609
-
610
- with gr.TabItem("📝 Example Tests"):
611
- gr.Markdown("### Pre-configured test examples in various languages")
612
  gr.Examples(
613
  examples=[
614
- ["The quick brown fox jumps over the lazy dog."],
615
- ["안녕하세요. 오늘 날씨가 정말 좋네요!"],
616
- ["今天天气很好,适合出去散步。"],
617
- ["Bonjour le monde! Comment allez-vous aujourd'hui?"],
618
- ["مرحبا بالعالم! كيف حالك اليوم؟"],
619
- ["こんにちは世界!今日はいい天気ですね。"],
620
- ["Привет мир! Как дела сегодня?"],
621
- ["Multi-language: Hello 안녕하세요 你好 こんにちは"]
622
  ],
623
- inputs=[input_text]
 
624
  )
625
 
626
- with gr.TabItem("📚 Documentation"):
627
- gr.Markdown("""
628
- ### Technical Details
629
-
630
- **Model Architecture:**
631
- - **Encoder:** 4-layer transformer with progressive splitting mechanism
632
- - **Decoder:** 6-layer transformer with multi-level cross-attention
633
- - **Token Selection:** Gumbel-Softmax with temperature annealing
634
- - **Attention:** Multi-Query Attention (MQA) with 8x KV cache reduction
635
-
636
- **Training Details:**
637
- - **Dataset:** FLORES-200 (204 languages)
638
- - **Epochs:** 100
639
- - **Batch Size:** 128
640
- - **Learning Rate:** 3e-5 with cosine annealing
641
- - **Loss:** Weighted combination of reconstruction, compression, and boundary losses
642
-
643
- **Compression Mechanism:**
644
- - Input text is split into 48-byte chunks (46 content + 2 special tokens)
645
- - Each chunk is compressed to exactly 3 semantic tokens
646
- - Achieves fixed 16:1 compression ratio
647
- - Uses sliding window with 8-byte overlap for long texts
648
-
649
- **Use Cases:**
650
- 1. **LLM Cost Reduction:** Reduce token counts by ~75%
651
- 2. **Cross-modal Communication:** Universal embedding layer
652
- 3. **Multilingual Processing:** Unified representation for 204 languages
653
- 4. **Bandwidth Optimization:** Compress text for transmission
654
-
655
- **Limitations:**
656
- - Mixed language text may have lower reconstruction accuracy
657
- - Optimized for semantic preservation, not exact character matching
658
- - Requires GPU for optimal performance
659
-
660
- **Citation:**
661
- ```
662
- @model{b2nl2024,
663
- title={B2NL-IntelligentTokenizer: Progressive Byte-to-Natural Language Tokenization},
664
- author={ggunio},
665
- year={2024},
666
- version={6.2.1},
667
- url={https://huggingface.co/ggunio/B2NL-IntelligentTokenizer}
668
- }
669
- ```
670
- """)
671
-
672
- # Event handlers
673
- def load_model_handler(path):
674
- try:
675
- if not path:
676
- return "⚠️ Please provide a checkpoint path"
677
- load_model(path)
678
- return "✅ Model loaded successfully! Ready for inference."
679
- except Exception as e:
680
- return f"❌ Error loading model: {str(e)}"
681
-
682
- load_btn.click(
683
- load_model_handler,
684
- inputs=[checkpoint_path],
685
- outputs=[status]
686
- )
687
 
688
- compress_btn.click(
689
- compress_text,
690
- inputs=[input_text],
691
- outputs=[compression_output, compression_stats]
692
- )
693
 
694
- reconstruct_btn.click(
695
- reconstruct_text,
696
- inputs=[recon_input, temperature, top_k],
697
- outputs=[reconstruction_output]
698
- )
699
 
700
- compare_btn.click(
701
- compare_performance,
702
- inputs=[compare_input],
703
- outputs=[comparison_output]
704
  )
705
 
706
- # Auto-load model on startup
707
- demo.load(
708
- lambda: " Ready to load model. Click 'Load Model' to begin.",
709
- outputs=[status]
710
  )
711
 
712
- return demo
713
 
714
  if __name__ == "__main__":
715
- # Create and launch demo
716
- demo = create_demo()
717
-
718
- print("="*60)
719
- print("B2NL-IntelligentTokenizer v6.2.1 - Gradio Demo")
720
- print("="*60)
721
- print("Launching interactive demo...")
722
- print("Share link will be generated for public access")
723
- print("="*60)
724
-
725
- demo.launch(
726
- server_name="0.0.0.0",
727
- server_port=7860,
728
- share=True, # Create public link
729
- debug=False # Set to True for debugging
730
- )
 
1
  """
2
+ B2NL-IntelligentTokenizer v6.2.1 - Simple Demo
3
  """
4
 
5
  import gradio as gr
6
  import torch
 
 
7
  import sys
8
  import io
 
9
  import time
10
+ from pathlib import Path
 
11
 
12
+ # Fix Windows Unicode
13
  if sys.platform == 'win32':
14
  sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
15
  sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
16
 
17
+ # Add paths
18
+ sys.path.insert(0, 'core')
19
+
20
+ from unified_model import IntelligentTokenizerV62
21
+ from tokenizer import ByteTokenizerV62
22
+
23
+ class B2NLTokenizer:
24
+ def __init__(self):
25
+ self.model = None
26
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
27
+ self.load_model()
28
+
29
+ def load_model(self):
30
+ """Load model from HuggingFace or local"""
31
+ try:
32
+ # Try HuggingFace first
33
+ from huggingface_hub import hf_hub_download
34
+ checkpoint_path = hf_hub_download(
35
+ repo_id="ggunio/B2NL-IntelligentTokenizer-v6.2.1",
36
+ filename="pytorch_model.bin"
37
  )
38
+ print(f"Loading from HuggingFace")
39
+ except Exception:  # HF Hub unavailable or download failed; fall back to local checkpoints
40
+ # Try local paths
41
+ checkpoint_paths = [
42
+ "pytorch_model.bin",
43
+ "checkpoints/v62/16.0/epoch_100.pt",
44
+ "D:/intelligent-tokenizer/intelligent-tokenizer_v6.2.1/checkpoints/v62/16.0/epoch_100.pt"
45
+ ]
46
+ checkpoint_path = None
47
+ for path in checkpoint_paths:
48
+ if Path(path).exists():
49
+ checkpoint_path = path
50
  break
51
 
52
+ if not checkpoint_path:
53
+ print("❌ Model not found")
54
+ return
 
 
55
 
56
+ # Load model
57
+ self.model = IntelligentTokenizerV62()
58
+ checkpoint = torch.load(checkpoint_path, map_location=self.device, weights_only=False)
59
 
60
+ if 'model_state_dict' in checkpoint:
61
+ self.model.load_state_dict(checkpoint['model_state_dict'])
62
  else:
63
+ self.model.load_state_dict(checkpoint)
64
 
65
+ self.model = self.model.to(self.device)
66
+ self.model.eval()
67
+ print(f"✅ Model loaded on {self.device}")
 
 
68
 
69
+ def process_text(self, text, temperature=0.1):
70
+ """Process text and return detailed results"""
71
+ if not self.model or not text:
72
+ return "Please enter text", "", ""
 
73
 
74
+ try:
75
+ start_time = time.time()
76
 
77
+ # Compress (get embedding info)
78
+ compressed = self.model.compress(text)
79
+ num_tokens = compressed['num_tokens']
80
+ text_bytes = len(text.encode('utf-8'))
81
+ compression_ratio = compressed['compression_ratio']
82
 
83
+ # Reconstruct
 
 
84
  with torch.no_grad():
85
+ reconstructed = self.model.generate(text, temperature=temperature)
 
 
 
 
 
 
 
86
 
87
+ elapsed_time = (time.time() - start_time) * 1000
 
 
 
 
88
 
89
+ # Calculate accuracy
90
+ min_len = min(len(text), len(reconstructed))
91
+ matches = sum(1 for i in range(min_len) if text[i] == reconstructed[i])
92
+ accuracy = (matches / len(text)) * 100 if text else 0
93
 
94
+ # Format results
95
+ stats = f"""📊 **Compression Statistics**
96
+ • Input: {text_bytes} bytes → {num_tokens} tokens
97
+ • Compression: {compression_ratio:.1f}:1 ({(1/compression_ratio)*100:.1f}% of original)
98
+ • Embeddings generated: {num_tokens}
99
+ • Processing time: {elapsed_time:.1f}ms
100
+ • Reconstruction accuracy: {accuracy:.1f}%"""
101
 
102
+ details = f"""🔤 **Original Text** ({len(text)} chars, {text_bytes} bytes):
103
+ {text}
 
104
 
105
+ 🔄 **Reconstructed Text** ({len(reconstructed)} chars):
106
+ {reconstructed}
107
 
108
+ **Match Rate**: {accuracy:.1f}% ({matches}/{len(text)} characters)"""
 
109
 
110
+ return stats, details, reconstructed
111
 
112
+ except Exception as e:
113
+ return f"Error: {str(e)}", "", ""
114
 
115
+ # Initialize
116
+ tokenizer = B2NLTokenizer()
 
117
 
118
+ # Gradio Interface
119
+ with gr.Blocks(title="B2NL v6.2.1", theme=gr.themes.Soft()) as app:
120
+ gr.Markdown("""
121
+ # 🚀 B2NL-IntelligentTokenizer v6.2.1
122
 
123
+ **Fixed 16:1 compression** | **204 languages** | **Autoregressive mode** (~500ms)
124
+ """)
 
125
 
126
+ with gr.Tab("🔄 Reconstruction Test"):
127
  with gr.Row():
128
+ with gr.Column():
129
+ input_text = gr.Textbox(
130
+ label="Input Text",
131
+ placeholder="Enter any text in any of 204 languages...",
132
+ lines=5
133
+ )
134
+
135
  gr.Examples(
136
  examples=[
137
+ # Major languages
138
+ "Hello, world! How are you today?",
139
+ "안녕하세요, 반갑습니다. 오늘 날씨가 좋네요.",
140
+ "你好世界!今天天气很好。",
141
+ "こんにちは世界!今日はいい天気ですね。",
142
+ "Bonjour le monde! Comment allez-vous?",
143
+ "Hola mundo! ¿Cómo estás hoy?",
144
+ "Привет мир! Как дела?",
145
+ "مرحبا بالعالم! كيف حالك اليوم؟",
146
+ "Olá mundo! Como você está?",
147
+ "Hallo Welt! Wie geht es dir?",
148
+ # More diverse languages
149
+ "नमस्ते दुनिया! आप कैसे हैं?", # Hindi
150
+ "হ্যালো বিশ্ব! আপনি কেমন আছেন?", # Bengali
151
+ "สวัสดีชาวโลก! คุณเป็นอย่างไรบ้าง?", # Thai
152
+ "Xin chào thế giới! Bạn khỏe không?", # Vietnamese
153
+ "Kamusta mundo! Kumusta ka?", # Filipino
154
+ "Jambo dunia! Habari yako?", # Swahili
155
+ "Γεια σου κόσμε! Πώς είσαι;", # Greek
156
+ "שלום עולם! מה שלומך?", # Hebrew
157
+ "Selam dünya! Nasılsın?", # Turkish
158
+ "Salam dünya! Necəsən?", # Azerbaijani
159
  ],
160
+ inputs=input_text,
161
+ label="Example texts (204 languages supported)"
162
  )
163
 
164
+ temperature = gr.Slider(
165
+ minimum=0.1,
166
+ maximum=1.0,
167
+ value=0.1,
168
+ step=0.1,
169
+ label="Temperature (0.1 = Most accurate)"
170
+ )
171
 
172
+ process_btn = gr.Button("🔄 Compress & Reconstruct", variant="primary", size="lg")
 
 
 
 
173
 
174
+ with gr.Column():
175
+ stats_output = gr.Markdown(label="Statistics")
176
+ details_output = gr.Markdown(label="Details")
 
 
177
 
178
+ with gr.Tab("📊 Batch Test"):
179
+ gr.Markdown("""
180
+ Test multiple texts at once to compare compression rates across languages.
181
+ """)
182
+
183
+ batch_input = gr.Textbox(
184
+ label="Enter multiple texts (one per line)",
185
+ placeholder="Enter texts in different languages...\nOne text per line",
186
+ lines=10,
187
+ value="""Hello, world!
188
+ 안녕하세요, 반갑습니다.
189
+ 你好世界!
190
+ こんにちは世界!
191
+ Bonjour le monde!"""
192
  )
193
 
194
+ batch_btn = gr.Button("🔄 Process Batch", variant="primary")
195
+ batch_output = gr.Dataframe(
196
+ headers=["Text", "Language", "Bytes", "Tokens", "Compression", "Accuracy"],
197
+ label="Batch Results"
198
  )
199
 
200
+ # Connect functions
201
+ process_btn.click(
202
+ fn=lambda text, temp: tokenizer.process_text(text, temp)[:2],  # only two outputs (stats, details) are wired up
203
+ inputs=[input_text, temperature],
204
+ outputs=[stats_output, details_output]
205
+ )
206
+
207
+ def process_batch(texts):
208
+ if not texts:
209
+ return []
210
+
211
+ results = []
212
+ for text in texts.strip().split('\n'):
213
+ if not text.strip():
214
+ continue
215
+
216
+ stats, details, reconstructed = tokenizer.process_text(text.strip(), 0.1)
217
+
218
+ # Parse stats for table
219
+ if "Error" not in stats:
220
+ # Detect language (simple heuristic)
221
+ if any(ord(c) >= 0x3040 and ord(c) <= 0x309F for c in text):
222
+ lang = "Japanese"
223
+ elif any(ord(c) >= 0xAC00 and ord(c) <= 0xD7AF for c in text):
224
+ lang = "Korean"
225
+ elif any(ord(c) >= 0x4E00 and ord(c) <= 0x9FFF for c in text):
226
+ lang = "Chinese"
227
+ elif any(ord(c) >= 0x0600 and ord(c) <= 0x06FF for c in text):
228
+ lang = "Arabic"
229
+ elif any(ord(c) >= 0x0400 and ord(c) <= 0x04FF for c in text):
230
+ lang = "Russian"
231
+ else:
232
+ lang = "English/Latin"
233
+
234
+ text_bytes = len(text.encode('utf-8'))
235
+ compressed = tokenizer.model.compress(text)
236
+ num_tokens = compressed['num_tokens']
237
+ compression_ratio = compressed['compression_ratio']
238
+
239
+ # Calculate accuracy
240
+ min_len = min(len(text), len(reconstructed))
241
+ matches = sum(1 for i in range(min_len) if text[i] == reconstructed[i])
242
+ accuracy = (matches / len(text)) * 100 if text else 0
243
+
244
+ results.append([
245
+ text[:50] + "..." if len(text) > 50 else text,
246
+ lang,
247
+ text_bytes,
248
+ num_tokens,
249
+ f"{compression_ratio:.1f}:1",
250
+ f"{accuracy:.1f}%"
251
+ ])
252
+
253
+ return results
254
+
255
+ batch_btn.click(
256
+ fn=process_batch,
257
+ inputs=batch_input,
258
+ outputs=batch_output
259
+ )
260
+
261
+ gr.Markdown("""
262
+ ---
263
+ **Note**: This model uses autoregressive generation (teacher forcing training).
264
+ Non-autoregressive training planned for November 2025 will provide a ~10x speedup.
265
+
266
+ Model: [ggunio/B2NL-IntelligentTokenizer-v6.2.1](https://huggingface.co/ggunio/B2NL-IntelligentTokenizer-v6.2.1)
267
+ """)
268
 
269
  if __name__ == "__main__":
270
+ app.launch()
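
For reference, a minimal sketch of driving the simplified interface from a script instead of the browser (48 input bytes → 3 embeddings is the fixed 16:1 ratio the banner quotes). It only reuses names defined in the new `app.py` above (`B2NLTokenizer`, `process_text`, the module-level `tokenizer`); importing `app` constructs the Gradio Blocks and loads the checkpoint as a side effect, so treat this as an illustration of the call shape rather than a recommended integration path.

```python
# Sketch only: exercise this commit's simplified interface without launching the UI.
# Reuses the module-level `tokenizer` created in app.py; importing app builds the
# Gradio Blocks and loads the checkpoint (from HF Hub or a local path) as a side effect.
from app import tokenizer

stats, details, reconstructed = tokenizer.process_text(
    "Hello, world! How are you today?",  # any of the 204 supported languages
    temperature=0.1,                     # 0.1 = most deterministic reconstruction
)
print(stats)           # bytes -> tokens, compression ratio, timing, accuracy
print(reconstructed)   # text decoded back from the compressed embeddings
```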