Fix UTF-8 safe chunking, token boundary visualization, and embedding display
- Implemented UTF-8 safe text splitting to preserve character boundaries (quick demo below)
- Show actual model-learned token boundaries instead of chunk boundaries
- Fixed embedding extraction using encoder_hidden_states key
- Added language list (English, Korean, Chinese, Japanese, Arabic, Spanish)
- Enhanced embedding statistics display
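A minimal, self-contained illustration of the failure mode the UTF-8 safe splitting fixes. The tiny `chunk_size=4` is purely for the demo (the app uses 62), and `utf8_safe_split` refers to the function added in the diff below:

```python
# Naive byte slicing can cut a multi-byte UTF-8 character in half.
text = "한국어"                        # each Hangul syllable is 3 bytes in UTF-8
raw = text.encode('utf-8')             # 9 bytes total
print(raw[:4].decode('utf-8', errors='ignore'))  # '한' - the partial '국' is silently dropped

# utf8_safe_split counts bytes per character and only cuts at character
# boundaries, so no character is ever lost:
#   utf8_safe_split("한국어", chunk_size=4)  ->  ['한', '국', '어']
```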
app.py
CHANGED
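One detail worth noting before the diff: the per-chunk token count is derived from the model's boundary logits by an argmax followed by counting the predicted boundary positions. A toy example with fabricated logits (shape and values are illustrative only):

```python
import torch

# Fabricated boundary logits, shape (batch=1, seq=6, classes=2);
# class 1 means "a token boundary ends here".
boundaries = torch.tensor([[[0.9, 0.1], [0.2, 0.8], [0.7, 0.3],
                            [0.1, 0.9], [0.6, 0.4], [0.8, 0.2]]])

boundary_binary = torch.argmax(boundaries, dim=-1)[0]    # tensor([0, 1, 0, 1, 0, 0])
num_tokens = torch.sum(boundary_binary == 1).item() + 1  # 2 boundaries -> 3 tokens
print(num_tokens)  # 3
```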
```diff
@@ -1,7 +1,7 @@
 """
 B2NL (Byte-to-Natural-Language) Tokenizer Demo
 Version 6.1.2 - 18.6:1 Compression with 100% Reconstruction
-Enhanced with chunking, …
+Enhanced with UTF-8 safe chunking, token boundary visualization, and embeddings
 """
 
 import gradio as gr
@@ -11,7 +11,6 @@ from pathlib import Path
 import sys
 import time
 from typing import List, Tuple, Dict, Generator
-# Removed matplotlib imports - using text display instead
 
 # Import from local core directory
 from core.unified_model import IntelligentTokenizerModelV61
@@ -65,7 +64,7 @@ def load_model(checkpoint_path=None):
     return model, tokenizer
 
 def visualize_groups(byte_seq: List[int], boundaries: torch.Tensor) -> str:
-    """Visualize how bytes are grouped for compression"""
+    """Visualize how bytes are grouped for compression based on model boundaries"""
     if boundaries is None:
         return "No boundary information available"
 
@@ -108,13 +107,17 @@ def visualize_groups(byte_seq: List[int], boundaries: torch.Tensor) -> str:
     return ' '.join(groups)
 
 def format_embeddings(embeddings: torch.Tensor) -> str:
-    """Format embeddings as text"""
+    """Format embeddings as text with statistics"""
     if embeddings is None:
         return "No embeddings available"
 
-    # …
+    # Handle different tensor shapes
     if embeddings.dim() > 1:
-        …
+        # If multiple dimensions, flatten or take first
+        if embeddings.shape[0] > 20:
+            embed_values = embeddings[:20].cpu().numpy()
+        else:
+            embed_values = embeddings.flatten()[:20].cpu().numpy()
     else:
         embed_values = embeddings[:20].cpu().numpy()
 
@@ -134,8 +137,30 @@ def format_embeddings(embeddings: torch.Tensor) -> str:
 
     return result
 
+def utf8_safe_split(text: str, chunk_size: int = 62) -> List[str]:
+    """Split text into chunks safely at UTF-8 character boundaries"""
+    chunks = []
+    current = ""
+    current_bytes = 0
+
+    for char in text:
+        char_bytes = len(char.encode('utf-8'))
+        if current_bytes + char_bytes > chunk_size:
+            if current:  # Only append non-empty chunks
+                chunks.append(current)
+            current = char
+            current_bytes = char_bytes
+        else:
+            current += char
+            current_bytes += char_bytes
+
+    if current:
+        chunks.append(current)
+
+    return chunks
+
 def process_chunk(text_chunk: str, chunk_idx: int) -> Dict:
-    """Process a single chunk of text"""
+    """Process a single chunk of text and extract token boundaries"""
     model, tokenizer = load_model()
 
     # Encode to bytes
@@ -169,18 +194,36 @@ def process_chunk(text_chunk: str, chunk_idx: int) -> Dict:
         use_cross_attention=True  # Enable cross-attention for better reconstruction
     )
 
-    # Extract groups for visualization
+    # Extract groups for visualization - check all boundary types
     groups_visual = "No groups"
     num_tokens = 1
-    …
+    boundaries = None
+
+    # Check multiple boundary types in order of preference
+    for boundary_key in ['eojeol_boundaries', 'char_boundaries', 'phrase_boundaries']:
+        if boundary_key in outputs:
+            boundaries = outputs[boundary_key]
+            groups_visual = visualize_groups(byte_seq, boundaries)
+            boundary_binary = torch.argmax(boundaries, dim=-1)[0]
+            num_tokens = torch.sum(boundary_binary == 1).item() + 1
+            break
+
+    # If no boundaries found, show entire chunk as one token
+    if boundaries is None:
+        groups_visual = f"<{text_chunk}>"
+        num_tokens = 1
 
-    # Get embeddings
+    # Get embeddings - check correct key (encoder_hidden_states)
     embeddings = None
-    if '…
+    if 'encoder_hidden_states' in outputs:
+        encoder_states = outputs['encoder_hidden_states']
+        if encoder_states is not None:
+            if encoder_states.dim() >= 3:
+                embeddings = encoder_states[0, 0]  # First token embedding
+            elif encoder_states.dim() == 2:
+                embeddings = encoder_states[0]  # First row
+    elif 'pooled_output' in outputs:
+        embeddings = outputs['pooled_output'][0] if outputs['pooled_output'] is not None else None
 
     # Reconstruction
     reconstructed = ""
@@ -218,25 +261,21 @@ def process_chunk(text_chunk: str, chunk_idx: int) -> Dict:
         'embeddings': embeddings
     }
 
-def stream_process(text: str, chunk_size: int = 62, overlap: int = …
-    """Stream process text with …
+def stream_process(text: str, chunk_size: int = 62, overlap: int = 0) -> Generator:
+    """Stream process text with UTF-8 safe chunking"""
     if not text:
         yield {"error": "Please enter text"}
         return
 
-    # Process in chunks
-    …
-    step = chunk_size - overlap
-    …
-    for chunk_idx, i in enumerate(range(0, len(text_bytes), step)):
-        chunk_bytes = text_bytes[i:i+chunk_size]
+    # Process in UTF-8 safe chunks (no overlap for simplicity with UTF-8 boundaries)
+    chunks = utf8_safe_split(text, chunk_size)
 
+    for chunk_idx, chunk_text in enumerate(chunks):
         # Skip very small chunks
-        if len(…
+        if len(chunk_text) < 3 and chunk_idx > 0:
             continue
 
         try:
-            chunk_text = chunk_bytes.decode('utf-8', errors='ignore')
             result = process_chunk(chunk_text, chunk_idx)
             yield result
         except Exception as e:
@@ -282,18 +321,28 @@ def process_text_full(text: str, show_embeddings: bool = False):
     - Chunks Processed: {len(all_results)}
     """
 
-    # Format groups visualization
-    groups_text = "**…
+    # Format groups visualization showing actual token boundaries
+    groups_text = "**Token Boundaries (< > shows model-learned token groups):**\n\n"
+
+    # Show more chunks for shorter texts
+    max_chunks_to_show = min(len(all_results), 5)
 
-    …
-    groups_text += f"…
+    for i, result in enumerate(all_results[:max_chunks_to_show]):
+        groups_text += f"Chunk {i+1}: {result['groups']}\n"
+        if result['num_tokens'] > 1:
+            groups_text += f"  → {result['num_tokens']} tokens detected\n"
+        groups_text += "\n"
+
+    if len(all_results) > max_chunks_to_show:
+        groups_text += f"... and {len(all_results)-max_chunks_to_show} more chunks\n"
 
     # Format embeddings as text
     embed_text = ""
-    if show_embeddings…
+    if show_embeddings:
+        if all_results and all_results[0]['embeddings'] is not None:
+            embed_text = format_embeddings(all_results[0]['embeddings'])
+        else:
+            embed_text = "**No embeddings available**\n(Model may not have encoder_hidden_states output)"
 
     return stats, full_reconstructed, groups_text, embed_text, overall_compression
 
@@ -350,9 +399,8 @@ with gr.Blocks(
     ### 18.6:1 Average Compression with 100% Reconstruction!
 
    Advanced features:
-    - **…
-    - **…
-    - **Group Visualization**: See how bytes are compressed into tokens
+    - **UTF-8 Safe Chunking**: Preserves character boundaries
+    - **Token Boundary Visualization**: Shows model-learned token groups
    - **Embedding Display**: Visualize learned representations
    - **Streaming Support**: Process text in real-time
    """)
@@ -434,7 +482,7 @@ with gr.Blocks(
    with gr.Tab("Streaming Demo"):
        gr.Markdown("""
        ### Real-time Streaming Processing
-        Watch as text is processed chunk by chunk with …
+        Watch as text is processed chunk by chunk with UTF-8 safe splitting.
        """)
 
        stream_input = gr.Textbox(
@@ -490,9 +538,9 @@ with gr.Blocks(
    - **Version**: 6.1.2 (best_model.pt - Epoch 233)
    - **Architecture**: ByteEncoder + TransformerDecoder with Cross-Attention
    - **Chunk Size**: 64 bytes (62 content + BOS + EOS)
-    - **…
+    - **UTF-8 Safe**: Preserves character boundaries
    - **Boundary Learning**: 3-level hierarchical (char, word, phrase)
-    - **Languages…
+    - **Languages Trained**: English, Korean, Chinese, Japanese, Arabic, Spanish
    - **Average Compression**: 18.6:1 (varies by language)
    - **Reconstruction**: 100% accuracy achieved
 
@@ -500,7 +548,7 @@ with gr.Blocks(
    - Pure byte-level tokenization (no vocabulary)
    - Learning-based compression without language rules
    - Cross-attention for sequence relationships
-    - …
+    - Model-learned token boundaries (not fixed chunks)
 
    ---
    *Note: v6.1.3 in training with 204 languages for universal coverage*
```
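As a sanity check on the embedding fix, the fallback order used in `process_chunk` (prefer `encoder_hidden_states`, else `pooled_output`) can be exercised in isolation. A sketch with dummy tensors; the shapes are illustrative, not the model's actual output dict:

```python
import torch

def extract_embedding(outputs: dict):
    """Mirrors the fallback order in process_chunk above."""
    if 'encoder_hidden_states' in outputs:
        states = outputs['encoder_hidden_states']
        if states is not None:
            if states.dim() >= 3:
                return states[0, 0]  # (batch, seq, hidden) -> first token of first item
            if states.dim() == 2:
                return states[0]     # (seq, hidden) -> first row
    elif 'pooled_output' in outputs and outputs['pooled_output'] is not None:
        return outputs['pooled_output'][0]
    return None

print(extract_embedding({'encoder_hidden_states': torch.randn(1, 8, 16)}).shape)  # torch.Size([16])
print(extract_embedding({'encoder_hidden_states': torch.randn(8, 16)}).shape)     # torch.Size([16])
print(extract_embedding({'pooled_output': torch.randn(1, 16)}).shape)             # torch.Size([16])
```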