ggunio committed
Commit 0250815 · 1 Parent(s): 83b0066

Fix UTF-8 safe chunking, token boundary visualization, and embedding display


- Implemented UTF-8 safe text splitting to preserve character boundaries (see the usage sketch after this list)
- Show actual model-learned token boundaries instead of chunk boundaries
- Fixed embedding extraction using encoder_hidden_states key
- Added language list (English, Korean, Chinese, Japanese, Arabic, Spanish)
- Enhanced embedding statistics display
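
Below, a minimal sketch of what the UTF-8-safe splitting guarantees. The `utf8_safe_split` helper is reproduced from the diff that follows; the sample text and assertions are illustrative only:

```python
from typing import List

# utf8_safe_split as added in this commit (reproduced from the diff below).
def utf8_safe_split(text: str, chunk_size: int = 62) -> List[str]:
    """Split text into chunks safely at UTF-8 character boundaries."""
    chunks = []
    current = ""
    current_bytes = 0

    for char in text:
        char_bytes = len(char.encode('utf-8'))
        if current_bytes + char_bytes > chunk_size:
            if current:  # Only append non-empty chunks
                chunks.append(current)
            current = char
            current_bytes = char_bytes
        else:
            current += char
            current_bytes += char_bytes

    if current:
        chunks.append(current)

    return chunks

# Korean syllables take 3 bytes in UTF-8, so slicing raw bytes at a fixed
# offset can cut a character in half; these chunks always decode cleanly.
for chunk in utf8_safe_split("안녕하세요, 세계! " * 10, chunk_size=62):
    assert len(chunk.encode('utf-8')) <= 62
    chunk.encode('utf-8').decode('utf-8')  # never raises UnicodeDecodeError
```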

Files changed (1)
  1. app.py +88 -40
app.py CHANGED
@@ -1,7 +1,7 @@
 """
 B2NL (Byte-to-Natural-Language) Tokenizer Demo
 Version 6.1.2 - 18.6:1 Compression with 100% Reconstruction
-Enhanced with chunking, streaming, group visualization, and embeddings
+Enhanced with UTF-8 safe chunking, token boundary visualization, and embeddings
 """
 
 import gradio as gr
@@ -11,7 +11,6 @@ from pathlib import Path
 import sys
 import time
 from typing import List, Tuple, Dict, Generator
-# Removed matplotlib imports - using text display instead
 
 # Import from local core directory
 from core.unified_model import IntelligentTokenizerModelV61
@@ -65,7 +64,7 @@ def load_model(checkpoint_path=None):
     return model, tokenizer
 
 def visualize_groups(byte_seq: List[int], boundaries: torch.Tensor) -> str:
-    """Visualize how bytes are grouped for compression"""
+    """Visualize how bytes are grouped for compression based on model boundaries"""
     if boundaries is None:
         return "No boundary information available"
 
@@ -108,13 +107,17 @@ def visualize_groups(byte_seq: List[int], boundaries: torch.Tensor) -> str:
     return ' '.join(groups)
 
 def format_embeddings(embeddings: torch.Tensor) -> str:
-    """Format embeddings as text"""
+    """Format embeddings as text with statistics"""
     if embeddings is None:
         return "No embeddings available"
 
-    # Take first 20 dimensions for display
+    # Handle different tensor shapes
     if embeddings.dim() > 1:
-        embed_values = embeddings[0, :20].cpu().numpy()
+        # If multiple dimensions, flatten or take first
+        if embeddings.shape[0] > 20:
+            embed_values = embeddings[:20].cpu().numpy()
+        else:
+            embed_values = embeddings.flatten()[:20].cpu().numpy()
     else:
         embed_values = embeddings[:20].cpu().numpy()
 
@@ -134,8 +137,30 @@ def format_embeddings(embeddings: torch.Tensor) -> str:
 
     return result
 
+def utf8_safe_split(text: str, chunk_size: int = 62) -> List[str]:
+    """Split text into chunks safely at UTF-8 character boundaries"""
+    chunks = []
+    current = ""
+    current_bytes = 0
+
+    for char in text:
+        char_bytes = len(char.encode('utf-8'))
+        if current_bytes + char_bytes > chunk_size:
+            if current:  # Only append non-empty chunks
+                chunks.append(current)
+            current = char
+            current_bytes = char_bytes
+        else:
+            current += char
+            current_bytes += char_bytes
+
+    if current:
+        chunks.append(current)
+
+    return chunks
+
 def process_chunk(text_chunk: str, chunk_idx: int) -> Dict:
-    """Process a single chunk of text"""
+    """Process a single chunk of text and extract token boundaries"""
     model, tokenizer = load_model()
 
     # Encode to bytes
@@ -169,18 +194,36 @@ def process_chunk(text_chunk: str, chunk_idx: int) -> Dict:
         use_cross_attention=True  # Enable cross-attention for better reconstruction
     )
 
-    # Extract groups for visualization
+    # Extract groups for visualization - check all boundary types
     groups_visual = "No groups"
     num_tokens = 1
-    if 'eojeol_boundaries' in outputs:
-        groups_visual = visualize_groups(byte_seq, outputs['eojeol_boundaries'])
-        boundaries = torch.argmax(outputs['eojeol_boundaries'], dim=-1)[0]
-        num_tokens = torch.sum(boundaries == 1).item() + 1
+    boundaries = None
+
+    # Check multiple boundary types in order of preference
+    for boundary_key in ['eojeol_boundaries', 'char_boundaries', 'phrase_boundaries']:
+        if boundary_key in outputs:
+            boundaries = outputs[boundary_key]
+            groups_visual = visualize_groups(byte_seq, boundaries)
+            boundary_binary = torch.argmax(boundaries, dim=-1)[0]
+            num_tokens = torch.sum(boundary_binary == 1).item() + 1
+            break
+
+    # If no boundaries found, show entire chunk as one token
+    if boundaries is None:
+        groups_visual = f"<{text_chunk}>"
+        num_tokens = 1
 
-    # Get embeddings
+    # Get embeddings - check correct key (encoder_hidden_states)
     embeddings = None
-    if 'encoder_hidden' in outputs:
-        embeddings = outputs['encoder_hidden'][0, 0]  # First token embedding
+    if 'encoder_hidden_states' in outputs:
+        encoder_states = outputs['encoder_hidden_states']
+        if encoder_states is not None:
+            if encoder_states.dim() >= 3:
+                embeddings = encoder_states[0, 0]  # First token embedding
+            elif encoder_states.dim() == 2:
+                embeddings = encoder_states[0]  # First row
+    elif 'pooled_output' in outputs:
+        embeddings = outputs['pooled_output'][0] if outputs['pooled_output'] is not None else None
 
     # Reconstruction
     reconstructed = ""
@@ -218,25 +261,21 @@ def process_chunk(text_chunk: str, chunk_idx: int) -> Dict:
         'embeddings': embeddings
     }
 
-def stream_process(text: str, chunk_size: int = 62, overlap: int = 8) -> Generator:
-    """Stream process text with sliding window"""
+def stream_process(text: str, chunk_size: int = 62, overlap: int = 0) -> Generator:
+    """Stream process text with UTF-8 safe chunking"""
     if not text:
         yield {"error": "Please enter text"}
         return
 
-    # Process in chunks
-    text_bytes = text.encode('utf-8')
-    step = chunk_size - overlap
-
-    for chunk_idx, i in enumerate(range(0, len(text_bytes), step)):
-        chunk_bytes = text_bytes[i:i+chunk_size]
+    # Process in UTF-8 safe chunks (no overlap for simplicity with UTF-8 boundaries)
+    chunks = utf8_safe_split(text, chunk_size)
 
+    for chunk_idx, chunk_text in enumerate(chunks):
         # Skip very small chunks
-        if len(chunk_bytes) < 10 and i > 0:
+        if len(chunk_text) < 3 and chunk_idx > 0:
            continue
 
        try:
-            chunk_text = chunk_bytes.decode('utf-8', errors='ignore')
            result = process_chunk(chunk_text, chunk_idx)
            yield result
        except Exception as e:
@@ -282,18 +321,28 @@ def process_text_full(text: str, show_embeddings: bool = False):
     - Chunks Processed: {len(all_results)}
     """
 
-    # Format groups visualization (show first 3 chunks)
-    groups_text = "**Compression Groups (< > shows token boundaries):**\n\n"
-    for i, result in enumerate(all_results[:3]):
-        groups_text += f"Chunk {i+1}: {result['groups']}\n\n"
+    # Format groups visualization showing actual token boundaries
+    groups_text = "**Token Boundaries (< > shows model-learned token groups):**\n\n"
+
+    # Show more chunks for shorter texts
+    max_chunks_to_show = min(len(all_results), 5)
 
-    if len(all_results) > 3:
-        groups_text += f"... and {len(all_results)-3} more chunks\n"
+    for i, result in enumerate(all_results[:max_chunks_to_show]):
+        groups_text += f"Chunk {i+1}: {result['groups']}\n"
+        if result['num_tokens'] > 1:
+            groups_text += f"  → {result['num_tokens']} tokens detected\n"
+        groups_text += "\n"
+
+    if len(all_results) > max_chunks_to_show:
+        groups_text += f"... and {len(all_results)-max_chunks_to_show} more chunks\n"
 
     # Format embeddings as text
     embed_text = ""
-    if show_embeddings and all_results and all_results[0]['embeddings'] is not None:
-        embed_text = format_embeddings(all_results[0]['embeddings'])
+    if show_embeddings:
+        if all_results and all_results[0]['embeddings'] is not None:
+            embed_text = format_embeddings(all_results[0]['embeddings'])
+        else:
+            embed_text = "**No embeddings available**\n(Model may not have encoder_hidden_states output)"
 
     return stats, full_reconstructed, groups_text, embed_text, overall_compression
 
@@ -350,9 +399,8 @@ with gr.Blocks(
     ### 18.6:1 Average Compression with 100% Reconstruction!
 
     Advanced features:
-    - **Chunked Processing**: Handles long texts with 64-byte chunks
-    - **Sliding Window**: 8-byte overlap for seamless boundaries
-    - **Group Visualization**: See how bytes are compressed into tokens
+    - **UTF-8 Safe Chunking**: Preserves character boundaries
+    - **Token Boundary Visualization**: Shows model-learned token groups
     - **Embedding Display**: Visualize learned representations
     - **Streaming Support**: Process text in real-time
     """)
@@ -434,7 +482,7 @@ with gr.Blocks(
     with gr.Tab("Streaming Demo"):
         gr.Markdown("""
         ### Real-time Streaming Processing
-        Watch as text is processed chunk by chunk with sliding window overlap.
+        Watch as text is processed chunk by chunk with UTF-8 safe splitting.
        """)
 
        stream_input = gr.Textbox(
@@ -490,9 +538,9 @@ with gr.Blocks(
     - **Version**: 6.1.2 (best_model.pt - Epoch 233)
     - **Architecture**: ByteEncoder + TransformerDecoder with Cross-Attention
     - **Chunk Size**: 64 bytes (62 content + BOS + EOS)
-    - **Sliding Window**: 8-byte overlap for continuity
+    - **UTF-8 Safe**: Preserves character boundaries
     - **Boundary Learning**: 3-level hierarchical (char, word, phrase)
-    - **Languages Tested**: 6 core languages
+    - **Languages Trained**: English, Korean, Chinese, Japanese, Arabic, Spanish
     - **Average Compression**: 18.6:1 (varies by language)
     - **Reconstruction**: 100% accuracy achieved
 
@@ -500,7 +548,7 @@ with gr.Blocks(
     - Pure byte-level tokenization (no vocabulary)
     - Learning-based compression without language rules
     - Cross-attention for sequence relationships
-    - Boundary detection for optimal grouping
+    - Model-learned token boundaries (not fixed chunks)
 
     ---
     *Note: v6.1.3 in training with 204 languages for universal coverage*
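
For readers skimming the diff, a minimal sketch of the boundary-to-token-count step introduced above; the logits tensor is made up for illustration, and only the argmax/sum logic mirrors the committed code:

```python
import torch

# Dummy boundary logits shaped (batch=1, seq_len=6, classes=2); class 1 marks
# a token boundary, as in the commit's argmax-based extraction.
boundary_logits = torch.tensor([[[2.0, 0.1], [0.3, 1.5], [2.0, 0.1],
                                 [2.0, 0.1], [0.2, 1.8], [2.0, 0.1]]])

boundary_binary = torch.argmax(boundary_logits, dim=-1)[0]  # tensor([0, 1, 0, 0, 1, 0])
num_tokens = torch.sum(boundary_binary == 1).item() + 1     # 2 boundaries -> 3 tokens
print(num_tokens)  # 3
```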