Bor Hodošček committed
Commit ca0f322 · 1 Parent(s): 8bfa6b3

chore: update dockerfile and deps

Files changed (6):
  1. .dockerignore +1 -0
  2. Dockerfile +4 -5
  3. app.py +351 -121
  4. development.md +2 -2
  5. pyproject.toml +3 -3
  6. uv.lock +0 -0
.dockerignore ADDED
@@ -0,0 +1 @@
+ .venv
Dockerfile CHANGED
@@ -1,5 +1,4 @@
- FROM python:3.12-slim
- COPY --from=ghcr.io/astral-sh/uv:0.7.3 /uv /bin/uv
+ FROM ghcr.io/astral-sh/uv:0.9.5-python3.13-trixie-slim

  RUN useradd -m -u 1000 user
  ENV PATH="/home/user/.local/bin:$PATH"
@@ -7,14 +6,14 @@ ENV UV_SYSTEM_PYTHON=1

  WORKDIR /app

- RUN mkdir -p /app && chown -R user:user /app
+ RUN chown -R user:user /app

- COPY --chown=user ./pyproject.toml ./uv.lock ./pyproject.toml ./app.py /app
+ COPY --chown=user pyproject.toml uv.lock app.py /app

  RUN chmod -R u+w /app

  USER user

- RUN uv sync
+ RUN uv sync --locked

  CMD ["uv", "run", "marimo", "run", "app.py", "--no-sandbox", "--include-code", "--host", "0.0.0.0", "--port", "7860"]
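For reference, the image's locked install and CMD line map onto two plain uv invocations; a minimal local sketch (assuming uv is installed on the host and uv.lock is current), not part of the commit itself:

```bash
# Local equivalent of the image's RUN and CMD steps (sketch)
uv sync --locked   # installs exactly what uv.lock pins; errors out if the lockfile is stale
uv run marimo run app.py --no-sandbox --include-code --host 0.0.0.0 --port 7860
```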
app.py CHANGED
@@ -1,12 +1,12 @@
1
  # /// script
2
  # dependencies = [
3
  # "marimo>=0.13.0",
4
- # "polars==1.29.0",
5
- # "altair==5.5.0",
6
- # "spacy==3.8.5",
7
  # "en-core-web-md",
8
  # "ja-core-news-md",
9
- # "transformers==4.51.3",
10
  # ]
11
  #
12
  # [tool.uv.sources]
@@ -18,7 +18,7 @@
18
 
19
  import marimo
20
 
21
- __generated_with = "0.13.6"
22
  app = marimo.App(width="medium")
23
 
24
 
@@ -33,26 +33,18 @@ def _():
33
  import marimo as mo
34
  import polars as pl
35
  import spacy
36
- import spacy.language
37
  from transformers import (
38
- AutoTokenizer,
39
  PreTrainedTokenizerBase,
 
40
  )
41
 
42
- # Load spaCy models for English and Japanese
43
- nlp_en: spacy.language.Language = spacy.load("en_core_web_md")
44
- nlp_ja: spacy.language.Language = spacy.load("ja_core_news_md")
45
-
46
- # List of tokenizer models
47
  llm_model_choices: list[str] = [
48
- # "meta-llama/Llama-4-Scout-17B-16E-Instruct",
 
 
49
  "google/gemma-3-27b-it",
50
  "ibm-granite/granite-3.3-8b-instruct",
51
- "shisa-ai/shisa-v2-qwen2.5-7b",
52
- # "deepseek-ai/DeepSeek-R1",
53
- # "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
54
- # "Qwen/Qwen2.5-72B-Instruct",
55
- # "openai-community/gpt2",
56
  "google-bert/bert-large-uncased",
57
  ]
58
  return (
@@ -67,14 +59,29 @@ def _():
67
  llm_model_choices,
68
  math,
69
  mo,
70
- nlp_en,
71
- nlp_ja,
72
  pl,
73
  re,
74
  spacy,
75
  )
76
 
77
 
 
78
  @app.cell
79
  def _(mo):
80
  mo.md("""# Tokenization for English and Japanese""")
@@ -112,11 +119,11 @@ def _(mo):
112
  @app.cell
113
  def _(
114
  en_placeholder,
115
- get_text_content,
116
  ja_placeholder,
117
- language_selector,
118
  mo,
119
- set_text_content,
120
  ):
121
  # Define text_input dynamically based on language
122
  current_placeholder: str = (
@@ -133,7 +140,7 @@ def _(
133
 
134
 
135
  @app.cell
136
- def _(current_placeholder, mo, set_text_content):
137
  def apply_placeholder() -> None:
138
  set_text_content(current_placeholder)
139
 
@@ -144,7 +151,12 @@ def _(current_placeholder, mo, set_text_content):
144
 
145
 
146
  @app.cell
147
- def _(apply_placeholder_button, language_selector, mo, text_input):
148
  mo.vstack(
149
  [
150
  text_input,
@@ -152,24 +164,25 @@ def _(apply_placeholder_button, language_selector, mo, text_input):
152
  mo.ui.button(label="Analyze"),
153
  ]
154
  )
155
-
156
  return
157
 
158
 
159
  @app.cell
160
- def _(get_text_content, language_selector, mo, nlp_en, nlp_ja, spacy):
161
  # Analyze text using spaCy based on selected language
162
- current_text: str = get_text_content()
163
- doc: spacy.tokens.Doc
164
- if language_selector.value == "English":
165
- doc = nlp_en(current_text)
166
- else:
167
- doc = nlp_ja(current_text)
168
- model_name: str = (
169
- nlp_en.meta["name"]
170
- if language_selector.value == "English"
171
- else nlp_ja.meta["name"]
172
  )
173
 
174
  tokenized_text: list[str] = [token.text for token in doc]
175
  token_count: int = len(tokenized_text)
@@ -181,7 +194,7 @@ def _(get_text_content, language_selector, mo, nlp_en, nlp_ja, spacy):
181
 
182
 
183
  @app.cell
184
- def _(doc, mo, pl):
185
  token_data: pl.DataFrame = pl.DataFrame(
186
  {
187
  "Token": [token.text for token in doc],
@@ -190,12 +203,15 @@ def _(doc, mo, pl):
190
  "Tag": [token.tag_ for token in doc],
191
  "Morph": [str(token.morph) for token in doc],
192
  "OOV": [
193
- token.is_oov for token in doc
194
- ], # FIXME: How to get .is_oov() from sudachi directly? This only works for English now...
195
- "Token Position": list(range(len(doc))),
196
- "Sentence Number": [
197
- i for i, sent in enumerate(doc.sents) for token in sent
198
  ],
199
  }
200
  )
201
 
@@ -216,7 +232,13 @@ def _(mo):
216
 
217
 
218
  @app.cell
219
- def _(alt, column_selector, mo, pl, token_data):
220
  mo.stop(token_data.is_empty(), "Please set input text.")
221
 
222
  selected_column: str = column_selector.value
@@ -243,7 +265,7 @@ def _(alt, column_selector, mo, pl, token_data):
243
 
244
 
245
  @app.cell
246
- def _(llm_model_choices, mo):
247
  llm_tokenizer_selector: mo.ui.dropdown = mo.ui.dropdown(
248
  options=llm_model_choices,
249
  value=llm_model_choices[0],
@@ -254,12 +276,96 @@ def _(llm_model_choices, mo):
254
 
255
 
256
  @app.cell
257
- def _(AutoTokenizer, PreTrainedTokenizerBase, llm_tokenizer_selector):
258
  # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
259
  selected_model_name: str = llm_tokenizer_selector.value
260
- tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(
261
- selected_model_name
262
- )
263
  return (tokenizer,)
264
 
265
 
@@ -297,7 +403,7 @@ def _(Union, math):
297
  len(original_text) / total_tokens if total_tokens > 0 else 0.0
298
  )
299
 
300
- space_tokens: int = sum(1 for t in tokens if t.startswith(("Ġ", " ")))
301
  newline_tokens: int = sum(
302
  1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
303
  )
@@ -339,7 +445,11 @@ def _(Union, math):
339
  variance: float = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
340
  std_dev: float = math.sqrt(variance)
341
  sorted_lengths: list[int] = sorted(lengths)
342
- median_length: float = float(sorted_lengths[len(lengths) // 2])
343
 
344
  return {
345
  "basic_stats": {
@@ -362,7 +472,6 @@ def _(Union, math):
362
  "median_length": median_length,
363
  },
364
  }
365
-
366
  return (get_token_stats,)
367
 
368
 
@@ -380,7 +489,6 @@ def _(hashlib):
380
  "background": f"hsl({hue}, {saturation}%, {lightness}%)",
381
  "text": f"hsl({hue}, {saturation}%, {text_lightness}%)",
382
  }
383
-
384
  return (get_varied_color,)
385
 
386
 
@@ -398,14 +506,8 @@ def fix_token(
398
  # Return a clear representation indicating it's a byte
399
  return f"<0x{hex_value}>"
400
 
401
- # Replace SentencePiece space marker U+2581 ('▁') with a middle dot
402
- token = token.replace("▁", "·")
403
-
404
- # Replace BPE space marker 'Ġ' with a middle dot
405
- if token.startswith("Ġ"):
406
- space_count = token.count("Ġ")
407
- # Ensure we only replace the leading 'Ġ' markers
408
- return "·" * space_count + token[space_count:]
409
 
410
  # Replace newline markers for display
411
  token = token.replace("Ċ", "↵\n")
@@ -478,7 +580,15 @@ def _(Any, PreTrainedTokenizerBase):
478
  ):
479
  token_name = attr_name
480
  break
481
- special_tokens[token_name] = str(token_value)
482
  processed_tokens.add(str(token_value))
483
 
484
  # Fallback/Augment with individual attributes if not covered by all_special_tokens
@@ -490,7 +600,15 @@ def _(Any, PreTrainedTokenizerBase):
490
  and str(token_value).strip()
491
  and str(token_value) not in processed_tokens
492
  ):
493
- special_tokens[token_name] = str(token_value)
494
  processed_tokens.add(str(token_value))
495
 
496
  info["special_tokens"] = special_tokens if special_tokens else "None found"
@@ -499,7 +617,6 @@ def _(Any, PreTrainedTokenizerBase):
499
  info["error"] = f"Error extracting tokenizer info: {str(e)}"
500
 
501
  return info
502
-
503
  return (get_tokenizer_info,)
504
 
505
 
@@ -516,107 +633,209 @@ def _(
516
  Any,
517
  Optional,
518
  Union,
519
- current_text,
520
  get_token_stats,
521
  get_tokenizer_info,
522
  get_varied_color,
523
- llm_tokenizer_selector,
524
  mo,
525
  re,
526
- show_ids_switch,
527
  tokenizer,
528
  ):
529
  # Define the Unicode replacement character
530
  REPLACEMENT_CHARACTER = "\ufffd"
531
 
532
- # Get tokenizer metadata
 
533
  tokenizer_info: dict[str, Any] = get_tokenizer_info(tokenizer)
534
 
535
  # 1. Encode text to get token IDs first.
536
- token_ids: list[int] = tokenizer.encode(current_text, add_special_tokens=False)
537
- # 2. Decode each token ID individually.
538
- # We will check for REPLACEMENT_CHARACTER later.
539
- all_decoded_tokens: list[str] = [
540
  tokenizer.decode(
541
- [token_id], skip_special_tokens=False, clean_up_tokenization_spaces=False
 
 
542
  )
543
- for token_id in token_ids
544
  ]
545
 
546
- total_token_count: int = len(token_ids) # Count based on IDs
547
 
548
- # Limit the number of tokens for display
549
- display_limit: int = 1000
550
- # Limit consistently using token IDs and the decoded tokens
551
- display_token_ids: list[int] = token_ids[:display_limit]
552
- display_decoded_tokens: list[str] = all_decoded_tokens[:display_limit]
553
- display_limit_reached: bool = total_token_count > display_limit
554
 
555
  # Generate data for visualization
556
  TokenVisData = dict[str, Union[str, int, bool, dict[str, str]]]
557
  llm_token_data: list[TokenVisData] = []
558
 
559
- # Use zip for parallel iteration
560
- for idx, (token_id, token_str) in enumerate(
561
- zip(display_token_ids, display_decoded_tokens)
562
- ):
563
- colors: dict[str, str] = get_varied_color(
564
- token_str
565
- if REPLACEMENT_CHARACTER not in token_str
566
- else f"invalid_{token_id}"
567
- ) # Color based on string or ID if invalid
568
-
569
- is_invalid_utf8 = REPLACEMENT_CHARACTER in token_str
570
- fixed_token_display: str
571
- original_for_title: str = (
572
- token_str # Store the potentially problematic string for title
573
  )
574
 
575
- if is_invalid_utf8:
576
- # If decode failed, show a representation with the hex ID
577
- fixed_token_display = f"<0x{token_id:X}>"
578
  else:
579
- # If decode succeeded, apply standard fixes
580
- fixed_token_display = fix_token(token_str, re)
581
 
 
  llm_token_data.append(
583
  {
584
- "original": original_for_title, # Store the raw decoded string (might contain �)
585
- "display": fixed_token_display, # Store the cleaned/invalid representation
  "colors": colors,
587
- "is_newline": "↵" in fixed_token_display, # Check the display version
588
- "token_id": token_id,
589
  "token_index": idx,
590
- "is_invalid": is_invalid_utf8, # Add flag for potential styling/title changes
591
  }
592
  )
593
 
594
- # Calculate statistics using the list of *successfully* decoded token strings
595
- # We might want to reconsider what `all_tokens` means for stats if many are invalid.
596
- # For now, let's use the potentially problematic strings, as stats are mostly length/count based.
597
  token_stats: dict[str, dict[str, Union[int, float]]] = get_token_stats(
598
- all_decoded_tokens,
599
- current_text, # Pass the full list from decode()
600
  )
601
 
602
- # Construct HTML for colored tokens using list comprehension (functional style)
603
  html_parts: list[str] = [
604
  (
605
  lambda item: (
606
  style
607
  := f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
608
- # Add specific style for invalid tokens if needed
609
- + (" border: 1px solid red;" if item.get("is_invalid") else ""),
610
  # Modify title based on validity
611
  title := (
612
  f"Original: {item['original']}\nID: {item['token_id']}"
613
  + ("\n(Invalid UTF-8)" if item.get("is_invalid") else "")
614
- + ("\n(Byte Token)" if item["display"].startswith("byte[") else "")
615
  ),
616
  display_content := str(item["token_id"])
617
  if show_ids_switch.value
618
  else item["display"],
619
- f'<span style="{style}" title="{title}">{display_content}</span>',
620
  )[-1] # Get the last element (the formatted string) from the lambda's tuple
621
  )(item)
622
  for item in llm_token_data
@@ -632,6 +851,16 @@ def _(
632
  limit_warning = mo.md(f"""**Warning:** Displaying only the first {display_limit:,} tokens out of {total_token_count:,}.
633
  Statistics are calculated on the full text.""").callout(kind="warn")
634
635
  # Use dict access safely with .get() for stats
636
  basic_stats: dict[str, Union[int, float]] = token_stats.get("basic_stats", {})
637
  length_stats: dict[str, Union[int, float]] = token_stats.get("length_stats", {})
@@ -670,16 +899,18 @@ def _(
670
 
671
  tokenizer_info_md: str = "\n\n".join(tokenizer_info_md_parts)
672
 
673
- # Display the final markdown output
674
- mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}
675
-
676
- ## Tokenizer Info
677
- {tokenizer_info_md}
678
 
 
679
  {show_ids_switch}
680
 
 
 
681
  ## Tokenizer output
682
  {limit_warning if limit_warning else ""}
 
683
  {mo.as_html(token_viz_html)}
684
 
685
  ## Token Statistics
@@ -690,7 +921,6 @@ def _(
690
  {length_stats_md}
691
 
692
  """)
693
-
694
  return
695
 
696
 
 
1
  # /// script
2
  # dependencies = [
3
  # "marimo>=0.13.0",
4
+ # "polars>=1.29.0",
5
+ # "altair>=5.5.0",
6
+ # "spacy==3.8.7",
7
  # "en-core-web-md",
8
  # "ja-core-news-md",
9
+ # "transformers>=4.57.1",
10
  # ]
11
  #
12
  # [tool.uv.sources]
 
18
 
19
  import marimo
20
 
21
+ __generated_with = "0.17.2"
22
  app = marimo.App(width="medium")
23
 
24
 
 
33
  import marimo as mo
34
  import polars as pl
35
  import spacy
 
36
  from transformers import (
 
37
  PreTrainedTokenizerBase,
38
+ AutoTokenizer,
39
  )
40
 
 
  llm_model_choices: list[str] = [
42
+ "deepseek-ai/DeepSeek-OCR",
43
+ "zai-org/GLM-4.6",
44
+ "openai/gpt-oss-20b",
45
  "google/gemma-3-27b-it",
46
  "ibm-granite/granite-3.3-8b-instruct",
47
+ "deep-analysis-research/Flux-Japanese-Qwen2.5-32B-Instruct-V1.0",
48
  "google-bert/bert-large-uncased",
49
  ]
50
  return (
 
59
  llm_model_choices,
60
  math,
61
  mo,
 
 
62
  pl,
63
  re,
64
  spacy,
65
  )
66
 
67
 
68
+ @app.cell
69
+ def _(mo, spacy):
70
+ get_nlp_en, set_nlp_en = mo.state(None)
71
+ get_nlp_ja, set_nlp_ja = mo.state(None)
72
+
73
+ def ensure_nlp(language: str) -> spacy.language.Language:
74
+ if language == "English":
75
+ if get_nlp_en() is None:
76
+ set_nlp_en(spacy.load("en_core_web_md"))
77
+ return get_nlp_en()
78
+ else:
79
+ if get_nlp_ja() is None:
80
+ set_nlp_ja(spacy.load("ja_core_news_md"))
81
+ return get_nlp_ja()
82
+ return (ensure_nlp,)
83
+
84
+
85
  @app.cell
86
  def _(mo):
87
  mo.md("""# Tokenization for English and Japanese""")
 
119
  @app.cell
120
  def _(
121
  en_placeholder,
122
+ get_text_content: "Callable[[], str]",
123
  ja_placeholder,
124
+ language_selector: "mo.ui.radio",
125
  mo,
126
+ set_text_content: "Callable[[str], None]",
127
  ):
128
  # Define text_input dynamically based on language
129
  current_placeholder: str = (
 
140
 
141
 
142
  @app.cell
143
+ def _(current_placeholder: str, mo, set_text_content: "Callable[[str], None]"):
144
  def apply_placeholder() -> None:
145
  set_text_content(current_placeholder)
146
 
 
151
 
152
 
153
  @app.cell
154
+ def _(
155
+ apply_placeholder_button: "mo.ui.button",
156
+ language_selector: "mo.ui.radio",
157
+ mo,
158
+ text_input: "mo.ui.text_area",
159
+ ):
160
  mo.vstack(
161
  [
162
  text_input,
 
164
  mo.ui.button(label="Analyze"),
165
  ]
166
  )
 
167
  return
168
 
169
 
170
  @app.cell
171
+ def _(
172
+ ensure_nlp,
173
+ get_text_content: "Callable[[], str]",
174
+ language_selector: "mo.ui.radio",
175
+ mo,
176
+ spacy,
177
+ ):
178
  # Analyze text using spaCy based on selected language
179
+ mo.md("Note: Loading spaCy pipelines on first use may take a few seconds.").callout(
180
+ kind="info"
181
  )
182
+ current_text: str = get_text_content()
183
+ nlp = ensure_nlp(language_selector.value)
184
+ doc: spacy.tokens.Doc = nlp(current_text)
185
+ model_name: str = nlp.meta["name"]
186
 
187
  tokenized_text: list[str] = [token.text for token in doc]
188
  token_count: int = len(tokenized_text)
 
194
 
195
 
196
  @app.cell
197
+ def _(doc: "spacy.tokens.Doc", language_selector: "mo.ui.radio", mo, pl):
198
  token_data: pl.DataFrame = pl.DataFrame(
199
  {
200
  "Token": [token.text for token in doc],
 
203
  "Tag": [token.tag_ for token in doc],
204
  "Morph": [str(token.morph) for token in doc],
205
  "OOV": [
206
+ token.is_oov if language_selector.value == "English" else None
207
+ for token in doc
 
  ],
209
+ "Token Position": list(range(len(doc))),
210
+ "Sentence Number": (
211
+ [i for i, sent in enumerate(doc.sents) for _ in sent]
212
+ if doc.has_annotation("SENT_START")
213
+ else [0] * len(doc)
214
+ ),
215
  }
216
  )
217
 
 
232
 
233
 
234
  @app.cell
235
+ def _(
236
+ alt,
237
+ column_selector: "mo.ui.dropdown",
238
+ mo,
239
+ pl,
240
+ token_data: "pl.DataFrame",
241
+ ):
242
  mo.stop(token_data.is_empty(), "Please set input text.")
243
 
244
  selected_column: str = column_selector.value
 
265
 
266
 
267
  @app.cell
268
+ def _(llm_model_choices: list[str], mo):
269
  llm_tokenizer_selector: mo.ui.dropdown = mo.ui.dropdown(
270
  options=llm_model_choices,
271
  value=llm_model_choices[0],
 
276
 
277
 
278
  @app.cell
279
+ def _(mo):
280
+ add_special_tokens_switch = mo.ui.switch(
281
+ label="Add special tokens (encode)", value=False
282
+ )
283
+ skip_special_tokens_on_decode_switch = mo.ui.switch(
284
+ label="Skip special tokens in decoded view", value=False
285
+ )
286
+ representation_radio = mo.ui.radio(
287
+ options=["Auto (recommended)", "Decoded strings", "Raw tokens"],
288
+ value="Auto (recommended)",
289
+ label="LLM token representation",
290
+ )
291
+ display_limit_slider = mo.ui.slider(
292
+ 100, 5000, value=1000, label="Display token limit"
293
+ )
294
+ color_by_radio = mo.ui.radio(
295
+ options=["Token", "ID", "Category"],
296
+ value="Token",
297
+ label="Color by",
298
+ )
299
+ show_spaces_switch = mo.ui.switch(
300
+ label="Show spaces as · (decoded view)", value=False
301
+ )
302
+
303
+ mo.vstack(
304
+ [
305
+ mo.hstack(
306
+ [
307
+ add_special_tokens_switch,
308
+ skip_special_tokens_on_decode_switch,
309
+ ]
310
+ ),
311
+ mo.hstack([representation_radio, display_limit_slider]),
312
+ mo.hstack([color_by_radio, show_spaces_switch]),
313
+ mo.accordion(
314
+ {
315
+ "Tip": mo.md(
316
+ "Many GPT-style tokenizers are byte-level; their raw vocab strings can look garbled. Use Decoded strings or Auto."
317
+ ).callout(kind="info")
318
+ }
319
+ ),
320
+ ]
321
+ )
322
+ return (
323
+ add_special_tokens_switch,
324
+ color_by_radio,
325
+ display_limit_slider,
326
+ representation_radio,
327
+ show_spaces_switch,
328
+ skip_special_tokens_on_decode_switch,
329
+ )
330
+
331
+
332
+ @app.cell
333
+ def _(mo):
334
+ get_tok_cache, set_tok_cache = mo.state({})
335
+ return get_tok_cache, set_tok_cache
336
+
337
+
338
+ @app.cell
339
+ def _(
340
+ AutoTokenizer,
341
+ PreTrainedTokenizerBase,
342
+ get_tok_cache,
343
+ llm_tokenizer_selector: "mo.ui.dropdown",
344
+ mo,
345
+ set_tok_cache,
346
+ ):
347
  # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
348
  selected_model_name: str = llm_tokenizer_selector.value
349
+ key = selected_model_name
350
+ cache = get_tok_cache()
351
+ if key in cache:
352
+ tokenizer = cache[key]
353
+ else:
354
+ tokenizer: PreTrainedTokenizerBase = None
355
+ try:
356
+ tokenizer = AutoTokenizer.from_pretrained(
357
+ selected_model_name,
358
+ use_fast=True,
359
+ trust_remote_code=True,
360
+ )
361
+ except Exception as e:
362
+ mo.md(f"Failed to load tokenizer '{selected_model_name}': {e}").callout(
363
+ kind="error"
364
+ )
365
+ tokenizer = None
366
+
367
+ if tokenizer is not None:
368
+ set_tok_cache({**cache, key: tokenizer})
369
  return (tokenizer,)
370
 
371
 
 
403
  len(original_text) / total_tokens if total_tokens > 0 else 0.0
404
  )
405
 
406
+ space_tokens: int = sum(1 for t in tokens if t.startswith(("Ġ", "▁", " ")))
407
  newline_tokens: int = sum(
408
  1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
409
  )
 
445
  variance: float = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
446
  std_dev: float = math.sqrt(variance)
447
  sorted_lengths: list[int] = sorted(lengths)
448
+ n = len(lengths)
449
+ if n % 2 == 1:
450
+ median_length = float(sorted_lengths[n // 2])
451
+ else:
452
+ median_length = (sorted_lengths[n // 2 - 1] + sorted_lengths[n // 2]) / 2
453
 
454
  return {
455
  "basic_stats": {
 
472
  "median_length": median_length,
473
  },
474
  }
 
475
  return (get_token_stats,)
476
 
477
 
 
489
  "background": f"hsl({hue}, {saturation}%, {lightness}%)",
490
  "text": f"hsl({hue}, {saturation}%, {text_lightness}%)",
491
  }
 
492
  return (get_varied_color,)
493
 
494
 
 
506
  # Return a clear representation indicating it's a byte
507
  return f"<0x{hex_value}>"
508
 
509
+ # Replace SentencePiece space marker U+2581 ('▁') and BPE space marker 'Ġ' with a middle dot
510
+ token = token.replace("▁", "·").replace("Ġ", "·")
511
 
512
  # Replace newline markers for display
513
  token = token.replace("Ċ", "↵\n")
 
580
  ):
581
  token_name = attr_name
582
  break
583
+ token_str = str(token_value)
584
+ token_id = (
585
+ tokenizer.convert_tokens_to_ids(token_str)
586
+ if hasattr(tokenizer, "convert_tokens_to_ids")
587
+ else None
588
+ )
589
+ special_tokens[token_name] = token_str + (
590
+ f" (id {token_id})" if isinstance(token_id, int) else ""
591
+ )
592
  processed_tokens.add(str(token_value))
593
 
594
  # Fallback/Augment with individual attributes if not covered by all_special_tokens
 
600
  and str(token_value).strip()
601
  and str(token_value) not in processed_tokens
602
  ):
603
+ token_str = str(token_value)
604
+ token_id = (
605
+ tokenizer.convert_tokens_to_ids(token_str)
606
+ if hasattr(tokenizer, "convert_tokens_to_ids")
607
+ else None
608
+ )
609
+ special_tokens[token_name] = token_str + (
610
+ f" (id {token_id})" if isinstance(token_id, int) else ""
611
+ )
612
  processed_tokens.add(str(token_value))
613
 
614
  info["special_tokens"] = special_tokens if special_tokens else "None found"
 
617
  info["error"] = f"Error extracting tokenizer info: {str(e)}"
618
 
619
  return info
 
620
  return (get_tokenizer_info,)
621
 
622
 
 
633
  Any,
634
  Optional,
635
  Union,
636
+ add_special_tokens_switch,
637
+ color_by_radio,
638
+ current_text: str,
639
+ display_limit_slider,
640
  get_token_stats,
641
  get_tokenizer_info,
642
  get_varied_color,
643
+ llm_tokenizer_selector: "mo.ui.dropdown",
644
  mo,
645
  re,
646
+ representation_radio,
647
+ show_ids_switch: "mo.ui.switch",
648
+ show_spaces_switch,
649
+ skip_special_tokens_on_decode_switch,
650
  tokenizer,
651
  ):
652
  # Define the Unicode replacement character
653
  REPLACEMENT_CHARACTER = "\ufffd"
654
 
655
+ mo.stop(tokenizer is None, "Please select a valid tokenizer model.")
656
+
657
  tokenizer_info: dict[str, Any] = get_tokenizer_info(tokenizer)
658
 
659
  # 1. Encode text to get token IDs first.
660
+ token_ids: list[int] = tokenizer.encode(
661
+ current_text, add_special_tokens=add_special_tokens_switch.value
662
+ )
663
+
664
+ # 2. Convert IDs to raw tokens and decode each individually
665
+ raw_tokens: list[str] = tokenizer.convert_ids_to_tokens(token_ids)
666
+ decoded_per_id: list[str] = [
667
  tokenizer.decode(
668
+ [tid],
669
+ skip_special_tokens=skip_special_tokens_on_decode_switch.value,
670
+ clean_up_tokenization_spaces=False,
671
  )
672
+ for tid in token_ids
673
  ]
674
 
675
+ # 3. Get offset mapping for span information
676
+ enc = tokenizer(
677
+ current_text,
678
+ add_special_tokens=add_special_tokens_switch.value,
679
+ return_offsets_mapping=True,
680
+ )
681
+ offsets = (
682
+ enc.get("offset_mapping")
683
+ if isinstance(enc, dict)
684
+ else getattr(enc, "offset_mapping", None)
685
+ )
686
 
687
+ if offsets and len(offsets) == len(token_ids):
688
+ records: list[dict[str, Union[int, str]]] = []
689
+ for tid, raw, dec, (s, e) in zip(
690
+ token_ids, raw_tokens, decoded_per_id, offsets
691
+ ):
692
+ substr = current_text[s:e] if (s is not None and e is not None) else ""
693
+ records.append(
694
+ {
695
+ "id": tid,
696
+ "raw": raw,
697
+ "dec": dec,
698
+ "start": s,
699
+ "end": e,
700
+ "substr": substr,
701
+ }
702
+ )
703
+ else:
704
+ records = [
705
+ {
706
+ "id": tid,
707
+ "raw": raw,
708
+ "dec": dec,
709
+ "start": None,
710
+ "end": None,
711
+ "substr": "",
712
+ }
713
+ for tid, raw, dec in zip(token_ids, raw_tokens, decoded_per_id)
714
+ ]
715
+
716
+ def _is_byte_level(tok) -> bool:
717
+ try:
718
+ if getattr(tok, "is_fast", False):
719
+ pre = tok.backend_tokenizer.pre_tokenizer
720
+ types = [pre.__class__.__name__]
721
+ if hasattr(pre, "pre_tokenizers"):
722
+ types = [p.__class__.__name__ for p in pre.pre_tokenizers]
723
+ return "ByteLevel" in types
724
+ except Exception:
725
+ pass
726
+ return False
727
+
728
+ if representation_radio.value == "Auto (recommended)":
729
+ use_decoded: bool = _is_byte_level(tokenizer) or any(
730
+ ("Ġ" in r["raw"] or "Ċ" in r["raw"]) for r in records[:256]
731
+ )
732
+ elif representation_radio.value == "Decoded strings":
733
+ use_decoded = True
734
+ else:
735
+ use_decoded = False
736
+
737
+ if use_decoded:
738
+ source_records = [r for r in records if r["dec"] != ""]
739
+ stats_tokens_source: list[str] = [r["dec"] for r in records if r["dec"] != ""]
740
+ else:
741
+ source_records = records
742
+ stats_tokens_source = [r["raw"] for r in records]
743
+
744
+ total_token_count: int = len(source_records)
745
+ display_limit: int = display_limit_slider.value
746
+ display_records = source_records[:display_limit]
747
+ display_limit_reached: bool = len(source_records) > display_limit
748
 
749
  # Generate data for visualization
750
  TokenVisData = dict[str, Union[str, int, bool, dict[str, str]]]
751
  llm_token_data: list[TokenVisData] = []
752
 
753
+ for idx, r in enumerate(display_records):
754
+ token_str: str = r["dec"] if use_decoded else r["raw"]
755
+
756
+ # Apply space visualization in decoded view
757
+ if use_decoded and show_spaces_switch.value:
758
+ token_str = token_str.replace(" ", "·")
759
+
760
+ is_invalid_utf8: bool = REPLACEMENT_CHARACTER in token_str
761
+ fixed_token_display: str = (
762
+ f"<0x{r['id']:X}>" if is_invalid_utf8 else fix_token(token_str, re)
763
  )
764
 
765
+ # Choose color seed based on color_by_radio
766
+ if color_by_radio.value == "ID":
767
+ seed = f"id_{r['id']}"
768
+ elif color_by_radio.value == "Category":
769
+ probe = r["dec"] if use_decoded else r["raw"]
770
+ if probe.startswith(("Ġ", "▁", " ")):
771
+ cat = "space"
772
+ elif ("\n" in probe) or ("Ċ" in probe):
773
+ cat = "newline"
774
+ elif (probe.startswith("<") and probe.endswith(">")) or (
775
+ probe.startswith("[") and probe.endswith("]")
776
+ ):
777
+ cat = "special"
778
+ else:
779
+ cat = "text"
780
+ seed = f"cat_{cat}"
781
  else:
782
+ seed = token_str
 
783
 
784
+ colors: dict[str, str] = get_varied_color(
785
+ seed if not is_invalid_utf8 else f"invalid_{r['id']}"
786
+ )
787
  llm_token_data.append(
788
  {
789
+ "original": (
790
+ f"Vocab: {r['raw']}\n"
791
+ f"Decoded: {r['dec'] if r['dec'] != '' else '∅'}\n"
792
+ f"Span: [{r['start']}, {r['end']}]\n"
793
+ f"Text: {r['substr']}"
794
+ ),
795
+ "display": fixed_token_display,
796
  "colors": colors,
797
+ "is_newline": "↵" in fixed_token_display,
798
+ "token_id": r["id"],
799
  "token_index": idx,
800
+ "is_invalid": is_invalid_utf8,
801
  }
802
  )
803
 
804
  token_stats: dict[str, dict[str, Union[int, float]]] = get_token_stats(
805
+ stats_tokens_source,
806
+ current_text,
807
  )
808
 
 
809
  html_parts: list[str] = [
810
  (
811
  lambda item: (
812
  style
813
  := f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
814
+ # Add specific style for invalid tokens
815
+ + (
816
+ " border: 1px solid red;"
817
+ if item.get("is_invalid")
818
+ else (
819
+ " border: 1px solid orange;"
820
+ if item["display"].startswith("<0x")
821
+ else ""
822
+ )
823
+ ),
824
  # Modify title based on validity
825
  title := (
826
  f"Original: {item['original']}\nID: {item['token_id']}"
827
  + ("\n(Invalid UTF-8)" if item.get("is_invalid") else "")
828
+ + ("\n(Byte Token)" if item["display"].startswith("<0x") else "")
829
+ ),
830
+ aria_label := (
831
+ ("Token ID " + str(item["token_id"]) + ": " + item["original"])
832
+ .replace("\n", " ")
833
+ .replace('"', "&quot;")
834
  ),
835
  display_content := str(item["token_id"])
836
  if show_ids_switch.value
837
  else item["display"],
838
+ f'<span style="{style}" title="{title}" aria-label="{aria_label}">{display_content}</span>',
839
  )[-1] # Get the last element (the formatted string) from the lambda's tuple
840
  )(item)
841
  for item in llm_token_data
 
851
  limit_warning = mo.md(f"""**Warning:** Displaying only the first {display_limit:,} tokens out of {total_token_count:,}.
852
  Statistics are calculated on the full text.""").callout(kind="warn")
853
 
854
+ representation_hint: Optional[mo.md] = None
855
+ if representation_radio.value == "Raw tokens":
856
+ try:
857
+ if _is_byte_level(tokenizer):
858
+ representation_hint = mo.md(
859
+ "This tokenizer uses byte-level BPE; raw vocab strings are not human-readable. Prefer Decoded strings or Auto."
860
+ ).callout(kind="info")
861
+ except Exception:
862
+ pass
863
+
864
  # Use dict access safely with .get() for stats
865
  basic_stats: dict[str, Union[int, float]] = token_stats.get("basic_stats", {})
866
  length_stats: dict[str, Union[int, float]] = token_stats.get("length_stats", {})
 
899
 
900
  tokenizer_info_md: str = "\n\n".join(tokenizer_info_md_parts)
901
 
902
+ tokenizer_info_accordion = mo.accordion(
903
+ {"Tokenizer Info": mo.md(tokenizer_info_md)}
904
+ )
 
 
905
 
906
+ mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}
907
  {show_ids_switch}
908
 
909
+ {tokenizer_info_accordion}
910
+
911
  ## Tokenizer output
912
  {limit_warning if limit_warning else ""}
913
+ {representation_hint if representation_hint else ""}
914
  {mo.as_html(token_viz_html)}
915
 
916
  ## Token Statistics
 
921
  {length_stats_md}
922
 
923
  """)
 
924
  return
925
 
926
 
development.md CHANGED
@@ -3,6 +3,6 @@
  ## Testing your Dockerfile locally

  ```bash
- docker build -t marimo-app .
- docker run -it --rm -p 7860:7860 marimo-app
+ docker build -t counting-words .
+ docker run -it --rm -p 7860:7860 counting-words
  ```
pyproject.toml CHANGED
@@ -3,15 +3,15 @@ name = "counting-words"
  version = "0.1.0"
  description = "Counting words in English and Japanese texts demo"
  readme = "README.md"
- requires-python = ">=3.12"
+ requires-python = ">=3.13"
  dependencies = [
  "marimo>=0.13.0",
  "polars>=1.27.1",
  "altair>=5.5.0",
- "spacy>=3.8.5",
+ "spacy>=3.8.7",
  "en-core-web-md",
  "ja-core-news-md",
- "transformers>=4.51.3",
+ "transformers>=4.57.1",
  ]

  [tool.uv.sources]
uv.lock CHANGED
The diff for this file is too large to render. See raw diff
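Because the image now installs with `uv sync --locked`, uv.lock has to be re-resolved whenever the constraints in pyproject.toml move, as they do in this commit; a minimal sketch of that workflow, assuming uv is installed locally:

```bash
# Re-resolve the lockfile against the updated constraints, then verify the locked install
uv lock
uv sync --locked
```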