counting_words / app.py
# /// script
# dependencies = [
# "marimo>=0.13.0",
# "polars>=1.29.0",
# "altair>=5.5.0",
# "spacy==3.8.7",
# "en-core-web-md",
# "ja-core-news-md",
# "transformers>=4.57.1",
# ]
#
# [tool.uv.sources]
# en-core-web-md = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl" }
# ja-core-news-md = { url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.8.0/ja_core_news_md-3.8.0-py3-none-any.whl" }
# [tool.marimo.runtime]
# auto_instantiate = false
# ///
import marimo
__generated_with = "0.17.2"
app = marimo.App(width="medium")
@app.cell
def _():
import hashlib
import math
import re
from typing import Any, Callable, Optional, Union
import altair as alt
import marimo as mo
import polars as pl
import spacy
from transformers import (
PreTrainedTokenizerBase,
AutoTokenizer,
)
llm_model_choices: list[str] = [
"deepseek-ai/DeepSeek-OCR",
"zai-org/GLM-4.6",
"openai/gpt-oss-20b",
"google/gemma-3-27b-it",
"ibm-granite/granite-3.3-8b-instruct",
"deep-analysis-research/Flux-Japanese-Qwen2.5-32B-Instruct-V1.0",
"google-bert/bert-large-uncased",
]
return (
Any,
AutoTokenizer,
Callable,
Optional,
PreTrainedTokenizerBase,
Union,
alt,
hashlib,
llm_model_choices,
math,
mo,
pl,
re,
spacy,
)
@app.cell
def _(mo, spacy):
get_nlp_en, set_nlp_en = mo.state(None)
get_nlp_ja, set_nlp_ja = mo.state(None)
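    # Load each spaCy pipeline lazily and keep it in marimo state so each model is loaded at most once per session.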
def ensure_nlp(language: str) -> spacy.language.Language:
if language == "English":
if get_nlp_en() is None:
set_nlp_en(spacy.load("en_core_web_md"))
return get_nlp_en()
else:
if get_nlp_ja() is None:
set_nlp_ja(spacy.load("ja_core_news_md"))
return get_nlp_ja()
return (ensure_nlp,)
@app.cell
def _(mo):
mo.md("""# Tokenization for English and Japanese""")
return
@app.cell
def _(Callable, mo):
# Central state for the text input content
# Type the getter and setter
get_text_content: Callable[[], str]
set_text_content: Callable[[str], None]
get_text_content, set_text_content = mo.state("")
return get_text_content, set_text_content
@app.cell
def _(mo):
# Placeholder texts
en_placeholder = """
Mrs. Ferrars died on the night of the 16th⁠–⁠17th September⁠—a Thursday. I was sent for at eight o’clock on the morning of Friday the 17th. There was nothing to be done. She had been dead some hours.
""".strip()
ja_placeholder = """
吾輩は猫である。名前はまだ無い。
 どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。
""".strip()
# Create UI element for language selection
language_selector: mo.ui.radio = mo.ui.radio(
options=["English", "Japanese"], value="English", label="Language"
)
return en_placeholder, ja_placeholder, language_selector
@app.cell
def _(
en_placeholder,
get_text_content: "Callable[[], str]",
ja_placeholder,
language_selector: "mo.ui.radio",
mo,
set_text_content: "Callable[[str], None]",
):
# Define text_input dynamically based on language
current_placeholder: str = (
en_placeholder if language_selector.value == "English" else ja_placeholder
)
text_input: mo.ui.text_area = mo.ui.text_area(
value=get_text_content(),
label="Enter text",
placeholder=current_placeholder,
full_width=True,
on_change=lambda v: set_text_content(v),
)
return current_placeholder, text_input
@app.cell
def _(current_placeholder: str, mo, set_text_content: "Callable[[str], None]"):
def apply_placeholder() -> None:
set_text_content(current_placeholder)
apply_placeholder_button: mo.ui.button = mo.ui.button(
label="Use Placeholder Text", on_click=lambda _: apply_placeholder()
)
return (apply_placeholder_button,)
@app.cell
def _(
apply_placeholder_button: "mo.ui.button",
language_selector: "mo.ui.radio",
mo,
text_input: "mo.ui.text_area",
):
mo.vstack(
[
text_input,
mo.hstack([language_selector, apply_placeholder_button], justify="start"),
mo.ui.button(label="Analyze"),
]
)
return
@app.cell
def _(
ensure_nlp,
get_text_content: "Callable[[], str]",
language_selector: "mo.ui.radio",
mo,
spacy,
):
# Analyze text using spaCy based on selected language
mo.md("Note: Loading spaCy pipelines on first use may take a few seconds.").callout(
kind="info"
)
current_text: str = get_text_content()
nlp = ensure_nlp(language_selector.value)
doc: spacy.tokens.Doc = nlp(current_text)
model_name: str = nlp.meta["name"]
tokenized_text: list[str] = [token.text for token in doc]
token_count: int = len(tokenized_text)
mo.md(
f"**Tokenized Text using spaCy {'en_' if language_selector.value == 'English' else 'ja_'}{model_name}:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}"
)
return current_text, doc
@app.cell
def _(doc: "spacy.tokens.Doc", language_selector: "mo.ui.radio", mo, pl):
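    # One row per spaCy token: surface form, lemma, coarse/fine POS, morphology, OOV flag (English only; None for Japanese), position, and sentence index.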
token_data: pl.DataFrame = pl.DataFrame(
{
"Token": [token.text for token in doc],
"Lemma": [token.lemma_ for token in doc],
"POS": [token.pos_ for token in doc],
"Tag": [token.tag_ for token in doc],
"Morph": [str(token.morph) for token in doc],
"OOV": [
token.is_oov if language_selector.value == "English" else None
for token in doc
],
"Token Position": list(range(len(doc))),
"Sentence Number": (
[i for i, sent in enumerate(doc.sents) for _ in sent]
if doc.has_annotation("SENT_START")
else [0] * len(doc)
),
}
)
mo.ui.dataframe(token_data, page_size=50)
return (token_data,)
@app.cell
def _(mo):
column_selector: mo.ui.dropdown = mo.ui.dropdown(
options=["POS", "Tag", "Lemma", "Token", "Morph", "OOV"],
value="POS",
label="Select column to visualize",
)
column_selector
return (column_selector,)
@app.cell
def _(
alt,
column_selector: "mo.ui.dropdown",
mo,
pl,
token_data: "pl.DataFrame",
):
mo.stop(token_data.is_empty(), "Please set input text.")
selected_column: str = column_selector.value
# Calculate value counts for the selected column
counts_df: pl.DataFrame = (
token_data[selected_column]
.value_counts()
.sort(by=["count", selected_column], descending=[True, False])
)
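    # Horizontal bar chart: frequency on the x-axis, the selected category on the y-axis, keeping the pre-sorted order (sort=None).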
chart: alt.Chart = (
alt.Chart(counts_df)
.mark_bar()
.encode(
x=alt.X("count", title="Frequency"),
y=alt.Y(selected_column, title=selected_column, sort=None),
tooltip=[selected_column, "count"],
)
.properties(title=f"{selected_column} Distribution")
.interactive()
)
mo.ui.altair_chart(chart)
return
@app.cell
def _(llm_model_choices: list[str], mo):
llm_tokenizer_selector: mo.ui.dropdown = mo.ui.dropdown(
options=llm_model_choices,
value=llm_model_choices[0],
label="Select LLM Tokenizer Model",
)
llm_tokenizer_selector
return (llm_tokenizer_selector,)
@app.cell
def _(mo):
add_special_tokens_switch = mo.ui.switch(
label="Add special tokens (encode)", value=False
)
skip_special_tokens_on_decode_switch = mo.ui.switch(
label="Skip special tokens in decoded view", value=False
)
representation_radio = mo.ui.radio(
options=["Auto (recommended)", "Decoded strings", "Raw tokens"],
value="Auto (recommended)",
label="LLM token representation",
)
display_limit_slider = mo.ui.slider(
100, 5000, value=1000, label="Display token limit"
)
color_by_radio = mo.ui.radio(
options=["Token", "ID", "Category"],
value="Token",
label="Color by",
)
show_spaces_switch = mo.ui.switch(
label="Show spaces as · (decoded view)", value=False
)
mo.vstack(
[
mo.hstack(
[
add_special_tokens_switch,
skip_special_tokens_on_decode_switch,
]
),
mo.hstack([representation_radio, display_limit_slider]),
mo.hstack([color_by_radio, show_spaces_switch]),
mo.accordion(
{
"Tip": mo.md(
"Many GPT-style tokenizers are byte-level; their raw vocab strings can look garbled. Use Decoded strings or Auto."
).callout(kind="info")
}
),
]
)
return (
add_special_tokens_switch,
color_by_radio,
display_limit_slider,
representation_radio,
show_spaces_switch,
skip_special_tokens_on_decode_switch,
)
@app.cell
def _(mo):
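    # Session-scoped cache of Hugging Face tokenizers, keyed by model name, so each tokenizer is loaded at most once.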
get_tok_cache, set_tok_cache = mo.state({})
return get_tok_cache, set_tok_cache
@app.cell
def _(
AutoTokenizer,
PreTrainedTokenizerBase,
get_tok_cache,
llm_tokenizer_selector: "mo.ui.dropdown",
mo,
set_tok_cache,
):
# Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
selected_model_name: str = llm_tokenizer_selector.value
key = selected_model_name
cache = get_tok_cache()
if key in cache:
tokenizer = cache[key]
else:
tokenizer: PreTrainedTokenizerBase = None
try:
tokenizer = AutoTokenizer.from_pretrained(
selected_model_name,
use_fast=True,
trust_remote_code=True,
)
except Exception as e:
mo.md(f"Failed to load tokenizer '{selected_model_name}': {e}").callout(
kind="error"
)
tokenizer = None
if tokenizer is not None:
set_tok_cache({**cache, key: tokenizer})
return (tokenizer,)
@app.cell
def _(Union, math):
TokenStatsDict = dict[str, dict[str, Union[int, float]]]
def get_token_stats(tokens: list[str], original_text: str) -> TokenStatsDict:
"""Calculate enhanced statistics about the tokens."""
if not tokens:
# Return default structure matching TokenStatsDict
return {
"basic_stats": {
"total_tokens": 0,
"unique_tokens": 0,
"compression_ratio": 0.0,
"space_tokens": 0,
"newline_tokens": 0,
"special_tokens": 0,
"punctuation_tokens": 0,
"unique_percentage": 0.0,
},
"length_stats": {
"avg_length": 0.0,
"std_dev": 0.0,
"min_length": 0,
"max_length": 0,
"median_length": 0.0,
},
}
total_tokens: int = len(tokens)
unique_tokens: int = len(set(tokens))
compression_ratio: float = (
len(original_text) / total_tokens if total_tokens > 0 else 0.0
)
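        # Marker heuristics: "Ġ" (byte-level BPE) and "▁" (SentencePiece) denote a leading space; "Ċ" and "<0x0A>" encode newlines.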
space_tokens: int = sum(1 for t in tokens if t.startswith(("Ġ", "▁", " ")))
newline_tokens: int = sum(
1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
)
special_tokens: int = sum(
1
for t in tokens
if (t.startswith("<") and t.endswith(">"))
or (t.startswith("[") and t.endswith("]"))
)
punctuation_tokens: int = sum(
1
for t in tokens
if len(t) == 1 and not t.isalnum() and t not in [" ", "\n", "Ġ", "Ċ"]
)
lengths: list[int] = [len(t) for t in tokens]
if not lengths: # Should not happen if tokens is not empty, but safe check
return { # Return default structure matching TokenStatsDict
"basic_stats": {
"total_tokens": 0,
"unique_tokens": 0,
"compression_ratio": 0.0,
"space_tokens": 0,
"newline_tokens": 0,
"special_tokens": 0,
"punctuation_tokens": 0,
"unique_percentage": 0.0,
},
"length_stats": {
"avg_length": 0.0,
"std_dev": 0.0,
"min_length": 0,
"max_length": 0,
"median_length": 0.0,
},
}
mean_length: float = sum(lengths) / len(lengths)
variance: float = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
std_dev: float = math.sqrt(variance)
sorted_lengths: list[int] = sorted(lengths)
n = len(lengths)
if n % 2 == 1:
median_length = float(sorted_lengths[n // 2])
else:
median_length = (sorted_lengths[n // 2 - 1] + sorted_lengths[n // 2]) / 2
return {
"basic_stats": {
"total_tokens": total_tokens,
"unique_tokens": unique_tokens,
"compression_ratio": round(compression_ratio, 2),
"space_tokens": space_tokens,
"newline_tokens": newline_tokens,
"special_tokens": special_tokens,
"punctuation_tokens": punctuation_tokens,
"unique_percentage": round(unique_tokens / total_tokens * 100, 1)
if total_tokens > 0
else 0.0,
},
"length_stats": {
"avg_length": round(mean_length, 2),
"std_dev": round(std_dev, 2),
"min_length": min(lengths),
"max_length": max(lengths),
"median_length": median_length,
},
}
return (get_token_stats,)
@app.cell
def _(hashlib):
def get_varied_color(token: str) -> dict[str, str]:
"""Generate vibrant colors with HSL for better visual distinction."""
token_hash: str = hashlib.md5(token.encode()).hexdigest()
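        # Slice the MD5 digest to derive hue (0-359), saturation (70-89%), and a light background (80-89%) paired with dark text of the same hue, so identical tokens always get identical colors.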
hue: int = int(token_hash[:3], 16) % 360
saturation: int = 70 + (int(token_hash[3:5], 16) % 20)
lightness: int = 80 + (int(token_hash[5:7], 16) % 10)
text_lightness: int = 20
return {
"background": f"hsl({hue}, {saturation}%, {lightness}%)",
"text": f"hsl({hue}, {saturation}%, {text_lightness}%)",
}
return (get_varied_color,)
@app.function
def fix_token(token: str, re) -> str:  # `re` is the regex module passed in; its module type is complex, so it is left implicitly Any
"""Fix token for display, handling byte fallbacks and spaces."""
# Check for byte fallback pattern <0xHH> using a full match
byte_match = re.fullmatch(r"<0x([0-9A-Fa-f]{2})>", token)
if byte_match:
hex_value = byte_match.group(1).upper()
# Return a clear representation indicating it's a byte
return f"<0x{hex_value}>"
# Replace SentencePiece space marker U+2581 ('▁') and BPE space marker 'Ġ' with a middle dot
token = token.replace("▁", "·").replace("Ġ", "·")
# Replace newline markers for display
token = token.replace("Ċ", "↵\n")
# Handle byte representation of newline AFTER general byte check
# This specific check might become redundant if <0x0A> is caught by the byte_match above
# Keep it for now as a fallback.
token = token.replace("<0x0A>", "↵\n")
return token
@app.cell
def _(Any, PreTrainedTokenizerBase):
def get_tokenizer_info(
tokenizer: PreTrainedTokenizerBase,
) -> dict[str, Any]:
"""
Extract useful information from a tokenizer.
Returns a dictionary with tokenizer details.
"""
info: dict[str, Any] = {}
try:
if hasattr(tokenizer, "vocab_size"):
info["vocab_size"] = tokenizer.vocab_size
elif hasattr(tokenizer, "get_vocab"):
info["vocab_size"] = len(tokenizer.get_vocab())
if (
hasattr(tokenizer, "model_max_length")
and isinstance(tokenizer.model_max_length, int)
and tokenizer.model_max_length < 1000000
):
info["model_max_length"] = tokenizer.model_max_length
else:
info["model_max_length"] = "Not specified or very large"
info["tokenizer_type"] = tokenizer.__class__.__name__
special_tokens: dict[str, str] = {}
special_token_attributes: list[str] = [
"pad_token",
"eos_token",
"bos_token",
"sep_token",
"cls_token",
"unk_token",
"mask_token",
]
processed_tokens: set[str] = (
set()
) # Keep track of processed tokens to avoid duplicates
# Prefer all_special_tokens if available
if hasattr(tokenizer, "all_special_tokens"):
for token_value in tokenizer.all_special_tokens:
if (
not token_value
or not str(token_value).strip()
or str(token_value) in processed_tokens
):
continue
token_name = "special_token" # Default name
# Find the attribute name corresponding to the token value
for attr_name in special_token_attributes:
if (
hasattr(tokenizer, attr_name)
and getattr(tokenizer, attr_name) == token_value
):
token_name = attr_name
break
token_str = str(token_value)
token_id = (
tokenizer.convert_tokens_to_ids(token_str)
if hasattr(tokenizer, "convert_tokens_to_ids")
else None
)
special_tokens[token_name] = token_str + (
f" (id {token_id})" if isinstance(token_id, int) else ""
)
processed_tokens.add(str(token_value))
# Fallback/Augment with individual attributes if not covered by all_special_tokens
for token_name in special_token_attributes:
if hasattr(tokenizer, token_name):
token_value = getattr(tokenizer, token_name)
if (
token_value
and str(token_value).strip()
and str(token_value) not in processed_tokens
):
token_str = str(token_value)
token_id = (
tokenizer.convert_tokens_to_ids(token_str)
if hasattr(tokenizer, "convert_tokens_to_ids")
else None
)
special_tokens[token_name] = token_str + (
f" (id {token_id})" if isinstance(token_id, int) else ""
)
processed_tokens.add(str(token_value))
info["special_tokens"] = special_tokens if special_tokens else "None found"
except Exception as e:
info["error"] = f"Error extracting tokenizer info: {str(e)}"
return info
return (get_tokenizer_info,)
@app.cell
def _(mo):
show_ids_switch: mo.ui.switch = mo.ui.switch(
label="Show token IDs instead of text", value=False
)
return (show_ids_switch,)
@app.cell
def _(
Any,
Optional,
Union,
add_special_tokens_switch,
color_by_radio,
current_text: str,
display_limit_slider,
get_token_stats,
get_tokenizer_info,
get_varied_color,
llm_tokenizer_selector: "mo.ui.dropdown",
mo,
re,
representation_radio,
show_ids_switch: "mo.ui.switch",
show_spaces_switch,
skip_special_tokens_on_decode_switch,
tokenizer,
):
# Define the Unicode replacement character
REPLACEMENT_CHARACTER = "\ufffd"
mo.stop(tokenizer is None, "Please select a valid tokenizer model.")
tokenizer_info: dict[str, Any] = get_tokenizer_info(tokenizer)
# 1. Encode text to get token IDs first.
token_ids: list[int] = tokenizer.encode(
current_text, add_special_tokens=add_special_tokens_switch.value
)
# 2. Convert IDs to raw tokens and decode each individually
raw_tokens: list[str] = tokenizer.convert_ids_to_tokens(token_ids)
decoded_per_id: list[str] = [
tokenizer.decode(
[tid],
skip_special_tokens=skip_special_tokens_on_decode_switch.value,
clean_up_tokenization_spaces=False,
)
for tid in token_ids
]
# 3. Get offset mapping for span information
enc = tokenizer(
current_text,
add_special_tokens=add_special_tokens_switch.value,
return_offsets_mapping=True,
)
offsets = (
enc.get("offset_mapping")
if isinstance(enc, dict)
else getattr(enc, "offset_mapping", None)
)
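    # Pair each token id with its raw vocab string, per-id decode, and character span when the offset mapping aligns; otherwise leave spans empty.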
if offsets and len(offsets) == len(token_ids):
records: list[dict[str, Union[int, str]]] = []
for tid, raw, dec, (s, e) in zip(
token_ids, raw_tokens, decoded_per_id, offsets
):
substr = current_text[s:e] if (s is not None and e is not None) else ""
records.append(
{
"id": tid,
"raw": raw,
"dec": dec,
"start": s,
"end": e,
"substr": substr,
}
)
else:
records = [
{
"id": tid,
"raw": raw,
"dec": dec,
"start": None,
"end": None,
"substr": "",
}
for tid, raw, dec in zip(token_ids, raw_tokens, decoded_per_id)
]
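    # Heuristic byte-level check: inspect the fast tokenizer's backend pre-tokenizer (or each member of a Sequence) for a ByteLevel component.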
def _is_byte_level(tok) -> bool:
try:
if getattr(tok, "is_fast", False):
pre = tok.backend_tokenizer.pre_tokenizer
types = [pre.__class__.__name__]
if hasattr(pre, "pre_tokenizers"):
types = [p.__class__.__name__ for p in pre.pre_tokenizers]
return "ByteLevel" in types
except Exception:
pass
return False
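    # Auto mode prefers decoded strings for byte-level tokenizers, or when Ġ/Ċ markers appear among the first 256 raw tokens.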
if representation_radio.value == "Auto (recommended)":
use_decoded: bool = _is_byte_level(tokenizer) or any(
("Ġ" in r["raw"] or "Ċ" in r["raw"]) for r in records[:256]
)
elif representation_radio.value == "Decoded strings":
use_decoded = True
else:
use_decoded = False
if use_decoded:
source_records = [r for r in records if r["dec"] != ""]
stats_tokens_source: list[str] = [r["dec"] for r in records if r["dec"] != ""]
else:
source_records = records
stats_tokens_source = [r["raw"] for r in records]
total_token_count: int = len(source_records)
display_limit: int = display_limit_slider.value
display_records = source_records[:display_limit]
display_limit_reached: bool = len(source_records) > display_limit
# Generate data for visualization
TokenVisData = dict[str, Union[str, int, bool, dict[str, str]]]
llm_token_data: list[TokenVisData] = []
for idx, r in enumerate(display_records):
token_str: str = r["dec"] if use_decoded else r["raw"]
# Apply space visualization in decoded view
if use_decoded and show_spaces_switch.value:
token_str = token_str.replace(" ", "·")
is_invalid_utf8: bool = REPLACEMENT_CHARACTER in token_str
fixed_token_display: str = (
f"<0x{r['id']:X}>" if is_invalid_utf8 else fix_token(token_str, re)
)
# Choose color seed based on color_by_radio
if color_by_radio.value == "ID":
seed = f"id_{r['id']}"
elif color_by_radio.value == "Category":
probe = r["dec"] if use_decoded else r["raw"]
if probe.startswith(("Ġ", "▁", " ")):
cat = "space"
elif ("\n" in probe) or ("Ċ" in probe):
cat = "newline"
elif (probe.startswith("<") and probe.endswith(">")) or (
probe.startswith("[") and probe.endswith("]")
):
cat = "special"
else:
cat = "text"
seed = f"cat_{cat}"
else:
seed = token_str
colors: dict[str, str] = get_varied_color(
seed if not is_invalid_utf8 else f"invalid_{r['id']}"
)
llm_token_data.append(
{
"original": (
f"Vocab: {r['raw']}\n"
f"Decoded: {r['dec'] if r['dec'] != '' else '∅'}\n"
f"Span: [{r['start']}, {r['end']}]\n"
f"Text: {r['substr']}"
),
"display": fixed_token_display,
"colors": colors,
"is_newline": "↵" in fixed_token_display,
"token_id": r["id"],
"token_index": idx,
"is_invalid": is_invalid_utf8,
}
)
token_stats: dict[str, dict[str, Union[int, float]]] = get_token_stats(
stats_tokens_source,
current_text,
)
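    # Render each token as a colored <span>; the walrus assignments inside the lambda build the inline style, hover title, and aria-label, and the tuple's last element is the finished HTML string.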
html_parts: list[str] = [
(
lambda item: (
style
:= f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
# Add specific style for invalid tokens
+ (
" border: 1px solid red;"
if item.get("is_invalid")
else (
" border: 1px solid orange;"
if item["display"].startswith("<0x")
else ""
)
),
# Modify title based on validity
title := (
f"Original: {item['original']}\nID: {item['token_id']}"
+ ("\n(Invalid UTF-8)" if item.get("is_invalid") else "")
+ ("\n(Byte Token)" if item["display"].startswith("<0x") else "")
),
aria_label := (
("Token ID " + str(item["token_id"]) + ": " + item["original"])
.replace("\n", " ")
.replace('"', "&quot;")
),
display_content := str(item["token_id"])
if show_ids_switch.value
else item["display"],
f'<span style="{style}" title="{title}" aria-label="{aria_label}">{display_content}</span>',
)[-1] # Get the last element (the formatted string) from the lambda's tuple
)(item)
for item in llm_token_data
]
token_viz_html: mo.Html = mo.Html(
f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>'
)
# Optional: Add a warning if the display limit was reached
limit_warning: Optional[mo.md] = None # Use Optional type
if display_limit_reached:
limit_warning = mo.md(f"""**Warning:** Displaying only the first {display_limit:,} tokens out of {total_token_count:,}.
Statistics are calculated on the full text.""").callout(kind="warn")
representation_hint: Optional[mo.md] = None
if representation_radio.value == "Raw tokens":
try:
if _is_byte_level(tokenizer):
representation_hint = mo.md(
"This tokenizer uses byte-level BPE; raw vocab strings are not human-readable. Prefer Decoded strings or Auto."
).callout(kind="info")
except Exception:
pass
# Use dict access safely with .get() for stats
basic_stats: dict[str, Union[int, float]] = token_stats.get("basic_stats", {})
length_stats: dict[str, Union[int, float]] = token_stats.get("length_stats", {})
# Use list comprehensions for markdown generation (functional style)
basic_stats_md: str = "**Basic Stats:**\n\n" + "\n".join(
f"- **{key.replace('_', ' ').title()}:** `{value}`"
for key, value in basic_stats.items()
)
length_stats_md: str = "**Length (Character) Stats:**\n\n" + "\n".join(
f"- **{key.replace('_', ' ').title()}:** `{value}`"
for key, value in length_stats.items()
)
# Build tokenizer info markdown parts
tokenizer_info_md_parts: list[str] = [
f"**Tokenizer Type:** `{tokenizer_info.get('tokenizer_type', 'N/A')}`"
]
if vocab_size := tokenizer_info.get("vocab_size"):
tokenizer_info_md_parts.append(f"**Vocab Size:** `{vocab_size:,}`")
    if max_len := tokenizer_info.get("model_max_length"):
        max_len_display = f"{max_len:,}" if isinstance(max_len, int) else str(max_len)
        tokenizer_info_md_parts.append(f"**Model Max Length:** `{max_len_display}`")
special_tokens_info = tokenizer_info.get("special_tokens")
if isinstance(special_tokens_info, dict) and special_tokens_info:
tokenizer_info_md_parts.append("**Special Tokens:**")
tokenizer_info_md_parts.extend(
f" - `{name}`: `{str(val)}`" for name, val in special_tokens_info.items()
)
elif isinstance(special_tokens_info, str): # Handle "None found" case
tokenizer_info_md_parts.append(f"**Special Tokens:** `{special_tokens_info}`")
if error_info := tokenizer_info.get("error"):
tokenizer_info_md_parts.append(f"**Info Error:** `{error_info}`")
tokenizer_info_md: str = "\n\n".join(tokenizer_info_md_parts)
tokenizer_info_accordion = mo.accordion(
{"Tokenizer Info": mo.md(tokenizer_info_md)}
)
mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}
{show_ids_switch}
{tokenizer_info_accordion}
## Tokenizer output
{limit_warning if limit_warning else ""}
{representation_hint if representation_hint else ""}
{mo.as_html(token_viz_html)}
## Token Statistics
(Calculated on full text if truncated above)
{basic_stats_md}
{length_stats_md}
""")
return
@app.cell
def _():
return
if __name__ == "__main__":
app.run()