# /// script
# dependencies = [
#     "marimo>=0.13.0",
#     "polars>=1.29.0",
#     "altair>=5.5.0",
#     "spacy==3.8.7",
#     "en-core-web-md",
#     "ja-core-news-md",
#     "transformers>=4.57.1",
# ]
#
# [tool.uv.sources]
# en-core-web-md = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl" }
# ja-core-news-md = { url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.8.0/ja_core_news_md-3.8.0-py3-none-any.whl" }
#
# [tool.marimo.runtime]
# auto_instantiate = false
# ///

import marimo

__generated_with = "0.17.2"
app = marimo.App(width="medium")
@app.cell
def _():
    import hashlib
    import math
    import re
    from typing import Any, Callable, Optional, Union

    import altair as alt
    import marimo as mo
    import polars as pl
    import spacy
    from transformers import (
        PreTrainedTokenizerBase,
        AutoTokenizer,
    )

    llm_model_choices: list[str] = [
        "deepseek-ai/DeepSeek-OCR",
        "zai-org/GLM-4.6",
        "openai/gpt-oss-20b",
        "google/gemma-3-27b-it",
        "ibm-granite/granite-3.3-8b-instruct",
        "deep-analysis-research/Flux-Japanese-Qwen2.5-32B-Instruct-V1.0",
        "google-bert/bert-large-uncased",
    ]

    return (
        Any,
        AutoTokenizer,
        Callable,
        Optional,
        PreTrainedTokenizerBase,
        Union,
        alt,
        hashlib,
        llm_model_choices,
        math,
        mo,
        pl,
        re,
        spacy,
    )


@app.cell
def _(mo, spacy):
    get_nlp_en, set_nlp_en = mo.state(None)
    get_nlp_ja, set_nlp_ja = mo.state(None)
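
    # Load each spaCy pipeline lazily and cache it in marimo state so a
    # model is loaded at most once per session.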
    def ensure_nlp(language: str) -> spacy.language.Language:
        if language == "English":
            if get_nlp_en() is None:
                set_nlp_en(spacy.load("en_core_web_md"))
            return get_nlp_en()
        else:
            if get_nlp_ja() is None:
                set_nlp_ja(spacy.load("ja_core_news_md"))
            return get_nlp_ja()

    return (ensure_nlp,)


@app.cell
def _(mo):
    mo.md("""# Tokenization for English and Japanese""")
    return


@app.cell
def _(Callable, mo):
    # Central state for the text input content
    # Type the getter and setter
    get_text_content: Callable[[], str]
    set_text_content: Callable[[str], None]
    get_text_content, set_text_content = mo.state("")
    return get_text_content, set_text_content


@app.cell
def _(mo):
    # Placeholder texts
    en_placeholder = """
Mrs. Ferrars died on the night of the 16th–17th September—a Thursday. I was sent for at eight o’clock on the morning of Friday the 17th. There was nothing to be done. She had been dead some hours.
""".strip()

    ja_placeholder = """
吾輩は猫である。名前はまだ無い。
どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。
""".strip()

    # Create UI element for language selection
    language_selector: mo.ui.radio = mo.ui.radio(
        options=["English", "Japanese"], value="English", label="Language"
    )
    return en_placeholder, ja_placeholder, language_selector


@app.cell
def _(
    en_placeholder,
    get_text_content: "Callable[[], str]",
    ja_placeholder,
    language_selector: "mo.ui.radio",
    mo,
    set_text_content: "Callable[[str], None]",
):
    # Define text_input dynamically based on language
    current_placeholder: str = (
        en_placeholder if language_selector.value == "English" else ja_placeholder
    )
    text_input: mo.ui.text_area = mo.ui.text_area(
        value=get_text_content(),
        label="Enter text",
        placeholder=current_placeholder,
        full_width=True,
        on_change=lambda v: set_text_content(v),
    )
    return current_placeholder, text_input


@app.cell
def _(current_placeholder: str, mo, set_text_content: "Callable[[str], None]"):
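    # Clicking the button copies the language-appropriate placeholder into the
    # shared text state, which repopulates the text area above.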
    def apply_placeholder() -> None:
        set_text_content(current_placeholder)

    apply_placeholder_button: mo.ui.button = mo.ui.button(
        label="Use Placeholder Text", on_click=lambda _: apply_placeholder()
    )
    return (apply_placeholder_button,)


@app.cell
def _(
    apply_placeholder_button: "mo.ui.button",
    language_selector: "mo.ui.radio",
    mo,
    text_input: "mo.ui.text_area",
):
    mo.vstack(
        [
            text_input,
            mo.hstack([language_selector, apply_placeholder_button], justify="start"),
            mo.ui.button(label="Analyze"),
        ]
    )
    return


@app.cell
def _(
    ensure_nlp,
    get_text_content: "Callable[[], str]",
    language_selector: "mo.ui.radio",
    mo,
    spacy,
):
    # Analyze text using spaCy based on selected language
    mo.md("Note: Loading spaCy pipelines on first use may take a few seconds.").callout(
        kind="info"
    )

    current_text: str = get_text_content()
    nlp = ensure_nlp(language_selector.value)
    doc: spacy.tokens.Doc = nlp(current_text)
    model_name: str = nlp.meta["name"]

    tokenized_text: list[str] = [token.text for token in doc]
    token_count: int = len(tokenized_text)

    mo.md(
        f"**Tokenized Text using spaCy {'en_' if language_selector.value == 'English' else 'ja_'}{model_name}:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}"
    )
    return current_text, doc


@app.cell
def _(doc: "spacy.tokens.Doc", language_selector: "mo.ui.radio", mo, pl):
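    # Build a per-token table of spaCy annotations (lemma, POS, tag, morphology,
    # OOV flag, position, and sentence index) for interactive display.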
    token_data: pl.DataFrame = pl.DataFrame(
        {
            "Token": [token.text for token in doc],
            "Lemma": [token.lemma_ for token in doc],
            "POS": [token.pos_ for token in doc],
            "Tag": [token.tag_ for token in doc],
            "Morph": [str(token.morph) for token in doc],
            "OOV": [
                token.is_oov if language_selector.value == "English" else None
                for token in doc
            ],
            "Token Position": list(range(len(doc))),
            "Sentence Number": (
                [i for i, sent in enumerate(doc.sents) for _ in sent]
                if doc.has_annotation("SENT_START")
                else [0] * len(doc)
            ),
        }
    )

    mo.ui.dataframe(token_data, page_size=50)
    return (token_data,)


@app.cell
def _(mo):
    column_selector: mo.ui.dropdown = mo.ui.dropdown(
        options=["POS", "Tag", "Lemma", "Token", "Morph", "OOV"],
        value="POS",
        label="Select column to visualize",
    )
    column_selector
    return (column_selector,)


@app.cell
def _(
    alt,
    column_selector: "mo.ui.dropdown",
    mo,
    pl,
    token_data: "pl.DataFrame",
):
    mo.stop(token_data.is_empty(), "Please set input text.")

    selected_column: str = column_selector.value

    # Calculate value counts for the selected column
    counts_df: pl.DataFrame = (
        token_data[selected_column]
        .value_counts()
        .sort(by=["count", selected_column], descending=[True, False])
    )
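
    # Horizontal bar chart of the value counts; sort=None keeps the
    # frequency-sorted row order from counts_df.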
    chart: alt.Chart = (
        alt.Chart(counts_df)
        .mark_bar()
        .encode(
            x=alt.X("count", title="Frequency"),
            y=alt.Y(selected_column, title=selected_column, sort=None),
            tooltip=[selected_column, "count"],
        )
        .properties(title=f"{selected_column} Distribution")
        .interactive()
    )

    mo.ui.altair_chart(chart)
    return


@app.cell
def _(llm_model_choices: list[str], mo):
    llm_tokenizer_selector: mo.ui.dropdown = mo.ui.dropdown(
        options=llm_model_choices,
        value=llm_model_choices[0],
        label="Select LLM Tokenizer Model",
    )
    llm_tokenizer_selector
    return (llm_tokenizer_selector,)


@app.cell
def _(mo):
    add_special_tokens_switch = mo.ui.switch(
        label="Add special tokens (encode)", value=False
    )
    skip_special_tokens_on_decode_switch = mo.ui.switch(
        label="Skip special tokens in decoded view", value=False
    )
    representation_radio = mo.ui.radio(
        options=["Auto (recommended)", "Decoded strings", "Raw tokens"],
        value="Auto (recommended)",
        label="LLM token representation",
    )
    display_limit_slider = mo.ui.slider(
        100, 5000, value=1000, label="Display token limit"
    )
    color_by_radio = mo.ui.radio(
        options=["Token", "ID", "Category"],
        value="Token",
        label="Color by",
    )
    show_spaces_switch = mo.ui.switch(
        label="Show spaces as · (decoded view)", value=False
    )

    mo.vstack(
        [
            mo.hstack(
                [
                    add_special_tokens_switch,
                    skip_special_tokens_on_decode_switch,
                ]
            ),
            mo.hstack([representation_radio, display_limit_slider]),
            mo.hstack([color_by_radio, show_spaces_switch]),
            mo.accordion(
                {
                    "Tip": mo.md(
                        "Many GPT-style tokenizers are byte-level; their raw vocab strings can look garbled. Use Decoded strings or Auto."
                    ).callout(kind="info")
                }
            ),
        ]
    )
    return (
        add_special_tokens_switch,
        color_by_radio,
        display_limit_slider,
        representation_radio,
        show_spaces_switch,
        skip_special_tokens_on_decode_switch,
    )


@app.cell
def _(mo):
    get_tok_cache, set_tok_cache = mo.state({})
    return get_tok_cache, set_tok_cache


@app.cell
def _(
    AutoTokenizer,
    PreTrainedTokenizerBase,
    get_tok_cache,
    llm_tokenizer_selector: "mo.ui.dropdown",
    mo,
    set_tok_cache,
):
    # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
    selected_model_name: str = llm_tokenizer_selector.value
    key = selected_model_name
    cache = get_tok_cache()
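
    # Reuse a previously loaded tokenizer for this model if one is cached;
    # otherwise load it from the Hub and store it in the cache state.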
    if key in cache:
        tokenizer = cache[key]
    else:
        tokenizer: PreTrainedTokenizerBase = None
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                selected_model_name,
                use_fast=True,
                trust_remote_code=True,
            )
        except Exception as e:
            mo.md(f"Failed to load tokenizer '{selected_model_name}': {e}").callout(
                kind="error"
            )
            tokenizer = None
        if tokenizer is not None:
            set_tok_cache({**cache, key: tokenizer})
    return (tokenizer,)


@app.cell
def _(Union, math):
    TokenStatsDict = dict[str, dict[str, Union[int, float]]]

    def get_token_stats(tokens: list[str], original_text: str) -> TokenStatsDict:
        """Calculate enhanced statistics about the tokens."""
        if not tokens:
            # Return default structure matching TokenStatsDict
            return {
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0.0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0.0,
                },
                "length_stats": {
                    "avg_length": 0.0,
                    "std_dev": 0.0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0.0,
                },
            }

        total_tokens: int = len(tokens)
        unique_tokens: int = len(set(tokens))
        compression_ratio: float = (
            len(original_text) / total_tokens if total_tokens > 0 else 0.0
        )
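
        # Marker heuristics: "Ġ" (byte-level BPE) and "▁" (SentencePiece) denote
        # a leading space, while "Ċ" and "<0x0A>" are newline encodings.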
        space_tokens: int = sum(1 for t in tokens if t.startswith(("Ġ", "▁", " ")))
        newline_tokens: int = sum(
            1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
        )
        special_tokens: int = sum(
            1
            for t in tokens
            if (t.startswith("<") and t.endswith(">"))
            or (t.startswith("[") and t.endswith("]"))
        )
        punctuation_tokens: int = sum(
            1
            for t in tokens
            if len(t) == 1 and not t.isalnum() and t not in [" ", "\n", "Ġ", "Ċ"]
        )

        lengths: list[int] = [len(t) for t in tokens]
        if not lengths:  # Should not happen if tokens is not empty, but safe check
            return {  # Return default structure matching TokenStatsDict
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0.0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0.0,
                },
                "length_stats": {
                    "avg_length": 0.0,
                    "std_dev": 0.0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0.0,
                },
            }

        mean_length: float = sum(lengths) / len(lengths)
        variance: float = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
        std_dev: float = math.sqrt(variance)

        sorted_lengths: list[int] = sorted(lengths)
        n = len(lengths)
        if n % 2 == 1:
            median_length = float(sorted_lengths[n // 2])
        else:
            median_length = (sorted_lengths[n // 2 - 1] + sorted_lengths[n // 2]) / 2

        return {
            "basic_stats": {
                "total_tokens": total_tokens,
                "unique_tokens": unique_tokens,
                "compression_ratio": round(compression_ratio, 2),
                "space_tokens": space_tokens,
                "newline_tokens": newline_tokens,
                "special_tokens": special_tokens,
                "punctuation_tokens": punctuation_tokens,
                "unique_percentage": round(unique_tokens / total_tokens * 100, 1)
                if total_tokens > 0
                else 0.0,
            },
            "length_stats": {
                "avg_length": round(mean_length, 2),
                "std_dev": round(std_dev, 2),
                "min_length": min(lengths),
                "max_length": max(lengths),
                "median_length": median_length,
            },
        }

    return (get_token_stats,)


@app.cell
def _(hashlib):
    def get_varied_color(token: str) -> dict[str, str]:
        """Generate vibrant colors with HSL for better visual distinction."""
        token_hash: str = hashlib.md5(token.encode()).hexdigest()
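        # Slices of the MD5 hex digest deterministically choose hue, saturation,
        # and lightness, so the same token string always maps to the same color.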
        hue: int = int(token_hash[:3], 16) % 360
        saturation: int = 70 + (int(token_hash[3:5], 16) % 20)
        lightness: int = 80 + (int(token_hash[5:7], 16) % 10)
        text_lightness: int = 20
        return {
            "background": f"hsl({hue}, {saturation}%, {lightness}%)",
            "text": f"hsl({hue}, {saturation}%, {text_lightness}%)",
        }

    return (get_varied_color,)


@app.function
def fix_token(
    token: str, re
) -> str:  # re module type is complex, leave as Any implicitly or import types.ModuleType
    """Fix token for display, handling byte fallbacks and spaces."""
    # Check for byte fallback pattern <0xHH> using a full match
    byte_match = re.fullmatch(r"<0x([0-9A-Fa-f]{2})>", token)
    if byte_match:
        hex_value = byte_match.group(1).upper()
        # Return a clear representation indicating it's a byte
        return f"<0x{hex_value}>"

    # Replace SentencePiece space marker U+2581 ('▁') and BPE space marker 'Ġ' with a middle dot
    token = token.replace("▁", "·").replace("Ġ", "·")

    # Replace newline markers for display
    token = token.replace("Ċ", "↵\n")

    # Handle byte representation of newline AFTER the general byte check.
    # This specific check might become redundant if <0x0A> is caught by the
    # byte_match above; keep it for now as a fallback.
    token = token.replace("<0x0A>", "↵\n")

    return token


@app.cell
def _(Any, PreTrainedTokenizerBase):
    def get_tokenizer_info(
        tokenizer: PreTrainedTokenizerBase,
    ) -> dict[str, Any]:
        """
        Extract useful information from a tokenizer.
        Returns a dictionary with tokenizer details.
        """
        info: dict[str, Any] = {}
        try:
            if hasattr(tokenizer, "vocab_size"):
                info["vocab_size"] = tokenizer.vocab_size
            elif hasattr(tokenizer, "get_vocab"):
                info["vocab_size"] = len(tokenizer.get_vocab())

            if (
                hasattr(tokenizer, "model_max_length")
                and isinstance(tokenizer.model_max_length, int)
                and tokenizer.model_max_length < 1000000
            ):
                info["model_max_length"] = tokenizer.model_max_length
            else:
                info["model_max_length"] = "Not specified or very large"

            info["tokenizer_type"] = tokenizer.__class__.__name__

            special_tokens: dict[str, str] = {}
            special_token_attributes: list[str] = [
                "pad_token",
                "eos_token",
                "bos_token",
                "sep_token",
                "cls_token",
                "unk_token",
                "mask_token",
            ]

            # Keep track of processed tokens to avoid duplicates
            processed_tokens: set[str] = set()

            # Prefer all_special_tokens if available
            if hasattr(tokenizer, "all_special_tokens"):
                for token_value in tokenizer.all_special_tokens:
                    if (
                        not token_value
                        or not str(token_value).strip()
                        or str(token_value) in processed_tokens
                    ):
                        continue

                    token_name = "special_token"  # Default name
                    # Find the attribute name corresponding to the token value
                    for attr_name in special_token_attributes:
                        if (
                            hasattr(tokenizer, attr_name)
                            and getattr(tokenizer, attr_name) == token_value
                        ):
                            token_name = attr_name
                            break

                    token_str = str(token_value)
                    token_id = (
                        tokenizer.convert_tokens_to_ids(token_str)
                        if hasattr(tokenizer, "convert_tokens_to_ids")
                        else None
                    )
                    special_tokens[token_name] = token_str + (
                        f" (id {token_id})" if isinstance(token_id, int) else ""
                    )
                    processed_tokens.add(str(token_value))

            # Fallback/augment with individual attributes if not covered by all_special_tokens
            for token_name in special_token_attributes:
                if hasattr(tokenizer, token_name):
                    token_value = getattr(tokenizer, token_name)
                    if (
                        token_value
                        and str(token_value).strip()
                        and str(token_value) not in processed_tokens
                    ):
                        token_str = str(token_value)
                        token_id = (
                            tokenizer.convert_tokens_to_ids(token_str)
                            if hasattr(tokenizer, "convert_tokens_to_ids")
                            else None
                        )
                        special_tokens[token_name] = token_str + (
                            f" (id {token_id})" if isinstance(token_id, int) else ""
                        )
                        processed_tokens.add(str(token_value))

            info["special_tokens"] = special_tokens if special_tokens else "None found"
        except Exception as e:
            info["error"] = f"Error extracting tokenizer info: {str(e)}"
        return info

    return (get_tokenizer_info,)


@app.cell
def _(mo):
    show_ids_switch: mo.ui.switch = mo.ui.switch(
        label="Show token IDs instead of text", value=False
    )
    return (show_ids_switch,)


@app.cell
def _(
    Any,
    Optional,
    Union,
    add_special_tokens_switch,
    color_by_radio,
    current_text: str,
    display_limit_slider,
    get_token_stats,
    get_tokenizer_info,
    get_varied_color,
    llm_tokenizer_selector: "mo.ui.dropdown",
    mo,
    re,
    representation_radio,
    show_ids_switch: "mo.ui.switch",
    show_spaces_switch,
    skip_special_tokens_on_decode_switch,
    tokenizer,
):
    # Define the Unicode replacement character
    REPLACEMENT_CHARACTER = "\ufffd"

    mo.stop(tokenizer is None, "Please select a valid tokenizer model.")

    tokenizer_info: dict[str, Any] = get_tokenizer_info(tokenizer)

    # 1. Encode text to get token IDs first.
    token_ids: list[int] = tokenizer.encode(
        current_text, add_special_tokens=add_special_tokens_switch.value
    )

    # 2. Convert IDs to raw tokens and decode each individually
    raw_tokens: list[str] = tokenizer.convert_ids_to_tokens(token_ids)
    decoded_per_id: list[str] = [
        tokenizer.decode(
            [tid],
            skip_special_tokens=skip_special_tokens_on_decode_switch.value,
            clean_up_tokenization_spaces=False,
        )
        for tid in token_ids
    ]

    # 3. Get offset mapping for span information
    enc = tokenizer(
        current_text,
        add_special_tokens=add_special_tokens_switch.value,
        return_offsets_mapping=True,
    )
    offsets = (
        enc.get("offset_mapping")
        if isinstance(enc, dict)
        else getattr(enc, "offset_mapping", None)
    )
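
    # Offset mappings require a fast (Rust-backed) tokenizer; guard against a
    # missing or misaligned mapping before attaching character spans.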
    if offsets and len(offsets) == len(token_ids):
        records: list[dict[str, Union[int, str]]] = []
        for tid, raw, dec, (s, e) in zip(
            token_ids, raw_tokens, decoded_per_id, offsets
        ):
            substr = current_text[s:e] if (s is not None and e is not None) else ""
            records.append(
                {
                    "id": tid,
                    "raw": raw,
                    "dec": dec,
                    "start": s,
                    "end": e,
                    "substr": substr,
                }
            )
    else:
        records = [
            {
                "id": tid,
                "raw": raw,
                "dec": dec,
                "start": None,
                "end": None,
                "substr": "",
            }
            for tid, raw, dec in zip(token_ids, raw_tokens, decoded_per_id)
        ]

    def _is_byte_level(tok) -> bool:
        try:
            if getattr(tok, "is_fast", False):
                pre = tok.backend_tokenizer.pre_tokenizer
                types = [pre.__class__.__name__]
                if hasattr(pre, "pre_tokenizers"):
                    types = [p.__class__.__name__ for p in pre.pre_tokenizers]
                return "ByteLevel" in types
        except Exception:
            pass
        return False
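
    # Choose how to render tokens: byte-level vocabularies (GPT-2 style "Ġ"/"Ċ"
    # markers) are easier to read when each ID is decoded back to a string.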
    if representation_radio.value == "Auto (recommended)":
        use_decoded: bool = _is_byte_level(tokenizer) or any(
            ("Ġ" in r["raw"] or "Ċ" in r["raw"]) for r in records[:256]
        )
    elif representation_radio.value == "Decoded strings":
        use_decoded = True
    else:
        use_decoded = False

    if use_decoded:
        source_records = [r for r in records if r["dec"] != ""]
        stats_tokens_source: list[str] = [r["dec"] for r in records if r["dec"] != ""]
    else:
        source_records = records
        stats_tokens_source = [r["raw"] for r in records]

    total_token_count: int = len(source_records)
    display_limit: int = display_limit_slider.value
    display_records = source_records[:display_limit]
    display_limit_reached: bool = len(source_records) > display_limit

    # Generate data for visualization
    TokenVisData = dict[str, Union[str, int, bool, dict[str, str]]]
    llm_token_data: list[TokenVisData] = []
    for idx, r in enumerate(display_records):
        token_str: str = r["dec"] if use_decoded else r["raw"]

        # Apply space visualization in decoded view
        if use_decoded and show_spaces_switch.value:
            token_str = token_str.replace(" ", "·")

        is_invalid_utf8: bool = REPLACEMENT_CHARACTER in token_str
        fixed_token_display: str = (
            f"<0x{r['id']:X}>" if is_invalid_utf8 else fix_token(token_str, re)
        )

        # Choose color seed based on color_by_radio
        if color_by_radio.value == "ID":
            seed = f"id_{r['id']}"
        elif color_by_radio.value == "Category":
            probe = r["dec"] if use_decoded else r["raw"]
            if probe.startswith(("Ġ", "▁", " ")):
                cat = "space"
            elif ("\n" in probe) or ("Ċ" in probe):
                cat = "newline"
            elif (probe.startswith("<") and probe.endswith(">")) or (
                probe.startswith("[") and probe.endswith("]")
            ):
                cat = "special"
            else:
                cat = "text"
            seed = f"cat_{cat}"
        else:
            seed = token_str

        colors: dict[str, str] = get_varied_color(
            seed if not is_invalid_utf8 else f"invalid_{r['id']}"
        )

        llm_token_data.append(
            {
                "original": (
                    f"Vocab: {r['raw']}\n"
                    f"Decoded: {r['dec'] if r['dec'] != '' else '∅'}\n"
                    f"Span: [{r['start']}, {r['end']}]\n"
                    f"Text: {r['substr']}"
                ),
                "display": fixed_token_display,
                "colors": colors,
                "is_newline": "↵" in fixed_token_display,
                "token_id": r["id"],
                "token_index": idx,
                "is_invalid": is_invalid_utf8,
            }
        )

    token_stats: dict[str, dict[str, Union[int, float]]] = get_token_stats(
        stats_tokens_source,
        current_text,
    )
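
    # Render each token as a colored <span>; the hover tooltip shows the raw
    # vocab string, the decoded form, the character span, and the token ID.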
    html_parts: list[str] = [
        (
            lambda item: (
                style := f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
                # Add specific style for invalid tokens
                + (
                    " border: 1px solid red;"
                    if item.get("is_invalid")
                    else (
                        " border: 1px solid orange;"
                        if item["display"].startswith("<0x")
                        else ""
                    )
                ),
                # Modify title based on validity
                title := (
                    f"Original: {item['original']}\nID: {item['token_id']}"
                    + ("\n(Invalid UTF-8)" if item.get("is_invalid") else "")
                    + ("\n(Byte Token)" if item["display"].startswith("<0x") else "")
                ),
                aria_label := (
                    ("Token ID " + str(item["token_id"]) + ": " + item["original"])
                    .replace("\n", " ")
                    .replace('"', "&quot;")
                ),
                display_content := str(item["token_id"])
                if show_ids_switch.value
                else item["display"],
                f'<span style="{style}" title="{title}" aria-label="{aria_label}">{display_content}</span>',
            )[-1]  # Get the last element (the formatted string) from the lambda's tuple
        )(item)
        for item in llm_token_data
    ]

    token_viz_html: mo.Html = mo.Html(
        f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>'
    )

    # Optional: Add a warning if the display limit was reached
    limit_warning: Optional[mo.md] = None  # Use Optional type
    if display_limit_reached:
        limit_warning = mo.md(f"""**Warning:** Displaying only the first {display_limit:,} tokens out of {total_token_count:,}.
        Statistics are calculated on the full text.""").callout(kind="warn")

    representation_hint: Optional[mo.md] = None
    if representation_radio.value == "Raw tokens":
        try:
            if _is_byte_level(tokenizer):
                representation_hint = mo.md(
                    "This tokenizer uses byte-level BPE; raw vocab strings are not human-readable. Prefer Decoded strings or Auto."
                ).callout(kind="info")
        except Exception:
            pass

    # Use dict access safely with .get() for stats
    basic_stats: dict[str, Union[int, float]] = token_stats.get("basic_stats", {})
    length_stats: dict[str, Union[int, float]] = token_stats.get("length_stats", {})

    # Use list comprehensions for markdown generation (functional style)
    basic_stats_md: str = "**Basic Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in basic_stats.items()
    )
    length_stats_md: str = "**Length (Character) Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in length_stats.items()
    )

    # Build tokenizer info markdown parts
    tokenizer_info_md_parts: list[str] = [
        f"**Tokenizer Type:** `{tokenizer_info.get('tokenizer_type', 'N/A')}`"
    ]
    if vocab_size := tokenizer_info.get("vocab_size"):
        tokenizer_info_md_parts.append(f"**Vocab Size:** `{vocab_size:,}`")
    if max_len := tokenizer_info.get("model_max_length"):
        tokenizer_info_md_parts.append(f"**Model Max Length:** `{max_len}`")

    special_tokens_info = tokenizer_info.get("special_tokens")
    if isinstance(special_tokens_info, dict) and special_tokens_info:
        tokenizer_info_md_parts.append("**Special Tokens:**")
        tokenizer_info_md_parts.extend(
            f"  - `{name}`: `{str(val)}`" for name, val in special_tokens_info.items()
        )
    elif isinstance(special_tokens_info, str):  # Handle "None found" case
        tokenizer_info_md_parts.append(f"**Special Tokens:** `{special_tokens_info}`")

    if error_info := tokenizer_info.get("error"):
        tokenizer_info_md_parts.append(f"**Info Error:** `{error_info}`")

    tokenizer_info_md: str = "\n\n".join(tokenizer_info_md_parts)
    tokenizer_info_accordion = mo.accordion(
        {"Tokenizer Info": mo.md(tokenizer_info_md)}
    )

    mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}

{show_ids_switch}
{tokenizer_info_accordion}

## Tokenizer output

{limit_warning if limit_warning else ""}
{representation_hint if representation_hint else ""}
{mo.as_html(token_viz_html)}

## Token Statistics

(Calculated on full text if truncated above)

{basic_stats_md}

{length_stats_md}
""")
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()