# Spaces: bor / counting_words (Running)
#
# File size: 31,197 Bytes

# /// script
# dependencies = [
#     "marimo>=0.13.0",
#     "polars>=1.29.0",
#     "altair>=5.5.0",
#     "spacy==3.8.7",
#     "en-core-web-md",
#     "ja-core-news-md",
#     "transformers>=4.57.1",
# ]
#
# [tool.uv.sources]
# en-core-web-md = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl" }
# ja-core-news-md = { url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.8.0/ja_core_news_md-3.8.0-py3-none-any.whl" }
# [tool.marimo.runtime]
# auto_instantiate = false
# ///

import marimo

# Version string recorded alongside the notebook (presumably the marimo release
# that last wrote this file - confirm before relying on it).
__generated_with = "0.17.2"
# Application object; each function below is registered on it through the
# @app.cell / @app.function decorators.
app = marimo.App(width="medium")


@app.cell
def _():
    # Shared imports for the notebook; every name imported here is exported to
    # the other cells through the return tuple at the bottom of this cell.
    import hashlib
    import math
    import re
    from typing import Any, Callable, Optional, Union

    import altair as alt
    import marimo as mo
    import polars as pl
    import spacy
    from transformers import (
        PreTrainedTokenizerBase,
        AutoTokenizer,
    )

    # Model identifiers offered in the LLM tokenizer dropdown; the first entry
    # is used as that dropdown's default value.
    llm_model_choices: list[str] = [
        "deepseek-ai/DeepSeek-OCR",
        "zai-org/GLM-4.6",
        "openai/gpt-oss-20b",
        "google/gemma-3-27b-it",
        "ibm-granite/granite-3.3-8b-instruct",
        "deep-analysis-research/Flux-Japanese-Qwen2.5-32B-Instruct-V1.0",
        "google-bert/bert-large-uncased",
    ]
    return (
        Any,
        AutoTokenizer,
        Callable,
        Optional,
        PreTrainedTokenizerBase,
        Union,
        alt,
        hashlib,
        llm_model_choices,
        math,
        mo,
        pl,
        re,
        spacy,
    )


@app.cell
def _(mo, spacy):
    # One state slot per language holds the loaded pipeline (None until first use).
    get_nlp_en, set_nlp_en = mo.state(None)
    get_nlp_ja, set_nlp_ja = mo.state(None)

    def ensure_nlp(language: str) -> spacy.language.Language:
        """Return the spaCy pipeline for ``language``, loading it on first request.

        "English" selects ``en_core_web_md``; any other value selects
        ``ja_core_news_md``. A loaded pipeline is stored in the matching state
        slot and reused on later calls.
        """
        if language == "English":
            read, write, package = get_nlp_en, set_nlp_en, "en_core_web_md"
        else:
            read, write, package = get_nlp_ja, set_nlp_ja, "ja_core_news_md"

        if read() is None:
            write(spacy.load(package))
        return read()
    return (ensure_nlp,)


@app.cell
def _(mo):
    mo.md("# Tokenization for English and Japanese")
    return


@app.cell
def _(Callable, mo):
    # Single shared state slot for the text under analysis, starting empty.
    # mo.state returns a (getter, setter) pair; each half is bound to its own
    # typed name so other cells can depend on them individually.
    _text_state = mo.state("")
    get_text_content: Callable[[], str] = _text_state[0]
    set_text_content: Callable[[str], None] = _text_state[1]
    return get_text_content, set_text_content


@app.cell
def _(mo):
    # Sample texts, one per supported language. They serve both as the text
    # area's placeholder and as the content inserted by the
    # "Use Placeholder Text" button. Leading/trailing whitespace is stripped.
    en_placeholder = """
    Mrs. Ferrars died on the night of the 16th⁠–⁠17th September⁠—a Thursday. I was sent for at eight o’clock on the morning of Friday the 17th. There was nothing to be done. She had been dead some hours.
    """.strip()
    ja_placeholder = """
    吾輩は猫である。名前はまだ無い。
    　どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。
    """.strip()

    # Radio button choosing the analysis language; "English" is preselected.
    language_selector: mo.ui.radio = mo.ui.radio(
        options=["English", "Japanese"], value="English", label="Language"
    )
    return en_placeholder, ja_placeholder, language_selector


@app.cell
def _(
    en_placeholder,
    get_text_content: "Callable[[], str]",
    ja_placeholder,
    language_selector: "mo.ui.radio",
    mo,
    set_text_content: "Callable[[str], None]",
):
    # The placeholder follows the language radio: the English sample for
    # "English", the Japanese sample for any other value.
    if language_selector.value == "English":
        current_placeholder: str = en_placeholder
    else:
        current_placeholder = ja_placeholder

    # Text area seeded from the shared text state; every edit is written back
    # to that state through its setter.
    text_input: mo.ui.text_area = mo.ui.text_area(
        placeholder=current_placeholder,
        value=get_text_content(),
        label="Enter text",
        on_change=set_text_content,
        full_width=True,
    )
    return current_placeholder, text_input


@app.cell
def _(current_placeholder: str, mo, set_text_content: "Callable[[str], None]"):
    def apply_placeholder() -> None:
        """Overwrite the shared text state with the current placeholder text."""
        set_text_content(current_placeholder)

    def _handle_click(_value) -> None:
        # The button passes its current value to the callback; it is ignored
        # and nothing is returned, exactly like the one-line lambda form.
        apply_placeholder()

    apply_placeholder_button: mo.ui.button = mo.ui.button(
        on_click=_handle_click, label="Use Placeholder Text"
    )
    return (apply_placeholder_button,)


@app.cell
def _(
    apply_placeholder_button: "mo.ui.button",
    language_selector: "mo.ui.radio",
    mo,
    text_input: "mo.ui.text_area",
):
    # Input panel, top to bottom: the text area, a left-aligned row with the
    # language radio and the placeholder button, and an "Analyze" button.
    # The "Analyze" button is created inline without a name or callback.
    _controls_row = mo.hstack(
        [language_selector, apply_placeholder_button], justify="start"
    )
    _analyze_button = mo.ui.button(label="Analyze")

    mo.vstack([text_input, _controls_row, _analyze_button])
    return


@app.cell
def _(
    ensure_nlp,
    get_text_content: "Callable[[], str]",
    language_selector: "mo.ui.radio",
    mo,
    spacy,
):
    # Analyze the current text with the spaCy pipeline of the selected language.
    current_text: str = get_text_content()
    nlp = ensure_nlp(language_selector.value)
    doc: spacy.tokens.Doc = nlp(current_text)
    model_name: str = nlp.meta["name"]

    tokenized_text: list[str] = [token.text for token in doc]
    token_count: int = len(tokenized_text)

    # Only the final expression of a cell is rendered. The loading note used to
    # be a bare statement at the top of the cell and was therefore never shown,
    # so it is stacked above the result inside the final expression instead.
    mo.vstack(
        [
            mo.md(
                "Note: Loading spaCy pipelines on first use may take a few seconds."
            ).callout(kind="info"),
            mo.md(
                f"**Tokenized Text using spaCy {'en_' if language_selector.value == 'English' else 'ja_'}{model_name}:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}"
            ),
        ]
    )
    return current_text, doc


@app.cell
def _(doc: "spacy.tokens.Doc", language_selector: "mo.ui.radio", mo, pl):
    # Out-of-vocabulary flags are reported for English only; for any other
    # language the column is filled with nulls.
    _report_oov = language_selector.value == "English"

    # Sentence index of every token. When the document carries no sentence
    # boundary annotation, all tokens are assigned to sentence 0.
    if doc.has_annotation("SENT_START"):
        _sentence_numbers = [
            sent_idx for sent_idx, sent in enumerate(doc.sents) for _ in sent
        ]
    else:
        _sentence_numbers = [0] * len(doc)

    # One row per spaCy token.
    token_data: pl.DataFrame = pl.DataFrame(
        {
            "Token": [tok.text for tok in doc],
            "Lemma": [tok.lemma_ for tok in doc],
            "POS": [tok.pos_ for tok in doc],
            "Tag": [tok.tag_ for tok in doc],
            "Morph": [str(tok.morph) for tok in doc],
            "OOV": [tok.is_oov if _report_oov else None for tok in doc],
            "Token Position": list(range(len(doc))),
            "Sentence Number": _sentence_numbers,
        }
    )

    mo.ui.dataframe(token_data, page_size=50)
    return (token_data,)


@app.cell
def _(mo):
    # Token-table columns that can be charted; "POS" is preselected.
    _chartable_columns = ["POS", "Tag", "Lemma", "Token", "Morph", "OOV"]
    column_selector: mo.ui.dropdown = mo.ui.dropdown(
        label="Select column to visualize",
        options=_chartable_columns,
        value="POS",
    )

    column_selector
    return (column_selector,)


@app.cell
def _(
    alt,
    column_selector: "mo.ui.dropdown",
    mo,
    pl,
    token_data: "pl.DataFrame",
):
    # Nothing to chart until there is at least one token.
    mo.stop(token_data.is_empty(), "Please set input text.")

    selected_column: str = column_selector.value

    # Frequency of each distinct value, most frequent first; ties are ordered
    # by the value itself, ascending.
    _value_counts = token_data[selected_column].value_counts()
    counts_df: pl.DataFrame = _value_counts.sort(
        by=["count", selected_column], descending=[True, False]
    )

    # Horizontal bars; sort=None keeps the row order of counts_df on the y axis.
    _x_axis = alt.X("count", title="Frequency")
    _y_axis = alt.Y(selected_column, title=selected_column, sort=None)
    _bars = (
        alt.Chart(counts_df)
        .mark_bar()
        .encode(x=_x_axis, y=_y_axis, tooltip=[selected_column, "count"])
    )
    chart: alt.Chart = _bars.properties(
        title=f"{selected_column} Distribution"
    ).interactive()

    mo.ui.altair_chart(chart)
    return


@app.cell
def _(llm_model_choices: list[str], mo):
    # Dropdown over the configured model list; the first entry is the default.
    _default_model = llm_model_choices[0]
    llm_tokenizer_selector: mo.ui.dropdown = mo.ui.dropdown(
        label="Select LLM Tokenizer Model",
        options=llm_model_choices,
        value=_default_model,
    )

    llm_tokenizer_selector
    return (llm_tokenizer_selector,)


@app.cell
def _(mo):
    # Switches for the LLM tokenizer view; all of them start switched off.
    add_special_tokens_switch = mo.ui.switch(
        value=False, label="Add special tokens (encode)"
    )
    skip_special_tokens_on_decode_switch = mo.ui.switch(
        value=False, label="Skip special tokens in decoded view"
    )
    show_spaces_switch = mo.ui.switch(
        value=False, label="Show spaces as · (decoded view)"
    )

    # How each token is shown and what its colour is derived from.
    representation_radio = mo.ui.radio(
        label="LLM token representation",
        options=["Auto (recommended)", "Decoded strings", "Raw tokens"],
        value="Auto (recommended)",
    )
    color_by_radio = mo.ui.radio(
        label="Color by",
        options=["Token", "ID", "Category"],
        value="Token",
    )

    # Upper bound on the number of tokens rendered (100 to 5000, default 1000).
    display_limit_slider = mo.ui.slider(
        start=100, stop=5000, value=1000, label="Display token limit"
    )

    _byte_level_tip = mo.md(
        "Many GPT-style tokenizers are byte-level; their raw vocab strings can look garbled. Use Decoded strings or Auto."
    ).callout(kind="info")

    # Three rows of controls followed by a collapsible tip.
    mo.vstack(
        [
            mo.hstack(
                [add_special_tokens_switch, skip_special_tokens_on_decode_switch]
            ),
            mo.hstack([representation_radio, display_limit_slider]),
            mo.hstack([color_by_radio, show_spaces_switch]),
            mo.accordion({"Tip": _byte_level_tip}),
        ]
    )
    return (
        add_special_tokens_switch,
        color_by_radio,
        display_limit_slider,
        representation_radio,
        show_spaces_switch,
        skip_special_tokens_on_decode_switch,
    )


@app.cell
def _(mo):
    # State slot holding a dict that starts empty; the tokenizer-loading cell
    # uses it as a cache keyed by the selected model name.
    get_tok_cache, set_tok_cache = mo.state({})
    return get_tok_cache, set_tok_cache


@app.cell
def _(
    AutoTokenizer,
    PreTrainedTokenizerBase,
    get_tok_cache,
    llm_tokenizer_selector: "mo.ui.dropdown",
    mo,
    set_tok_cache,
):
    # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
    # Resolve the tokenizer for the selected model, reusing a cached instance
    # when one exists. `tokenizer` ends up as None when loading fails.
    selected_model_name: str = llm_tokenizer_selector.value
    key = selected_model_name
    cache = get_tok_cache()

    # Only successfully loaded tokenizers are ever stored, so a missing key
    # and a None result mean the same thing: load it now.
    tokenizer: PreTrainedTokenizerBase = cache.get(key)
    if tokenizer is None:
        try:
            # NOTE(security): trust_remote_code=True allows the model
            # repository to execute its own Python code during loading. The
            # model names come from the fixed list in this notebook; review
            # this flag before accepting user-supplied model names.
            tokenizer = AutoTokenizer.from_pretrained(
                selected_model_name,
                use_fast=True,
                trust_remote_code=True,
            )
        except Exception as e:
            tokenizer = None
            # A bare mo.md(...) statement here is not rendered (only a cell's
            # last expression is), so the failure is written to the cell
            # output explicitly.
            mo.output.replace(
                mo.md(
                    f"Failed to load tokenizer '{selected_model_name}': {e}"
                ).callout(kind="error")
            )
        else:
            set_tok_cache({**cache, key: tokenizer})
    return (tokenizer,)


@app.cell
def _(Union, math):
    TokenStatsDict = dict[str, dict[str, Union[int, float]]]

    def get_token_stats(tokens: list[str], original_text: str) -> TokenStatsDict:
        """Summarise a token sequence.

        Args:
            tokens: Token strings (raw vocabulary entries or decoded pieces).
            original_text: The text the tokens were produced from; only its
                length is used, for the characters-per-token ratio.

        Returns:
            A dict with two sections:

            * ``basic_stats``: total and unique token counts, characters per
              token (``compression_ratio``), counts of space-, newline-,
              special- and punctuation-like tokens, and the percentage of
              unique tokens.
            * ``length_stats``: mean, population standard deviation, minimum,
              maximum and median of the token lengths in characters.

            Every value is zero when ``tokens`` is empty.
        """
        if not tokens:
            return {
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0.0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0.0,
                },
                "length_stats": {
                    "avg_length": 0.0,
                    "std_dev": 0.0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0.0,
                },
            }

        # From here on there is at least one token, so no division below can
        # hit zero.
        total_tokens: int = len(tokens)
        unique_tokens: int = len(set(tokens))
        compression_ratio: float = len(original_text) / total_tokens

        # Tokens that begin with a space or with a marker standing in for one.
        space_tokens: int = sum(1 for t in tokens if t.startswith(("Ġ", "▁", " ")))
        newline_tokens: int = sum(
            1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
        )
        # Tokens written like <...> or [...].
        special_tokens: int = sum(
            1
            for t in tokens
            if (t.startswith("<") and t.endswith(">"))
            or (t.startswith("[") and t.endswith("]"))
        )
        # Single non-alphanumeric characters, excluding whitespace and the
        # markers that represent whitespace. "▁" belongs in this exclusion
        # set alongside "Ġ": it is treated as a space marker above and must
        # not be counted as punctuation.
        not_punctuation = {" ", "\n", "Ġ", "Ċ", "▁"}
        punctuation_tokens: int = sum(
            1
            for t in tokens
            if len(t) == 1 and not t.isalnum() and t not in not_punctuation
        )

        lengths: list[int] = [len(t) for t in tokens]
        mean_length: float = sum(lengths) / total_tokens
        variance: float = sum((x - mean_length) ** 2 for x in lengths) / total_tokens
        std_dev: float = math.sqrt(variance)

        sorted_lengths: list[int] = sorted(lengths)
        middle = total_tokens // 2
        if total_tokens % 2 == 1:
            median_length = float(sorted_lengths[middle])
        else:
            median_length = (sorted_lengths[middle - 1] + sorted_lengths[middle]) / 2

        return {
            "basic_stats": {
                "total_tokens": total_tokens,
                "unique_tokens": unique_tokens,
                "compression_ratio": round(compression_ratio, 2),
                "space_tokens": space_tokens,
                "newline_tokens": newline_tokens,
                "special_tokens": special_tokens,
                "punctuation_tokens": punctuation_tokens,
                "unique_percentage": round(unique_tokens / total_tokens * 100, 1),
            },
            "length_stats": {
                "avg_length": round(mean_length, 2),
                "std_dev": round(std_dev, 2),
                "min_length": sorted_lengths[0],
                "max_length": sorted_lengths[-1],
                "median_length": median_length,
            },
        }
    return (get_token_stats,)


@app.cell
def _(hashlib):
    def get_varied_color(token: str) -> dict[str, str]:
        """Map a string to a light background colour and a dark text colour.

        Both colours are HSL strings derived from the MD5 hex digest of
        ``token``, so equal strings always receive equal colours. Hue is in
        0-359, saturation in 70-89 %, background lightness in 80-89 %; the
        text colour shares hue and saturation with a fixed lightness of 20 %.
        """
        digest: str = hashlib.md5(token.encode()).hexdigest()

        def component(start: int, stop: int, modulus: int) -> int:
            # Interpret a slice of the hex digest as an integer, folded into range.
            return int(digest[start:stop], 16) % modulus

        hue = component(0, 3, 360)
        saturation = 70 + component(3, 5, 20)
        background_lightness = 80 + component(5, 7, 10)
        text_lightness = 20

        hsl_head = f"hsl({hue}, {saturation}%, "
        return {
            "background": f"{hsl_head}{background_lightness}%)",
            "text": f"{hsl_head}{text_lightness}%)",
        }
    return (get_varied_color,)


@app.function
def fix_token(token: str, re) -> str:
    """Prepare a token string for display.

    Args:
        token: The token text to clean up.
        re: The regular-expression module to use (passed in by the caller).

    Returns:
        * For a token that is exactly a byte fallback ``<0xHH>``: the same
          form with the hex digits upper-cased.
        * Otherwise the token with the space markers ``▁`` and ``Ġ`` replaced
          by a middle dot, and the newline markers ``Ċ`` and ``<0x0A>``
          replaced by ``↵`` followed by a line break.
    """
    byte_fallback = re.fullmatch(r"<0x([0-9A-Fa-f]{2})>", token)
    if byte_fallback is not None:
        return "<0x" + byte_fallback.group(1).upper() + ">"

    # Applied in order. The "<0x0A>" entry only matters when the marker is
    # embedded in a longer token; a token that is exactly "<0x0A>" has already
    # been returned by the byte-fallback branch above.
    display_substitutions = (
        ("▁", "·"),
        ("Ġ", "·"),
        ("Ċ", "↵\n"),
        ("<0x0A>", "↵\n"),
    )
    for marker, shown_as in display_substitutions:
        token = token.replace(marker, shown_as)
    return token


@app.cell
def _(Any, PreTrainedTokenizerBase):
    def get_tokenizer_info(
        tokenizer: PreTrainedTokenizerBase,
    ) -> dict[str, Any]:
        """Collect descriptive details about a tokenizer.

        Args:
            tokenizer: The tokenizer to inspect.

        Returns:
            A dict that may contain:

            * ``vocab_size``: ``tokenizer.vocab_size`` or, failing that, the
              size of ``get_vocab()``.
            * ``model_max_length``: the integer limit when it is below
              1,000,000, otherwise the text "Not specified or very large".
            * ``tokenizer_type``: the tokenizer's class name.
            * ``special_tokens``: a mapping from a label to the token text
              (with " (id N)" appended when an integer id resolves), or the
              text "None found". Tokens matching a well-known attribute are
              labelled with that attribute's name; all others are labelled
              ``special_token``, ``special_token_2``, ``special_token_3``, ...
            * ``error``: a message when any step raised; entries gathered
              before the failure are kept.
        """
        info: dict[str, Any] = {}
        special_token_attributes: tuple[str, ...] = (
            "pad_token",
            "eos_token",
            "bos_token",
            "sep_token",
            "cls_token",
            "unk_token",
            "mask_token",
        )

        def describe(token_str: str) -> str:
            # Token text, followed by its vocabulary id when one can be resolved.
            token_id = (
                tokenizer.convert_tokens_to_ids(token_str)
                if hasattr(tokenizer, "convert_tokens_to_ids")
                else None
            )
            return token_str + (
                f" (id {token_id})" if isinstance(token_id, int) else ""
            )

        try:
            if hasattr(tokenizer, "vocab_size"):
                info["vocab_size"] = tokenizer.vocab_size
            elif hasattr(tokenizer, "get_vocab"):
                info["vocab_size"] = len(tokenizer.get_vocab())

            if (
                hasattr(tokenizer, "model_max_length")
                and isinstance(tokenizer.model_max_length, int)
                and tokenizer.model_max_length < 1000000
            ):
                info["model_max_length"] = tokenizer.model_max_length
            else:
                info["model_max_length"] = "Not specified or very large"

            info["tokenizer_type"] = tokenizer.__class__.__name__

            special_tokens: dict[str, str] = {}
            # Token texts already recorded, so each token is listed only once.
            processed_tokens: set[str] = set()
            unnamed_count = 0

            # Prefer all_special_tokens if available
            for token_value in getattr(tokenizer, "all_special_tokens", []):
                token_str = str(token_value)
                if (
                    not token_value
                    or not token_str.strip()
                    or token_str in processed_tokens
                ):
                    continue

                # Label the token with the first well-known attribute holding it.
                token_name = next(
                    (
                        attr_name
                        for attr_name in special_token_attributes
                        if getattr(tokenizer, attr_name, None) == token_value
                    ),
                    None,
                )
                if token_name is None:
                    # Tokens without a named attribute each need their own
                    # key; sharing the single key "special_token" would let
                    # every such token overwrite the previous one.
                    unnamed_count += 1
                    token_name = (
                        "special_token"
                        if unnamed_count == 1
                        else f"special_token_{unnamed_count}"
                    )

                special_tokens[token_name] = describe(token_str)
                processed_tokens.add(token_str)

            # Fallback/Augment with individual attributes if not covered by all_special_tokens
            for token_name in special_token_attributes:
                if not hasattr(tokenizer, token_name):
                    continue
                token_value = getattr(tokenizer, token_name)
                token_str = str(token_value)
                if (
                    token_value
                    and token_str.strip()
                    and token_str not in processed_tokens
                ):
                    special_tokens[token_name] = describe(token_str)
                    processed_tokens.add(token_str)

            info["special_tokens"] = special_tokens if special_tokens else "None found"

        except Exception as e:
            # Best effort: report the failure alongside whatever was gathered.
            info["error"] = f"Error extracting tokenizer info: {str(e)}"

        return info
    return (get_tokenizer_info,)


@app.cell
def _(mo):
    # Toggle between showing each token's text and its numeric id; off by default.
    show_ids_switch: mo.ui.switch = mo.ui.switch(
        value=False,
        label="Show token IDs instead of text",
    )
    return (show_ids_switch,)


@app.cell
def _(
    Any,
    Optional,
    Union,
    add_special_tokens_switch,
    color_by_radio,
    current_text: str,
    display_limit_slider,
    get_token_stats,
    get_tokenizer_info,
    get_varied_color,
    llm_tokenizer_selector: "mo.ui.dropdown",
    mo,
    re,
    representation_radio,
    show_ids_switch: "mo.ui.switch",
    show_spaces_switch,
    skip_special_tokens_on_decode_switch,
    tokenizer,
):
    # Tokenize the current text with the selected LLM tokenizer and render the
    # coloured token view, the tokenizer details and the token statistics.

    # Define the Unicode replacement character
    REPLACEMENT_CHARACTER = "\ufffd"

    mo.stop(tokenizer is None, "Please select a valid tokenizer model.")

    tokenizer_info: dict[str, Any] = get_tokenizer_info(tokenizer)

    # 1. Encode text to get token IDs first.
    token_ids: list[int] = tokenizer.encode(
        current_text, add_special_tokens=add_special_tokens_switch.value
    )

    # 2. Convert IDs to raw tokens and decode each individually
    raw_tokens: list[str] = tokenizer.convert_ids_to_tokens(token_ids)
    decoded_per_id: list[str] = [
        tokenizer.decode(
            [tid],
            skip_special_tokens=skip_special_tokens_on_decode_switch.value,
            clean_up_tokenization_spaces=False,
        )
        for tid in token_ids
    ]

    # 3. Get offset mapping for span information.
    # Tokenizers without a fast backend raise NotImplementedError when asked
    # for offsets; in that case continue without span information, which the
    # record-building code below already supports.
    try:
        enc = tokenizer(
            current_text,
            add_special_tokens=add_special_tokens_switch.value,
            return_offsets_mapping=True,
        )
    except NotImplementedError:
        enc = None
    offsets = (
        enc.get("offset_mapping")
        if isinstance(enc, dict)
        else getattr(enc, "offset_mapping", None)
    )

    # One record per token: id, raw vocab string, decoded string and, when
    # offsets are available and line up with the ids, the source span.
    if offsets and len(offsets) == len(token_ids):
        records: list[dict[str, Union[int, str]]] = []
        for tid, raw, dec, (s, e) in zip(
            token_ids, raw_tokens, decoded_per_id, offsets
        ):
            substr = current_text[s:e] if (s is not None and e is not None) else ""
            records.append(
                {
                    "id": tid,
                    "raw": raw,
                    "dec": dec,
                    "start": s,
                    "end": e,
                    "substr": substr,
                }
            )
    else:
        records = [
            {
                "id": tid,
                "raw": raw,
                "dec": dec,
                "start": None,
                "end": None,
                "substr": "",
            }
            for tid, raw, dec in zip(token_ids, raw_tokens, decoded_per_id)
        ]

    def _is_byte_level(tok) -> bool:
        """Return True when a fast tokenizer's pre-tokenizer is (or contains) ByteLevel.

        Best effort: any failure while inspecting the backend counts as False.
        """
        try:
            if getattr(tok, "is_fast", False):
                pre = tok.backend_tokenizer.pre_tokenizer
                types = [pre.__class__.__name__]
                if hasattr(pre, "pre_tokenizers"):
                    types = [p.__class__.__name__ for p in pre.pre_tokenizers]
                return "ByteLevel" in types
        except Exception:
            return False
        return False

    # Decide whether to show decoded strings or raw vocabulary strings.
    if representation_radio.value == "Auto (recommended)":
        # Auto: decode when the tokenizer is byte-level or when byte-level
        # markers appear among the first 256 raw tokens.
        use_decoded: bool = _is_byte_level(tokenizer) or any(
            ("Ġ" in r["raw"] or "Ċ" in r["raw"]) for r in records[:256]
        )
    elif representation_radio.value == "Decoded strings":
        use_decoded = True
    else:
        use_decoded = False

    if use_decoded:
        # Tokens that decode to the empty string are left out of the decoded view.
        source_records = [r for r in records if r["dec"] != ""]
        stats_tokens_source: list[str] = [r["dec"] for r in source_records]
    else:
        source_records = records
        stats_tokens_source = [r["raw"] for r in records]

    total_token_count: int = len(source_records)
    display_limit: int = display_limit_slider.value
    display_records = source_records[:display_limit]
    display_limit_reached: bool = len(source_records) > display_limit

    # Generate data for visualization
    TokenVisData = dict[str, Union[str, int, bool, dict[str, str]]]
    llm_token_data: list[TokenVisData] = []

    for idx, r in enumerate(display_records):
        token_str: str = r["dec"] if use_decoded else r["raw"]

        # Apply space visualization in decoded view
        if use_decoded and show_spaces_switch.value:
            token_str = token_str.replace(" ", "·")

        # A replacement character means the token does not decode to valid
        # text on its own; show a hex form of its id instead.
        is_invalid_utf8: bool = REPLACEMENT_CHARACTER in token_str
        fixed_token_display: str = (
            f"<0x{r['id']:X}>" if is_invalid_utf8 else fix_token(token_str, re)
        )

        # Choose color seed based on color_by_radio
        if color_by_radio.value == "ID":
            seed = f"id_{r['id']}"
        elif color_by_radio.value == "Category":
            probe = r["dec"] if use_decoded else r["raw"]
            if probe.startswith(("Ġ", "▁", " ")):
                cat = "space"
            elif ("\n" in probe) or ("Ċ" in probe):
                cat = "newline"
            elif (probe.startswith("<") and probe.endswith(">")) or (
                probe.startswith("[") and probe.endswith("]")
            ):
                cat = "special"
            else:
                cat = "text"
            seed = f"cat_{cat}"
        else:
            seed = token_str

        colors: dict[str, str] = get_varied_color(
            seed if not is_invalid_utf8 else f"invalid_{r['id']}"
        )
        llm_token_data.append(
            {
                "original": (
                    f"Vocab: {r['raw']}\n"
                    f"Decoded: {r['dec'] if r['dec'] != '' else '∅'}\n"
                    f"Span: [{r['start']}, {r['end']}]\n"
                    f"Text: {r['substr']}"
                ),
                "display": fixed_token_display,
                "colors": colors,
                "is_newline": "↵" in fixed_token_display,
                "token_id": r["id"],
                "token_index": idx,
                "is_invalid": is_invalid_utf8,
            }
        )

    token_stats: dict[str, dict[str, Union[int, float]]] = get_token_stats(
        stats_tokens_source,
        current_text,
    )

    def _render_token_span(item) -> str:
        """Return the HTML ``<span>`` for one visualisation item.

        All token-derived text is HTML-escaped: tokens routinely contain
        ``<``, ``>``, ``&`` or quotes (for example ``<s>``), which would
        otherwise be parsed as markup or end an attribute value early.
        """
        import html

        is_byte_token = item["display"].startswith("<0x")

        style = (
            f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
        )
        # Red border for undecodable tokens, orange for byte tokens.
        if item.get("is_invalid"):
            style += " border: 1px solid red;"
        elif is_byte_token:
            style += " border: 1px solid orange;"

        title = (
            f"Original: {item['original']}\nID: {item['token_id']}"
            + ("\n(Invalid UTF-8)" if item.get("is_invalid") else "")
            + ("\n(Byte Token)" if is_byte_token else "")
        )
        aria_label = (
            "Token ID " + str(item["token_id"]) + ": " + item["original"]
        ).replace("\n", " ")
        display_content = (
            str(item["token_id"]) if show_ids_switch.value else item["display"]
        )

        return (
            f'<span style="{style}" title="{html.escape(title, quote=True)}" '
            f'aria-label="{html.escape(aria_label, quote=True)}">'
            f"{html.escape(display_content)}</span>"
        )

    html_parts: list[str] = [_render_token_span(item) for item in llm_token_data]

    token_viz_html: mo.Html = mo.Html(
        f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>'
    )

    # Optional: Add a warning if the display limit was reached
    limit_warning: Optional[mo.md] = None  # Use Optional type
    if display_limit_reached:
        limit_warning = mo.md(f"""**Warning:** Displaying only the first {display_limit:,} tokens out of {total_token_count:,}.
        Statistics are calculated on the full text.""").callout(kind="warn")

    # Hint shown when raw tokens are requested for a byte-level tokenizer.
    # _is_byte_level already handles its own failures, so no extra guard here.
    representation_hint: Optional[mo.md] = None
    if representation_radio.value == "Raw tokens" and _is_byte_level(tokenizer):
        representation_hint = mo.md(
            "This tokenizer uses byte-level BPE; raw vocab strings are not human-readable. Prefer Decoded strings or Auto."
        ).callout(kind="info")

    # Use dict access safely with .get() for stats
    basic_stats: dict[str, Union[int, float]] = token_stats.get("basic_stats", {})
    length_stats: dict[str, Union[int, float]] = token_stats.get("length_stats", {})

    # One markdown bullet per statistic, with the key turned into a title.
    basic_stats_md: str = "**Basic Stats:**\n\n" + "\n".join(
        f"-   **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in basic_stats.items()
    )

    length_stats_md: str = "**Length (Character) Stats:**\n\n" + "\n".join(
        f"-   **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in length_stats.items()
    )

    # Build tokenizer info markdown parts
    tokenizer_info_md_parts: list[str] = [
        f"**Tokenizer Type:** `{tokenizer_info.get('tokenizer_type', 'N/A')}`"
    ]
    if vocab_size := tokenizer_info.get("vocab_size"):
        tokenizer_info_md_parts.append(f"**Vocab Size:** `{vocab_size:,}`")
    if max_len := tokenizer_info.get("model_max_length"):
        tokenizer_info_md_parts.append(f"**Model Max Length:** `{max_len}`")

    special_tokens_info = tokenizer_info.get("special_tokens")
    if isinstance(special_tokens_info, dict) and special_tokens_info:
        tokenizer_info_md_parts.append("**Special Tokens:**")
        tokenizer_info_md_parts.extend(
            f"  - `{name}`: `{str(val)}`" for name, val in special_tokens_info.items()
        )
    elif isinstance(special_tokens_info, str):  # Handle "None found" case
        tokenizer_info_md_parts.append(f"**Special Tokens:** `{special_tokens_info}`")

    if error_info := tokenizer_info.get("error"):
        tokenizer_info_md_parts.append(f"**Info Error:** `{error_info}`")

    tokenizer_info_md: str = "\n\n".join(tokenizer_info_md_parts)

    tokenizer_info_accordion = mo.accordion(
        {"Tokenizer Info": mo.md(tokenizer_info_md)}
    )

    mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}
    {show_ids_switch}

    {tokenizer_info_accordion}

    ## Tokenizer output
    {limit_warning if limit_warning else ""}
    {representation_hint if representation_hint else ""}
    {mo.as_html(token_viz_html)}

    ## Token Statistics
    (Calculated on full text if truncated above)

    {basic_stats_md}

    {length_stats_md}

    """)
    return


@app.cell
def _():
    # Empty cell: defines nothing and produces no output.
    return


# Run the notebook's app object when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()