Bor Hodošček committed
Commit ca0f322 · 1 Parent(s): 8bfa6b3

chore: update dockerfile and deps

Files changed (6):
  1. .dockerignore +1 -0
  2. Dockerfile +4 -5
  3. app.py +351 -121
  4. development.md +2 -2
  5. pyproject.toml +3 -3
  6. uv.lock +0 -0
.dockerignore ADDED
@@ -0,0 +1 @@
+ .venv
Dockerfile CHANGED
@@ -1,5 +1,4 @@
- FROM python:3.12-slim
- COPY --from=ghcr.io/astral-sh/uv:0.7.3 /uv /bin/uv
+ FROM ghcr.io/astral-sh/uv:0.9.5-python3.13-trixie-slim

  RUN useradd -m -u 1000 user
  ENV PATH="/home/user/.local/bin:$PATH"
@@ -7,14 +6,14 @@ ENV UV_SYSTEM_PYTHON=1

  WORKDIR /app

- RUN mkdir -p /app && chown -R user:user /app
+ RUN chown -R user:user /app

- COPY --chown=user ./pyproject.toml ./uv.lock ./pyproject.toml ./app.py /app
+ COPY --chown=user pyproject.toml uv.lock app.py /app

  RUN chmod -R u+w /app

  USER user

- RUN uv sync
+ RUN uv sync --locked

  CMD ["uv", "run", "marimo", "run", "app.py", "--no-sandbox", "--include-code", "--host", "0.0.0.0", "--port", "7860"]
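For reference, the image's locked install and CMD line map onto two plain uv invocations; a minimal local sketch (assuming uv is installed on the host and uv.lock is current), not part of the commit itself:

```bash
# Local equivalent of the image's RUN and CMD steps (sketch)
uv sync --locked   # installs exactly what uv.lock pins; errors out if the lockfile is stale
uv run marimo run app.py --no-sandbox --include-code --host 0.0.0.0 --port 7860
```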
app.py CHANGED
@@ -1,12 +1,12 @@
1
  # /// script
2
  # dependencies = [
3
  # "marimo>=0.13.0",
4
- # "polars==1.29.0",
5
- # "altair==5.5.0",
6
- # "spacy==3.8.5",
7
  # "en-core-web-md",
8
  # "ja-core-news-md",
9
- # "transformers==4.51.3",
10
  # ]
11
  #
12
  # [tool.uv.sources]
@@ -18,7 +18,7 @@
18
 
19
  import marimo
20
 
21
- __generated_with = "0.13.6"
22
  app = marimo.App(width="medium")
23
 
24
 
@@ -33,26 +33,18 @@ def _():
33
  import marimo as mo
34
  import polars as pl
35
  import spacy
36
- import spacy.language
37
  from transformers import (
38
- AutoTokenizer,
39
  PreTrainedTokenizerBase,
 
40
  )
41
 
42
- # Load spaCy models for English and Japanese
43
- nlp_en: spacy.language.Language = spacy.load("en_core_web_md")
44
- nlp_ja: spacy.language.Language = spacy.load("ja_core_news_md")
45
-
46
- # List of tokenizer models
47
  llm_model_choices: list[str] = [
48
- # "meta-llama/Llama-4-Scout-17B-16E-Instruct",
 
 
49
  "google/gemma-3-27b-it",
50
  "ibm-granite/granite-3.3-8b-instruct",
51
- "shisa-ai/shisa-v2-qwen2.5-7b",
52
- # "deepseek-ai/DeepSeek-R1",
53
- # "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
54
- # "Qwen/Qwen2.5-72B-Instruct",
55
- # "openai-community/gpt2",
56
  "google-bert/bert-large-uncased",
57
  ]
58
  return (
@@ -67,14 +59,29 @@ def _():
67
  llm_model_choices,
68
  math,
69
  mo,
70
- nlp_en,
71
- nlp_ja,
72
  pl,
73
  re,
74
  spacy,
75
  )
76
 
77
 
 
78
  @app.cell
79
  def _(mo):
80
  mo.md("""# Tokenization for English and Japanese""")
@@ -112,11 +119,11 @@ def _(mo):
112
  @app.cell
113
  def _(
114
  en_placeholder,
115
- get_text_content,
116
  ja_placeholder,
117
- language_selector,
118
  mo,
119
- set_text_content,
120
  ):
121
  # Define text_input dynamically based on language
122
  current_placeholder: str = (
@@ -133,7 +140,7 @@ def _(
133
 
134
 
135
  @app.cell
136
- def _(current_placeholder, mo, set_text_content):
137
  def apply_placeholder() -> None:
138
  set_text_content(current_placeholder)
139
 
@@ -144,7 +151,12 @@ def _(current_placeholder, mo, set_text_content):
144
 
145
 
146
  @app.cell
147
- def _(apply_placeholder_button, language_selector, mo, text_input):
148
  mo.vstack(
149
  [
150
  text_input,
@@ -152,24 +164,25 @@ def _(apply_placeholder_button, language_selector, mo, text_input):
152
  mo.ui.button(label="Analyze"),
153
  ]
154
  )
155
-
156
  return
157
 
158
 
159
  @app.cell
160
- def _(get_text_content, language_selector, mo, nlp_en, nlp_ja, spacy):
161
  # Analyze text using spaCy based on selected language
162
- current_text: str = get_text_content()
163
- doc: spacy.tokens.Doc
164
- if language_selector.value == "English":
165
- doc = nlp_en(current_text)
166
- else:
167
- doc = nlp_ja(current_text)
168
- model_name: str = (
169
- nlp_en.meta["name"]
170
- if language_selector.value == "English"
171
- else nlp_ja.meta["name"]
172
  )
173
 
174
  tokenized_text: list[str] = [token.text for token in doc]
175
  token_count: int = len(tokenized_text)
@@ -181,7 +194,7 @@ def _(get_text_content, language_selector, mo, nlp_en, nlp_ja, spacy):
181
 
182
 
183
  @app.cell
184
- def _(doc, mo, pl):
185
  token_data: pl.DataFrame = pl.DataFrame(
186
  {
187
  "Token": [token.text for token in doc],
@@ -190,12 +203,15 @@ def _(doc, mo, pl):
190
  "Tag": [token.tag_ for token in doc],
191
  "Morph": [str(token.morph) for token in doc],
192
  "OOV": [
193
- token.is_oov for token in doc
194
- ], # FIXME: How to get .is_oov() from sudachi directly? This only works for English now...
195
- "Token Position": list(range(len(doc))),
196
- "Sentence Number": [
197
- i for i, sent in enumerate(doc.sents) for token in sent
198
  ],
199
  }
200
  )
201
 
@@ -216,7 +232,13 @@ def _(mo):
216
 
217
 
218
  @app.cell
219
- def _(alt, column_selector, mo, pl, token_data):
220
  mo.stop(token_data.is_empty(), "Please set input text.")
221
 
222
  selected_column: str = column_selector.value
@@ -243,7 +265,7 @@ def _(alt, column_selector, mo, pl, token_data):
243
 
244
 
245
  @app.cell
246
- def _(llm_model_choices, mo):
247
  llm_tokenizer_selector: mo.ui.dropdown = mo.ui.dropdown(
248
  options=llm_model_choices,
249
  value=llm_model_choices[0],
@@ -254,12 +276,96 @@ def _(llm_model_choices, mo):
254
 
255
 
256
  @app.cell
257
- def _(AutoTokenizer, PreTrainedTokenizerBase, llm_tokenizer_selector):
258
  # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
259
  selected_model_name: str = llm_tokenizer_selector.value
260
- tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(
261
- selected_model_name
262
- )
263
  return (tokenizer,)
264
 
265
 
@@ -297,7 +403,7 @@ def _(Union, math):
297
  len(original_text) / total_tokens if total_tokens > 0 else 0.0
298
  )
299
 
300
- space_tokens: int = sum(1 for t in tokens if t.startswith(("Ġ", " ")))
301
  newline_tokens: int = sum(
302
  1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
303
  )
@@ -339,7 +445,11 @@ def _(Union, math):
339
  variance: float = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
340
  std_dev: float = math.sqrt(variance)
341
  sorted_lengths: list[int] = sorted(lengths)
342
- median_length: float = float(sorted_lengths[len(lengths) // 2])
343
 
344
  return {
345
  "basic_stats": {
@@ -362,7 +472,6 @@ def _(Union, math):
362
  "median_length": median_length,
363
  },
364
  }
365
-
366
  return (get_token_stats,)
367
 
368
 
@@ -380,7 +489,6 @@ def _(hashlib):
380
  "background": f"hsl({hue}, {saturation}%, {lightness}%)",
381
  "text": f"hsl({hue}, {saturation}%, {text_lightness}%)",
382
  }
383
-
384
  return (get_varied_color,)
385
 
386
 
@@ -398,14 +506,8 @@ def fix_token(
398
  # Return a clear representation indicating it's a byte
399
  return f"<0x{hex_value}>"
400
 
401
- # Replace SentencePiece space marker U+2581 ('▁') with a middle dot
402
- token = token.replace("▁", "·")
403
-
404
- # Replace BPE space marker 'Ġ' with a middle dot
405
- if token.startswith("Ġ"):
406
- space_count = token.count("Ġ")
407
- # Ensure we only replace the leading 'Ġ' markers
408
- return "·" * space_count + token[space_count:]
409
 
410
  # Replace newline markers for display
411
  token = token.replace("Ċ", "↵\n")
@@ -478,7 +580,15 @@ def _(Any, PreTrainedTokenizerBase):
478
  ):
479
  token_name = attr_name
480
  break
481
- special_tokens[token_name] = str(token_value)
482
  processed_tokens.add(str(token_value))
483
 
484
  # Fallback/Augment with individual attributes if not covered by all_special_tokens
@@ -490,7 +600,15 @@ def _(Any, PreTrainedTokenizerBase):
490
  and str(token_value).strip()
491
  and str(token_value) not in processed_tokens
492
  ):
493
- special_tokens[token_name] = str(token_value)
494
  processed_tokens.add(str(token_value))
495
 
496
  info["special_tokens"] = special_tokens if special_tokens else "None found"
@@ -499,7 +617,6 @@ def _(Any, PreTrainedTokenizerBase):
499
  info["error"] = f"Error extracting tokenizer info: {str(e)}"
500
 
501
  return info
502
-
503
  return (get_tokenizer_info,)
504
 
505
 
@@ -516,107 +633,209 @@ def _(
516
  Any,
517
  Optional,
518
  Union,
519
- current_text,
520
  get_token_stats,
521
  get_tokenizer_info,
522
  get_varied_color,
523
- llm_tokenizer_selector,
524
  mo,
525
  re,
526
- show_ids_switch,
527
  tokenizer,
528
  ):
529
  # Define the Unicode replacement character
530
  REPLACEMENT_CHARACTER = "\ufffd"
531
 
532
- # Get tokenizer metadata
 
533
  tokenizer_info: dict[str, Any] = get_tokenizer_info(tokenizer)
534
 
535
  # 1. Encode text to get token IDs first.
536
- token_ids: list[int] = tokenizer.encode(current_text, add_special_tokens=False)
537
- # 2. Decode each token ID individually.
538
- # We will check for REPLACEMENT_CHARACTER later.
539
- all_decoded_tokens: list[str] = [
540
  tokenizer.decode(
541
- [token_id], skip_special_tokens=False, clean_up_tokenization_spaces=False
 
 
542
  )
543
- for token_id in token_ids
544
  ]
545
 
546
- total_token_count: int = len(token_ids) # Count based on IDs
547
 
548
- # Limit the number of tokens for display
549
- display_limit: int = 1000
550
- # Limit consistently using token IDs and the decoded tokens
551
- display_token_ids: list[int] = token_ids[:display_limit]
552
- display_decoded_tokens: list[str] = all_decoded_tokens[:display_limit]
553
- display_limit_reached: bool = total_token_count > display_limit
554
 
555
  # Generate data for visualization
556
  TokenVisData = dict[str, Union[str, int, bool, dict[str, str]]]
557
  llm_token_data: list[TokenVisData] = []
558
 
559
- # Use zip for parallel iteration
560
- for idx, (token_id, token_str) in enumerate(
561
- zip(display_token_ids, display_decoded_tokens)
562
- ):
563
- colors: dict[str, str] = get_varied_color(
564
- token_str
565
- if REPLACEMENT_CHARACTER not in token_str
566
- else f"invalid_{token_id}"
567
- ) # Color based on string or ID if invalid
568
-
569
- is_invalid_utf8 = REPLACEMENT_CHARACTER in token_str
570
- fixed_token_display: str
571
- original_for_title: str = (
572
- token_str # Store the potentially problematic string for title
573
  )
574
 
575
- if is_invalid_utf8:
576
- # If decode failed, show a representation with the hex ID
577
- fixed_token_display = f"<0x{token_id:X}>"
578
  else:
579
- # If decode succeeded, apply standard fixes
580
- fixed_token_display = fix_token(token_str, re)
581
 
 
  llm_token_data.append(
583
  {
584
- "original": original_for_title, # Store the raw decoded string (might contain �)
585
- "display": fixed_token_display, # Store the cleaned/invalid representation
  "colors": colors,
587
- "is_newline": "↵" in fixed_token_display, # Check the display version
588
- "token_id": token_id,
589
  "token_index": idx,
590
- "is_invalid": is_invalid_utf8, # Add flag for potential styling/title changes
591
  }
592
  )
593
 
594
- # Calculate statistics using the list of *successfully* decoded token strings
595
- # We might want to reconsider what `all_tokens` means for stats if many are invalid.
596
- # For now, let's use the potentially problematic strings, as stats are mostly length/count based.
597
  token_stats: dict[str, dict[str, Union[int, float]]] = get_token_stats(
598
- all_decoded_tokens,
599
- current_text, # Pass the full list from decode()
600
  )
601
 
602
- # Construct HTML for colored tokens using list comprehension (functional style)
603
  html_parts: list[str] = [
604
  (
605
  lambda item: (
606
  style
607
  := f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
608
- # Add specific style for invalid tokens if needed
609
- + (" border: 1px solid red;" if item.get("is_invalid") else ""),
610
  # Modify title based on validity
611
  title := (
612
  f"Original: {item['original']}\nID: {item['token_id']}"
613
  + ("\n(Invalid UTF-8)" if item.get("is_invalid") else "")
614
- + ("\n(Byte Token)" if item["display"].startswith("byte[") else "")
615
  ),
616
  display_content := str(item["token_id"])
617
  if show_ids_switch.value
618
  else item["display"],
619
- f'<span style="{style}" title="{title}">{display_content}</span>',
620
  )[-1] # Get the last element (the formatted string) from the lambda's tuple
621
  )(item)
622
  for item in llm_token_data
@@ -632,6 +851,16 @@ def _(
632
  limit_warning = mo.md(f"""**Warning:** Displaying only the first {display_limit:,} tokens out of {total_token_count:,}.
633
  Statistics are calculated on the full text.""").callout(kind="warn")
634
635
  # Use dict access safely with .get() for stats
636
  basic_stats: dict[str, Union[int, float]] = token_stats.get("basic_stats", {})
637
  length_stats: dict[str, Union[int, float]] = token_stats.get("length_stats", {})
@@ -670,16 +899,18 @@ def _(
670
 
671
  tokenizer_info_md: str = "\n\n".join(tokenizer_info_md_parts)
672
 
673
- # Display the final markdown output
674
- mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}
675
-
676
- ## Tokenizer Info
677
- {tokenizer_info_md}
678
 
 
679
  {show_ids_switch}
680
 
 
 
681
  ## Tokenizer output
682
  {limit_warning if limit_warning else ""}
 
683
  {mo.as_html(token_viz_html)}
684
 
685
  ## Token Statistics
@@ -690,7 +921,6 @@ def _(
690
  {length_stats_md}
691
 
692
  """)
693
-
694
  return
695
 
696
 
 
1
  # /// script
2
  # dependencies = [
3
  # "marimo>=0.13.0",
4
+ # "polars>=1.29.0",
5
+ # "altair>=5.5.0",
6
+ # "spacy==3.8.7",
7
  # "en-core-web-md",
8
  # "ja-core-news-md",
9
+ # "transformers>=4.57.1",
10
  # ]
11
  #
12
  # [tool.uv.sources]
 
18
 
19
  import marimo
20
 
21
+ __generated_with = "0.17.2"
22
  app = marimo.App(width="medium")
23
 
24
 
 
33
  import marimo as mo
34
  import polars as pl
35
  import spacy
 
36
  from transformers import (
 
37
  PreTrainedTokenizerBase,
38
+ AutoTokenizer,
39
  )
40
 
 
  llm_model_choices: list[str] = [
42
+ "deepseek-ai/DeepSeek-OCR",
43
+ "zai-org/GLM-4.6",
44
+ "openai/gpt-oss-20b",
45
  "google/gemma-3-27b-it",
46
  "ibm-granite/granite-3.3-8b-instruct",
47
+ "deep-analysis-research/Flux-Japanese-Qwen2.5-32B-Instruct-V1.0",
48
  "google-bert/bert-large-uncased",
49
  ]
50
  return (
 
59
  llm_model_choices,
60
  math,
61
  mo,
 
 
62
  pl,
63
  re,
64
  spacy,
65
  )
66
 
67
 
68
+ @app.cell
69
+ def _(mo, spacy):
70
+ get_nlp_en, set_nlp_en = mo.state(None)
71
+ get_nlp_ja, set_nlp_ja = mo.state(None)
72
+
73
+ def ensure_nlp(language: str) -> spacy.language.Language:
74
+ if language == "English":
75
+ if get_nlp_en() is None:
76
+ set_nlp_en(spacy.load("en_core_web_md"))
77
+ return get_nlp_en()
78
+ else:
79
+ if get_nlp_ja() is None:
80
+ set_nlp_ja(spacy.load("ja_core_news_md"))
81
+ return get_nlp_ja()
82
+ return (ensure_nlp,)
83
+
84
+
85
  @app.cell
86
  def _(mo):
87
  mo.md("""# Tokenization for English and Japanese""")
 
119
  @app.cell
120
  def _(
121
  en_placeholder,
122
+ get_text_content: "Callable[[], str]",
123
  ja_placeholder,
124
+ language_selector: "mo.ui.radio",
125
  mo,
126
+ set_text_content: "Callable[[str], None]",
127
  ):
128
  # Define text_input dynamically based on language
129
  current_placeholder: str = (
 
140
 
141
 
142
  @app.cell
143
+ def _(current_placeholder: str, mo, set_text_content: "Callable[[str], None]"):
144
  def apply_placeholder() -> None:
145
  set_text_content(current_placeholder)
146
 
 
151
 
152
 
153
  @app.cell
154
+ def _(
155
+ apply_placeholder_button: "mo.ui.button",
156
+ language_selector: "mo.ui.radio",
157
+ mo,
158
+ text_input: "mo.ui.text_area",
159
+ ):
160
  mo.vstack(
161
  [
162
  text_input,
 
164
  mo.ui.button(label="Analyze"),
165
  ]
166
  )
 
167
  return
168
 
169
 
170
  @app.cell
171
+ def _(
172
+ ensure_nlp,
173
+ get_text_content: "Callable[[], str]",
174
+ language_selector: "mo.ui.radio",
175
+ mo,
176
+ spacy,
177
+ ):
178
  # Analyze text using spaCy based on selected language
179
+ mo.md("Note: Loading spaCy pipelines on first use may take a few seconds.").callout(
180
+ kind="info"
181
  )
182
+ current_text: str = get_text_content()
183
+ nlp = ensure_nlp(language_selector.value)
184
+ doc: spacy.tokens.Doc = nlp(current_text)
185
+ model_name: str = nlp.meta["name"]
186
 
187
  tokenized_text: list[str] = [token.text for token in doc]
188
  token_count: int = len(tokenized_text)
 
194
 
195
 
196
  @app.cell
197
+ def _(doc: "spacy.tokens.Doc", language_selector: "mo.ui.radio", mo, pl):
198
  token_data: pl.DataFrame = pl.DataFrame(
199
  {
200
  "Token": [token.text for token in doc],
 
203
  "Tag": [token.tag_ for token in doc],
204
  "Morph": [str(token.morph) for token in doc],
205
  "OOV": [
206
+ token.is_oov if language_selector.value == "English" else None
207
+ for token in doc
 
  ],
209
+ "Token Position": list(range(len(doc))),
210
+ "Sentence Number": (
211
+ [i for i, sent in enumerate(doc.sents) for _ in sent]
212
+ if doc.has_annotation("SENT_START")
213
+ else [0] * len(doc)
214
+ ),
215
  }
216
  )
217
 
 
232
 
233
 
234
  @app.cell
235
+ def _(
236
+ alt,
237
+ column_selector: "mo.ui.dropdown",
238
+ mo,
239
+ pl,
240
+ token_data: "pl.DataFrame",
241
+ ):
242
  mo.stop(token_data.is_empty(), "Please set input text.")
243
 
244
  selected_column: str = column_selector.value
 
265
 
266
 
267
  @app.cell
268
+ def _(llm_model_choices: list[str], mo):
269
  llm_tokenizer_selector: mo.ui.dropdown = mo.ui.dropdown(
270
  options=llm_model_choices,
271
  value=llm_model_choices[0],
 
276
 
277
 
278
  @app.cell
279
+ def _(mo):
280
+ add_special_tokens_switch = mo.ui.switch(
281
+ label="Add special tokens (encode)", value=False
282
+ )
283
+ skip_special_tokens_on_decode_switch = mo.ui.switch(
284
+ label="Skip special tokens in decoded view", value=False
285
+ )
286
+ representation_radio = mo.ui.radio(
287
+ options=["Auto (recommended)", "Decoded strings", "Raw tokens"],
288
+ value="Auto (recommended)",
289
+ label="LLM token representation",
290
+ )
291
+ display_limit_slider = mo.ui.slider(
292
+ 100, 5000, value=1000, label="Display token limit"
293
+ )
294
+ color_by_radio = mo.ui.radio(
295
+ options=["Token", "ID", "Category"],
296
+ value="Token",
297
+ label="Color by",
298
+ )
299
+ show_spaces_switch = mo.ui.switch(
300
+ label="Show spaces as · (decoded view)", value=False
301
+ )
302
+
303
+ mo.vstack(
304
+ [
305
+ mo.hstack(
306
+ [
307
+ add_special_tokens_switch,
308
+ skip_special_tokens_on_decode_switch,
309
+ ]
310
+ ),
311
+ mo.hstack([representation_radio, display_limit_slider]),
312
+ mo.hstack([color_by_radio, show_spaces_switch]),
313
+ mo.accordion(
314
+ {
315
+ "Tip": mo.md(
316
+ "Many GPT-style tokenizers are byte-level; their raw vocab strings can look garbled. Use Decoded strings or Auto."
317
+ ).callout(kind="info")
318
+ }
319
+ ),
320
+ ]
321
+ )
322
+ return (
323
+ add_special_tokens_switch,
324
+ color_by_radio,
325
+ display_limit_slider,
326
+ representation_radio,
327
+ show_spaces_switch,
328
+ skip_special_tokens_on_decode_switch,
329
+ )
330
+
331
+
332
+ @app.cell
333
+ def _(mo):
334
+ get_tok_cache, set_tok_cache = mo.state({})
335
+ return get_tok_cache, set_tok_cache
336
+
337
+
338
+ @app.cell
339
+ def _(
340
+ AutoTokenizer,
341
+ PreTrainedTokenizerBase,
342
+ get_tok_cache,
343
+ llm_tokenizer_selector: "mo.ui.dropdown",
344
+ mo,
345
+ set_tok_cache,
346
+ ):
347
  # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
348
  selected_model_name: str = llm_tokenizer_selector.value
349
+ key = selected_model_name
350
+ cache = get_tok_cache()
351
+ if key in cache:
352
+ tokenizer = cache[key]
353
+ else:
354
+ tokenizer: PreTrainedTokenizerBase = None
355
+ try:
356
+ tokenizer = AutoTokenizer.from_pretrained(
357
+ selected_model_name,
358
+ use_fast=True,
359
+ trust_remote_code=True,
360
+ )
361
+ except Exception as e:
362
+ mo.md(f"Failed to load tokenizer '{selected_model_name}': {e}").callout(
363
+ kind="error"
364
+ )
365
+ tokenizer = None
366
+
367
+ if tokenizer is not None:
368
+ set_tok_cache({**cache, key: tokenizer})
369
  return (tokenizer,)
370
 
371
 
 
403
  len(original_text) / total_tokens if total_tokens > 0 else 0.0
404
  )
405
 
406
+ space_tokens: int = sum(1 for t in tokens if t.startswith(("Ġ", "▁", " ")))
407
  newline_tokens: int = sum(
408
  1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
409
  )
 
445
  variance: float = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
446
  std_dev: float = math.sqrt(variance)
447
  sorted_lengths: list[int] = sorted(lengths)
448
+ n = len(lengths)
449
+ if n % 2 == 1:
450
+ median_length = float(sorted_lengths[n // 2])
451
+ else:
452
+ median_length = (sorted_lengths[n // 2 - 1] + sorted_lengths[n // 2]) / 2
453
 
454
  return {
455
  "basic_stats": {
 
472
  "median_length": median_length,
473
  },
474
  }
 
475
  return (get_token_stats,)
476
 
477
 
 
489
  "background": f"hsl({hue}, {saturation}%, {lightness}%)",
490
  "text": f"hsl({hue}, {saturation}%, {text_lightness}%)",
491
  }
 
492
  return (get_varied_color,)
493
 
494
 
 
506
  # Return a clear representation indicating it's a byte
507
  return f"<0x{hex_value}>"
508
 
509
+ # Replace SentencePiece space marker U+2581 ('▁') and BPE space marker 'Ġ' with a middle dot
510
+ token = token.replace("▁", "·").replace("Ġ", "·")
511
 
512
  # Replace newline markers for display
513
  token = token.replace("Ċ", "↵\n")
 
580
  ):
581
  token_name = attr_name
582
  break
583
+ token_str = str(token_value)
584
+ token_id = (
585
+ tokenizer.convert_tokens_to_ids(token_str)
586
+ if hasattr(tokenizer, "convert_tokens_to_ids")
587
+ else None
588
+ )
589
+ special_tokens[token_name] = token_str + (
590
+ f" (id {token_id})" if isinstance(token_id, int) else ""
591
+ )
592
  processed_tokens.add(str(token_value))
593
 
594
  # Fallback/Augment with individual attributes if not covered by all_special_tokens
 
600
  and str(token_value).strip()
601
  and str(token_value) not in processed_tokens
602
  ):
603
+ token_str = str(token_value)
604
+ token_id = (
605
+ tokenizer.convert_tokens_to_ids(token_str)
606
+ if hasattr(tokenizer, "convert_tokens_to_ids")
607
+ else None
608
+ )
609
+ special_tokens[token_name] = token_str + (
610
+ f" (id {token_id})" if isinstance(token_id, int) else ""
611
+ )
612
  processed_tokens.add(str(token_value))
613
 
614
  info["special_tokens"] = special_tokens if special_tokens else "None found"
 
617
  info["error"] = f"Error extracting tokenizer info: {str(e)}"
618
 
619
  return info
 
620
  return (get_tokenizer_info,)
621
 
622
 
 
633
  Any,
634
  Optional,
635
  Union,
636
+ add_special_tokens_switch,
637
+ color_by_radio,
638
+ current_text: str,
639
+ display_limit_slider,
640
  get_token_stats,
641
  get_tokenizer_info,
642
  get_varied_color,
643
+ llm_tokenizer_selector: "mo.ui.dropdown",
644
  mo,
645
  re,
646
+ representation_radio,
647
+ show_ids_switch: "mo.ui.switch",
648
+ show_spaces_switch,
649
+ skip_special_tokens_on_decode_switch,
650
  tokenizer,
651
  ):
652
  # Define the Unicode replacement character
653
  REPLACEMENT_CHARACTER = "\ufffd"
654
 
655
+ mo.stop(tokenizer is None, "Please select a valid tokenizer model.")
656
+
657
  tokenizer_info: dict[str, Any] = get_tokenizer_info(tokenizer)
658
 
659
  # 1. Encode text to get token IDs first.
660
+ token_ids: list[int] = tokenizer.encode(
661
+ current_text, add_special_tokens=add_special_tokens_switch.value
662
+ )
663
+
664
+ # 2. Convert IDs to raw tokens and decode each individually
665
+ raw_tokens: list[str] = tokenizer.convert_ids_to_tokens(token_ids)
666
+ decoded_per_id: list[str] = [
667
  tokenizer.decode(
668
+ [tid],
669
+ skip_special_tokens=skip_special_tokens_on_decode_switch.value,
670
+ clean_up_tokenization_spaces=False,
671
  )
672
+ for tid in token_ids
673
  ]
674
 
675
+ # 3. Get offset mapping for span information
676
+ enc = tokenizer(
677
+ current_text,
678
+ add_special_tokens=add_special_tokens_switch.value,
679
+ return_offsets_mapping=True,
680
+ )
681
+ offsets = (
682
+ enc.get("offset_mapping")
683
+ if isinstance(enc, dict)
684
+ else getattr(enc, "offset_mapping", None)
685
+ )
686
 
687
+ if offsets and len(offsets) == len(token_ids):
688
+ records: list[dict[str, Union[int, str]]] = []
689
+ for tid, raw, dec, (s, e) in zip(
690
+ token_ids, raw_tokens, decoded_per_id, offsets
691
+ ):
692
+ substr = current_text[s:e] if (s is not None and e is not None) else ""
693
+ records.append(
694
+ {
695
+ "id": tid,
696
+ "raw": raw,
697
+ "dec": dec,
698
+ "start": s,
699
+ "end": e,
700
+ "substr": substr,
701
+ }
702
+ )
703
+ else:
704
+ records = [
705
+ {
706
+ "id": tid,
707
+ "raw": raw,
708
+ "dec": dec,
709
+ "start": None,
710
+ "end": None,
711
+ "substr": "",
712
+ }
713
+ for tid, raw, dec in zip(token_ids, raw_tokens, decoded_per_id)
714
+ ]
715
+
716
+ def _is_byte_level(tok) -> bool:
717
+ try:
718
+ if getattr(tok, "is_fast", False):
719
+ pre = tok.backend_tokenizer.pre_tokenizer
720
+ types = [pre.__class__.__name__]
721
+ if hasattr(pre, "pre_tokenizers"):
722
+ types = [p.__class__.__name__ for p in pre.pre_tokenizers]
723
+ return "ByteLevel" in types
724
+ except Exception:
725
+ pass
726
+ return False
727
+
728
+ if representation_radio.value == "Auto (recommended)":
729
+ use_decoded: bool = _is_byte_level(tokenizer) or any(
730
+ ("Ġ" in r["raw"] or "Ċ" in r["raw"]) for r in records[:256]
731
+ )
732
+ elif representation_radio.value == "Decoded strings":
733
+ use_decoded = True
734
+ else:
735
+ use_decoded = False
736
+
737
+ if use_decoded:
738
+ source_records = [r for r in records if r["dec"] != ""]
739
+ stats_tokens_source: list[str] = [r["dec"] for r in records if r["dec"] != ""]
740
+ else:
741
+ source_records = records
742
+ stats_tokens_source = [r["raw"] for r in records]
743
+
744
+ total_token_count: int = len(source_records)
745
+ display_limit: int = display_limit_slider.value
746
+ display_records = source_records[:display_limit]
747
+ display_limit_reached: bool = len(source_records) > display_limit
748
 
749
  # Generate data for visualization
750
  TokenVisData = dict[str, Union[str, int, bool, dict[str, str]]]
751
  llm_token_data: list[TokenVisData] = []
752
 
753
+ for idx, r in enumerate(display_records):
754
+ token_str: str = r["dec"] if use_decoded else r["raw"]
755
+
756
+ # Apply space visualization in decoded view
757
+ if use_decoded and show_spaces_switch.value:
758
+ token_str = token_str.replace(" ", "·")
759
+
760
+ is_invalid_utf8: bool = REPLACEMENT_CHARACTER in token_str
761
+ fixed_token_display: str = (
762
+ f"<0x{r['id']:X}>" if is_invalid_utf8 else fix_token(token_str, re)
763
  )
764
 
765
+ # Choose color seed based on color_by_radio
766
+ if color_by_radio.value == "ID":
767
+ seed = f"id_{r['id']}"
768
+ elif color_by_radio.value == "Category":
769
+ probe = r["dec"] if use_decoded else r["raw"]
770
+ if probe.startswith(("Ġ", "▁", " ")):
771
+ cat = "space"
772
+ elif ("\n" in probe) or ("Ċ" in probe):
773
+ cat = "newline"
774
+ elif (probe.startswith("<") and probe.endswith(">")) or (
775
+ probe.startswith("[") and probe.endswith("]")
776
+ ):
777
+ cat = "special"
778
+ else:
779
+ cat = "text"
780
+ seed = f"cat_{cat}"
781
  else:
782
+ seed = token_str
 
783
 
784
+ colors: dict[str, str] = get_varied_color(
785
+ seed if not is_invalid_utf8 else f"invalid_{r['id']}"
786
+ )
787
  llm_token_data.append(
788
  {
789
+ "original": (
790
+ f"Vocab: {r['raw']}\n"
791
+ f"Decoded: {r['dec'] if r['dec'] != '' else '∅'}\n"
792
+ f"Span: [{r['start']}, {r['end']}]\n"
793
+ f"Text: {r['substr']}"
794
+ ),
795
+ "display": fixed_token_display,
796
  "colors": colors,
797
+ "is_newline": "↵" in fixed_token_display,
798
+ "token_id": r["id"],
799
  "token_index": idx,
800
+ "is_invalid": is_invalid_utf8,
801
  }
802
  )
803
 
804
  token_stats: dict[str, dict[str, Union[int, float]]] = get_token_stats(
805
+ stats_tokens_source,
806
+ current_text,
807
  )
808
 
 
809
  html_parts: list[str] = [
810
  (
811
  lambda item: (
812
  style
813
  := f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
814
+ # Add specific style for invalid tokens
815
+ + (
816
+ " border: 1px solid red;"
817
+ if item.get("is_invalid")
818
+ else (
819
+ " border: 1px solid orange;"
820
+ if item["display"].startswith("<0x")
821
+ else ""
822
+ )
823
+ ),
824
  # Modify title based on validity
825
  title := (
826
  f"Original: {item['original']}\nID: {item['token_id']}"
827
  + ("\n(Invalid UTF-8)" if item.get("is_invalid") else "")
828
+ + ("\n(Byte Token)" if item["display"].startswith("<0x") else "")
829
+ ),
830
+ aria_label := (
831
+ ("Token ID " + str(item["token_id"]) + ": " + item["original"])
832
+ .replace("\n", " ")
833
+ .replace('"', "&quot;")
834
  ),
835
  display_content := str(item["token_id"])
836
  if show_ids_switch.value
837
  else item["display"],
838
+ f'<span style="{style}" title="{title}" aria-label="{aria_label}">{display_content}</span>',
839
  )[-1] # Get the last element (the formatted string) from the lambda's tuple
840
  )(item)
841
  for item in llm_token_data
 
851
  limit_warning = mo.md(f"""**Warning:** Displaying only the first {display_limit:,} tokens out of {total_token_count:,}.
852
  Statistics are calculated on the full text.""").callout(kind="warn")
853
 
854
+ representation_hint: Optional[mo.md] = None
855
+ if representation_radio.value == "Raw tokens":
856
+ try:
857
+ if _is_byte_level(tokenizer):
858
+ representation_hint = mo.md(
859
+ "This tokenizer uses byte-level BPE; raw vocab strings are not human-readable. Prefer Decoded strings or Auto."
860
+ ).callout(kind="info")
861
+ except Exception:
862
+ pass
863
+
864
  # Use dict access safely with .get() for stats
865
  basic_stats: dict[str, Union[int, float]] = token_stats.get("basic_stats", {})
866
  length_stats: dict[str, Union[int, float]] = token_stats.get("length_stats", {})
 
899
 
900
  tokenizer_info_md: str = "\n\n".join(tokenizer_info_md_parts)
901
 
902
+ tokenizer_info_accordion = mo.accordion(
903
+ {"Tokenizer Info": mo.md(tokenizer_info_md)}
904
+ )
 
 
905
 
906
+ mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}
907
  {show_ids_switch}
908
 
909
+ {tokenizer_info_accordion}
910
+
911
  ## Tokenizer output
912
  {limit_warning if limit_warning else ""}
913
+ {representation_hint if representation_hint else ""}
914
  {mo.as_html(token_viz_html)}
915
 
916
  ## Token Statistics
 
921
  {length_stats_md}
922
 
923
  """)
 
924
  return
925
 
926
 
development.md CHANGED
@@ -3,6 +3,6 @@
  ## Testing your Dockerfile locally

  ```bash
- docker build -t marimo-app .
- docker run -it --rm -p 7860:7860 marimo-app
+ docker build -t counting-words .
+ docker run -it --rm -p 7860:7860 counting-words
  ```
pyproject.toml CHANGED
@@ -3,15 +3,15 @@ name = "counting-words"
  version = "0.1.0"
  description = "Counting words in English and Japanese texts demo"
  readme = "README.md"
- requires-python = ">=3.12"
+ requires-python = ">=3.13"
  dependencies = [
  "marimo>=0.13.0",
  "polars>=1.27.1",
  "altair>=5.5.0",
- "spacy>=3.8.5",
+ "spacy>=3.8.7",
  "en-core-web-md",
  "ja-core-news-md",
- "transformers>=4.51.3",
+ "transformers>=4.57.1",
  ]

  [tool.uv.sources]
uv.lock CHANGED
The diff for this file is too large to render. See raw diff
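Because the image now installs with `uv sync --locked`, uv.lock has to be re-resolved whenever the constraints in pyproject.toml move, as they do in this commit; a minimal sketch of that workflow, assuming uv is installed locally:

```bash
# Re-resolve the lockfile against the updated constraints, then verify the locked install
uv lock
uv sync --locked
```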