# /// script
# dependencies = [
#     "marimo>=0.13.0",
#     "polars>=1.29.0",
#     "altair>=5.5.0",
#     "spacy==3.8.7",
#     "en-core-web-md",
#     "ja-core-news-md",
#     "transformers>=4.57.1",
# ]
#
# [tool.uv.sources]
# en-core-web-md = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl" }
# ja-core-news-md = { url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.8.0/ja_core_news_md-3.8.0-py3-none-any.whl" }
#
# [tool.marimo.runtime]
# auto_instantiate = false
# ///

import marimo

__generated_with = "0.17.2"
app = marimo.App(width="medium")


@app.cell
def _():
    import hashlib
    import math
    import re
    from typing import Any, Callable, Optional, Union

    import altair as alt
    import marimo as mo
    import polars as pl
    import spacy
    from transformers import (
        PreTrainedTokenizerBase,
        AutoTokenizer,
    )

    llm_model_choices: list[str] = [
        "deepseek-ai/DeepSeek-OCR",
        "zai-org/GLM-4.6",
        "openai/gpt-oss-20b",
        "google/gemma-3-27b-it",
        "ibm-granite/granite-3.3-8b-instruct",
        "deep-analysis-research/Flux-Japanese-Qwen2.5-32B-Instruct-V1.0",
        "google-bert/bert-large-uncased",
    ]
    return (
        Any,
        AutoTokenizer,
        Callable,
        Optional,
        PreTrainedTokenizerBase,
        Union,
        alt,
        hashlib,
        llm_model_choices,
        math,
        mo,
        pl,
        re,
        spacy,
    )


@app.cell
def _(mo, spacy):
    get_nlp_en, set_nlp_en = mo.state(None)
    get_nlp_ja, set_nlp_ja = mo.state(None)

    def ensure_nlp(language: str) -> spacy.language.Language:
        if language == "English":
            if get_nlp_en() is None:
                set_nlp_en(spacy.load("en_core_web_md"))
            return get_nlp_en()
        else:
            if get_nlp_ja() is None:
                set_nlp_ja(spacy.load("ja_core_news_md"))
            return get_nlp_ja()

    return (ensure_nlp,)


@app.cell
def _(mo):
    mo.md("""# Tokenization for English and Japanese""")
    return


@app.cell
def _(Callable, mo):
    # Central state for the text input content
    # Type the getter and setter
    get_text_content: Callable[[], str]
    set_text_content: Callable[[str], None]
    get_text_content, set_text_content = mo.state("")
    return get_text_content, set_text_content


@app.cell
def _(mo):
    # Placeholder texts
    en_placeholder = """
Mrs. Ferrars died on the night of the 16th–17th September—a Thursday. I was sent for at eight o’clock on the morning of Friday the 17th. There was nothing to be done. She had been dead some hours.
""".strip()

    ja_placeholder = """
吾輩は猫である。名前はまだ無い。
どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。
""".strip()

    # Create UI element for language selection
    language_selector: mo.ui.radio = mo.ui.radio(
        options=["English", "Japanese"], value="English", label="Language"
    )
    return en_placeholder, ja_placeholder, language_selector


@app.cell
def _(
    en_placeholder,
    get_text_content: "Callable[[], str]",
    ja_placeholder,
    language_selector: "mo.ui.radio",
    mo,
    set_text_content: "Callable[[str], None]",
):
    # Define text_input dynamically based on language
    current_placeholder: str = (
        en_placeholder if language_selector.value == "English" else ja_placeholder
    )
    text_input: mo.ui.text_area = mo.ui.text_area(
        value=get_text_content(),
        label="Enter text",
        placeholder=current_placeholder,
        full_width=True,
        on_change=lambda v: set_text_content(v),
    )
    return current_placeholder, text_input


@app.cell
def _(current_placeholder: str, mo, set_text_content: "Callable[[str], None]"):
    def apply_placeholder() -> None:
        set_text_content(current_placeholder)

    apply_placeholder_button: mo.ui.button = mo.ui.button(
        label="Use Placeholder Text", on_click=lambda _: apply_placeholder()
    )
    return (apply_placeholder_button,)


@app.cell
def _(
    apply_placeholder_button: "mo.ui.button",
    language_selector: "mo.ui.radio",
    mo,
    text_input: "mo.ui.text_area",
):
    mo.vstack(
        [
            text_input,
            mo.hstack([language_selector, apply_placeholder_button], justify="start"),
            mo.ui.button(label="Analyze"),
        ]
    )
    return


@app.cell
def _(
    ensure_nlp,
    get_text_content: "Callable[[], str]",
    language_selector: "mo.ui.radio",
    mo,
    spacy,
):
    # Analyze text using spaCy based on selected language
    mo.md("Note: Loading spaCy pipelines on first use may take a few seconds.").callout(
        kind="info"
    )

    current_text: str = get_text_content()
    nlp = ensure_nlp(language_selector.value)
    doc: spacy.tokens.Doc = nlp(current_text)
    model_name: str = nlp.meta["name"]

    tokenized_text: list[str] = [token.text for token in doc]
    token_count: int = len(tokenized_text)

    mo.md(
        f"**Tokenized Text using spaCy {'en_' if language_selector.value == 'English' else 'ja_'}{model_name}:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}"
    )
    return current_text, doc


@app.cell
def _(doc: "spacy.tokens.Doc", language_selector: "mo.ui.radio", mo, pl):
    token_data: pl.DataFrame = pl.DataFrame(
        {
            "Token": [token.text for token in doc],
            "Lemma": [token.lemma_ for token in doc],
            "POS": [token.pos_ for token in doc],
            "Tag": [token.tag_ for token in doc],
            "Morph": [str(token.morph) for token in doc],
            "OOV": [
                token.is_oov if language_selector.value == "English" else None
                for token in doc
            ],
            "Token Position": list(range(len(doc))),
            "Sentence Number": (
                [i for i, sent in enumerate(doc.sents) for _ in sent]
                if doc.has_annotation("SENT_START")
                else [0] * len(doc)
            ),
        }
    )

    mo.ui.dataframe(token_data, page_size=50)
    return (token_data,)


@app.cell
def _(mo):
    column_selector: mo.ui.dropdown = mo.ui.dropdown(
        options=["POS", "Tag", "Lemma", "Token", "Morph", "OOV"],
        value="POS",
        label="Select column to visualize",
    )
    column_selector
    return (column_selector,)
@app.cell
def _(
    alt,
    column_selector: "mo.ui.dropdown",
    mo,
    pl,
    token_data: "pl.DataFrame",
):
    mo.stop(token_data.is_empty(), "Please set input text.")

    selected_column: str = column_selector.value

    # Calculate value counts for the selected column
    counts_df: pl.DataFrame = (
        token_data[selected_column]
        .value_counts()
        .sort(by=["count", selected_column], descending=[True, False])
    )

    chart: alt.Chart = (
        alt.Chart(counts_df)
        .mark_bar()
        .encode(
            x=alt.X("count", title="Frequency"),
            y=alt.Y(selected_column, title=selected_column, sort=None),
            tooltip=[selected_column, "count"],
        )
        .properties(title=f"{selected_column} Distribution")
        .interactive()
    )

    mo.ui.altair_chart(chart)
    return


@app.cell
def _(llm_model_choices: list[str], mo):
    llm_tokenizer_selector: mo.ui.dropdown = mo.ui.dropdown(
        options=llm_model_choices,
        value=llm_model_choices[0],
        label="Select LLM Tokenizer Model",
    )
    llm_tokenizer_selector
    return (llm_tokenizer_selector,)


@app.cell
def _(mo):
    add_special_tokens_switch = mo.ui.switch(
        label="Add special tokens (encode)", value=False
    )
    skip_special_tokens_on_decode_switch = mo.ui.switch(
        label="Skip special tokens in decoded view", value=False
    )
    representation_radio = mo.ui.radio(
        options=["Auto (recommended)", "Decoded strings", "Raw tokens"],
        value="Auto (recommended)",
        label="LLM token representation",
    )
    display_limit_slider = mo.ui.slider(
        100, 5000, value=1000, label="Display token limit"
    )
    color_by_radio = mo.ui.radio(
        options=["Token", "ID", "Category"],
        value="Token",
        label="Color by",
    )
    show_spaces_switch = mo.ui.switch(
        label="Show spaces as · (decoded view)", value=False
    )

    mo.vstack(
        [
            mo.hstack(
                [
                    add_special_tokens_switch,
                    skip_special_tokens_on_decode_switch,
                ]
            ),
            mo.hstack([representation_radio, display_limit_slider]),
            mo.hstack([color_by_radio, show_spaces_switch]),
            mo.accordion(
                {
                    "Tip": mo.md(
                        "Many GPT-style tokenizers are byte-level; their raw vocab strings can look garbled. Use Decoded strings or Auto."
                    ).callout(kind="info")
                }
            ),
        ]
    )
    return (
        add_special_tokens_switch,
        color_by_radio,
        display_limit_slider,
        representation_radio,
        show_spaces_switch,
        skip_special_tokens_on_decode_switch,
    )


@app.cell
def _(mo):
    get_tok_cache, set_tok_cache = mo.state({})
    return get_tok_cache, set_tok_cache


@app.cell
def _(
    AutoTokenizer,
    PreTrainedTokenizerBase,
    get_tok_cache,
    llm_tokenizer_selector: "mo.ui.dropdown",
    mo,
    set_tok_cache,
):
    # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
    selected_model_name: str = llm_tokenizer_selector.value

    key = selected_model_name
    cache = get_tok_cache()
    if key in cache:
        tokenizer = cache[key]
    else:
        tokenizer: PreTrainedTokenizerBase = None
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                selected_model_name,
                use_fast=True,
                trust_remote_code=True,
            )
        except Exception as e:
            mo.md(f"Failed to load tokenizer '{selected_model_name}': {e}").callout(
                kind="error"
            )
            tokenizer = None
        if tokenizer is not None:
            set_tok_cache({**cache, key: tokenizer})
    return (tokenizer,)


@app.cell
def _(Union, math):
    TokenStatsDict = dict[str, dict[str, Union[int, float]]]

    def get_token_stats(tokens: list[str], original_text: str) -> TokenStatsDict:
        """Calculate enhanced statistics about the tokens."""
        if not tokens:
            # Return default structure matching TokenStatsDict
            return {
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0.0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0.0,
                },
                "length_stats": {
                    "avg_length": 0.0,
                    "std_dev": 0.0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0.0,
                },
            }

        total_tokens: int = len(tokens)
        unique_tokens: int = len(set(tokens))
        compression_ratio: float = (
            len(original_text) / total_tokens if total_tokens > 0 else 0.0
        )

        space_tokens: int = sum(1 for t in tokens if t.startswith(("Ġ", "▁", " ")))
        newline_tokens: int = sum(
            1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
        )
        special_tokens: int = sum(
            1
            for t in tokens
            if (t.startswith("<") and t.endswith(">"))
            or (t.startswith("[") and t.endswith("]"))
        )
        punctuation_tokens: int = sum(
            1
            for t in tokens
            if len(t) == 1 and not t.isalnum() and t not in [" ", "\n", "Ġ", "Ċ"]
        )

        lengths: list[int] = [len(t) for t in tokens]
        if not lengths:  # Should not happen if tokens is not empty, but safe check
            # Return default structure matching TokenStatsDict
            return {
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0.0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0.0,
                },
                "length_stats": {
                    "avg_length": 0.0,
                    "std_dev": 0.0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0.0,
                },
            }

        mean_length: float = sum(lengths) / len(lengths)
        variance: float = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
        std_dev: float = math.sqrt(variance)

        sorted_lengths: list[int] = sorted(lengths)
        n = len(lengths)
        if n % 2 == 1:
            median_length = float(sorted_lengths[n // 2])
        else:
            median_length = (sorted_lengths[n // 2 - 1] + sorted_lengths[n // 2]) / 2

        return {
            "basic_stats": {
                "total_tokens": total_tokens,
                "unique_tokens": unique_tokens,
                "compression_ratio": round(compression_ratio, 2),
                "space_tokens": space_tokens,
                "newline_tokens": newline_tokens,
                "special_tokens": special_tokens,
                "punctuation_tokens": punctuation_tokens,
                "unique_percentage": round(unique_tokens / total_tokens * 100, 1)
                if total_tokens > 0
                else 0.0,
            },
            "length_stats": {
                "avg_length": round(mean_length, 2),
                "std_dev": round(std_dev, 2),
                "min_length": min(lengths),
                "max_length": max(lengths),
                "median_length": median_length,
            },
        }

    return (get_token_stats,)


@app.cell
def _(hashlib):
    def get_varied_color(token: str) -> dict[str, str]:
        """Generate vibrant colors with HSL for better visual distinction."""
        token_hash: str = hashlib.md5(token.encode()).hexdigest()
        hue: int = int(token_hash[:3], 16) % 360
        saturation: int = 70 + (int(token_hash[3:5], 16) % 20)
        lightness: int = 80 + (int(token_hash[5:7], 16) % 10)
        text_lightness: int = 20

        return {
            "background": f"hsl({hue}, {saturation}%, {lightness}%)",
            "text": f"hsl({hue}, {saturation}%, {text_lightness}%)",
        }

    return (get_varied_color,)


@app.function
def fix_token(
    token: str, re
) -> str:  # re module type is complex, leave as Any implicitly or import types.ModuleType
    """Fix token for display, handling byte fallbacks and spaces."""
    # Check for byte fallback pattern <0xHH> using a full match
    byte_match = re.fullmatch(r"<0x([0-9A-Fa-f]{2})>", token)
    if byte_match:
        hex_value = byte_match.group(1).upper()
        # Return a clear representation indicating it's a byte
        return f"<0x{hex_value}>"

    # Replace SentencePiece space marker U+2581 ('▁') and BPE space marker 'Ġ' with a middle dot
    token = token.replace("▁", "·").replace("Ġ", "·")

    # Replace newline markers for display
    token = token.replace("Ċ", "↵\n")

    # Handle byte representation of newline AFTER general byte check
    # This specific check might become redundant if <0x0A> is caught by the byte_match above
    # Keep it for now as a fallback.
    token = token.replace("<0x0A>", "↵\n")

    return token
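# Illustrative input/output pairs for fix_token as implemented above:
#   fix_token("▁Hello", re)  -> "·Hello"   (SentencePiece space marker)
#   fix_token("Ġworld", re)  -> "·world"   (byte-level BPE space marker)
#   fix_token("Ċ", re)       -> "↵\n"      (byte-level BPE newline marker)
#   fix_token("<0x0A>", re)  -> "<0x0A>"   (byte fallback token, returned early as-is)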
@app.cell
def _(Any, PreTrainedTokenizerBase):
    def get_tokenizer_info(
        tokenizer: PreTrainedTokenizerBase,
    ) -> dict[str, Any]:
        """
        Extract useful information from a tokenizer.
        Returns a dictionary with tokenizer details.
        """
        info: dict[str, Any] = {}
        try:
            if hasattr(tokenizer, "vocab_size"):
                info["vocab_size"] = tokenizer.vocab_size
            elif hasattr(tokenizer, "get_vocab"):
                info["vocab_size"] = len(tokenizer.get_vocab())

            if (
                hasattr(tokenizer, "model_max_length")
                and isinstance(tokenizer.model_max_length, int)
                and tokenizer.model_max_length < 1000000
            ):
                info["model_max_length"] = tokenizer.model_max_length
            else:
                info["model_max_length"] = "Not specified or very large"

            info["tokenizer_type"] = tokenizer.__class__.__name__

            special_tokens: dict[str, str] = {}
            special_token_attributes: list[str] = [
                "pad_token",
                "eos_token",
                "bos_token",
                "sep_token",
                "cls_token",
                "unk_token",
                "mask_token",
            ]

            # Keep track of processed tokens to avoid duplicates
            processed_tokens: set[str] = set()

            # Prefer all_special_tokens if available
            if hasattr(tokenizer, "all_special_tokens"):
                for token_value in tokenizer.all_special_tokens:
                    if (
                        not token_value
                        or not str(token_value).strip()
                        or str(token_value) in processed_tokens
                    ):
                        continue

                    token_name = "special_token"  # Default name
                    # Find the attribute name corresponding to the token value
                    for attr_name in special_token_attributes:
                        if (
                            hasattr(tokenizer, attr_name)
                            and getattr(tokenizer, attr_name) == token_value
                        ):
                            token_name = attr_name
                            break

                    token_str = str(token_value)
                    token_id = (
                        tokenizer.convert_tokens_to_ids(token_str)
                        if hasattr(tokenizer, "convert_tokens_to_ids")
                        else None
                    )
                    special_tokens[token_name] = token_str + (
                        f" (id {token_id})" if isinstance(token_id, int) else ""
                    )
                    processed_tokens.add(str(token_value))

            # Fallback/Augment with individual attributes if not covered by all_special_tokens
            for token_name in special_token_attributes:
                if hasattr(tokenizer, token_name):
                    token_value = getattr(tokenizer, token_name)
                    if (
                        token_value
                        and str(token_value).strip()
                        and str(token_value) not in processed_tokens
                    ):
                        token_str = str(token_value)
                        token_id = (
                            tokenizer.convert_tokens_to_ids(token_str)
                            if hasattr(tokenizer, "convert_tokens_to_ids")
                            else None
                        )
                        special_tokens[token_name] = token_str + (
                            f" (id {token_id})" if isinstance(token_id, int) else ""
                        )
                        processed_tokens.add(str(token_value))

            info["special_tokens"] = special_tokens if special_tokens else "None found"

        except Exception as e:
            info["error"] = f"Error extracting tokenizer info: {str(e)}"

        return info

    return (get_tokenizer_info,)


@app.cell
def _(mo):
    show_ids_switch: mo.ui.switch = mo.ui.switch(
        label="Show token IDs instead of text", value=False
    )
    return (show_ids_switch,)


@app.cell
def _(
    Any,
    Optional,
    Union,
    add_special_tokens_switch,
    color_by_radio,
    current_text: str,
    display_limit_slider,
    get_token_stats,
    get_tokenizer_info,
    get_varied_color,
    llm_tokenizer_selector: "mo.ui.dropdown",
    mo,
    re,
    representation_radio,
    show_ids_switch: "mo.ui.switch",
    show_spaces_switch,
    skip_special_tokens_on_decode_switch,
    tokenizer,
):
    # Define the Unicode replacement character
    REPLACEMENT_CHARACTER = "\ufffd"

    mo.stop(tokenizer is None, "Please select a valid tokenizer model.")

    tokenizer_info: dict[str, Any] = get_tokenizer_info(tokenizer)

    # 1. Encode text to get token IDs first.
    token_ids: list[int] = tokenizer.encode(
        current_text, add_special_tokens=add_special_tokens_switch.value
    )

    # 2. Convert IDs to raw tokens and decode each individually
    raw_tokens: list[str] = tokenizer.convert_ids_to_tokens(token_ids)
    # Decoding one ID at a time keeps token IDs and display strings 1:1;
    # fragments of multi-byte characters decode to U+FFFD and are flagged below.
    decoded_per_id: list[str] = [
        tokenizer.decode(
            [tid],
            skip_special_tokens=skip_special_tokens_on_decode_switch.value,
            clean_up_tokenization_spaces=False,
        )
        for tid in token_ids
    ]
    # 3. Get offset mapping for span information
    enc = tokenizer(
        current_text,
        add_special_tokens=add_special_tokens_switch.value,
        return_offsets_mapping=True,
    )
    offsets = (
        enc.get("offset_mapping")
        if isinstance(enc, dict)
        else getattr(enc, "offset_mapping", None)
    )

    if offsets and len(offsets) == len(token_ids):
        records: list[dict[str, Union[int, str]]] = []
        for tid, raw, dec, (s, e) in zip(
            token_ids, raw_tokens, decoded_per_id, offsets
        ):
            substr = current_text[s:e] if (s is not None and e is not None) else ""
            records.append(
                {
                    "id": tid,
                    "raw": raw,
                    "dec": dec,
                    "start": s,
                    "end": e,
                    "substr": substr,
                }
            )
    else:
        records = [
            {
                "id": tid,
                "raw": raw,
                "dec": dec,
                "start": None,
                "end": None,
                "substr": "",
            }
            for tid, raw, dec in zip(token_ids, raw_tokens, decoded_per_id)
        ]

    def _is_byte_level(tok) -> bool:
        try:
            if getattr(tok, "is_fast", False):
                pre = tok.backend_tokenizer.pre_tokenizer
                types = [pre.__class__.__name__]
                if hasattr(pre, "pre_tokenizers"):
                    types = [p.__class__.__name__ for p in pre.pre_tokenizers]
                return "ByteLevel" in types
        except Exception:
            pass
        return False

    if representation_radio.value == "Auto (recommended)":
        use_decoded: bool = _is_byte_level(tokenizer) or any(
            ("Ġ" in r["raw"] or "Ċ" in r["raw"]) for r in records[:256]
        )
    elif representation_radio.value == "Decoded strings":
        use_decoded = True
    else:
        use_decoded = False

    if use_decoded:
        source_records = [r for r in records if r["dec"] != ""]
        stats_tokens_source: list[str] = [r["dec"] for r in records if r["dec"] != ""]
    else:
        source_records = records
        stats_tokens_source = [r["raw"] for r in records]

    total_token_count: int = len(source_records)
    display_limit: int = display_limit_slider.value
    display_records = source_records[:display_limit]
    display_limit_reached: bool = len(source_records) > display_limit

    # Generate data for visualization
    TokenVisData = dict[str, Union[str, int, bool, dict[str, str]]]
    llm_token_data: list[TokenVisData] = []

    for idx, r in enumerate(display_records):
        token_str: str = r["dec"] if use_decoded else r["raw"]
        # Apply space visualization in decoded view
        if use_decoded and show_spaces_switch.value:
            token_str = token_str.replace(" ", "·")

        is_invalid_utf8: bool = REPLACEMENT_CHARACTER in token_str
        fixed_token_display: str = (
            f"<0x{r['id']:X}>" if is_invalid_utf8 else fix_token(token_str, re)
        )

        # Choose color seed based on color_by_radio
        if color_by_radio.value == "ID":
            seed = f"id_{r['id']}"
        elif color_by_radio.value == "Category":
            probe = r["dec"] if use_decoded else r["raw"]
            if probe.startswith(("Ġ", "▁", " ")):
                cat = "space"
            elif ("\n" in probe) or ("Ċ" in probe):
                cat = "newline"
            elif (probe.startswith("<") and probe.endswith(">")) or (
                probe.startswith("[") and probe.endswith("]")
            ):
                cat = "special"
            else:
                cat = "text"
            seed = f"cat_{cat}"
        else:
            seed = token_str

        colors: dict[str, str] = get_varied_color(
            seed if not is_invalid_utf8 else f"invalid_{r['id']}"
        )

        llm_token_data.append(
            {
                "original": (
                    f"Vocab: {r['raw']}\n"
                    f"Decoded: {r['dec'] if r['dec'] != '' else '∅'}\n"
                    f"Span: [{r['start']}, {r['end']}]\n"
                    f"Text: {r['substr']}"
                ),
                "display": fixed_token_display,
                "colors": colors,
                "is_newline": "↵" in fixed_token_display,
                "token_id": r["id"],
                "token_index": idx,
                "is_invalid": is_invalid_utf8,
            }
        )

    token_stats: dict[str, dict[str, Union[int, float]]] = get_token_stats(
        stats_tokens_source,
        current_text,
    )
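    # Each display token below becomes an inline HTML <span>: the hashed HSL
    # colors set its background/text color, the title attribute carries the
    # vocab string, decoded string, character span, and token ID for hover, and
    # the aria-label escapes double quotes as &quot; to keep the markup valid.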
    html_parts: list[str] = [
        (
            lambda item: (
                style := f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
                # Add specific style for invalid tokens
                + (
                    " border: 1px solid red;"
                    if item.get("is_invalid")
                    else (
                        " border: 1px solid orange;"
                        if item["display"].startswith("<0x")
                        else ""
                    )
                ),
                # Modify title based on validity
                title := (
                    f"Original: {item['original']}\nID: {item['token_id']}"
                    + ("\n(Invalid UTF-8)" if item.get("is_invalid") else "")
                    + ("\n(Byte Token)" if item["display"].startswith("<0x") else "")
                ),
                aria_label := (
                    ("Token ID " + str(item["token_id"]) + ": " + item["original"])
                    .replace("\n", " ")
                    .replace('"', "&quot;")
                ),
                display_content := str(item["token_id"])
                if show_ids_switch.value
                else item["display"],
                f'<span style="{style}" title="{title}" aria-label="{aria_label}">{display_content}</span>',
            )[-1]  # Get the last element (the formatted string) from the lambda's tuple
        )(item)
        for item in llm_token_data
    ]

    token_viz_html: mo.Html = mo.Html(
        f'