# /// script
# dependencies = [
#     "marimo>=0.13.0",
#     "polars>=1.29.0",
#     "altair>=5.5.0",
#     "spacy==3.8.7",
#     "en-core-web-md",
#     "ja-core-news-md",
#     "transformers>=4.57.1",
# ]
#
# [tool.uv.sources]
# en-core-web-md = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl" }
# ja-core-news-md = { url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.8.0/ja_core_news_md-3.8.0-py3-none-any.whl" }
#
# [tool.marimo.runtime]
# auto_instantiate = false
# ///

import marimo

__generated_with = "0.17.2"
app = marimo.App(width="medium")
@app.cell
def _():
    import hashlib
    import math
    import re
    from typing import Any, Callable, Optional, Union

    import altair as alt
    import marimo as mo
    import polars as pl
    import spacy
    from transformers import (
        PreTrainedTokenizerBase,
        AutoTokenizer,
    )

    llm_model_choices: list[str] = [
        "deepseek-ai/DeepSeek-OCR",
        "zai-org/GLM-4.6",
        "openai/gpt-oss-20b",
        "google/gemma-3-27b-it",
        "ibm-granite/granite-3.3-8b-instruct",
        "deep-analysis-research/Flux-Japanese-Qwen2.5-32B-Instruct-V1.0",
        "google-bert/bert-large-uncased",
    ]

    return (
        Any,
        AutoTokenizer,
        Callable,
        Optional,
        PreTrainedTokenizerBase,
        Union,
        alt,
        hashlib,
        llm_model_choices,
        math,
        mo,
        pl,
        re,
        spacy,
    )


@app.cell
def _(mo, spacy):
    get_nlp_en, set_nlp_en = mo.state(None)
    get_nlp_ja, set_nlp_ja = mo.state(None)
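
    # Load each spaCy pipeline lazily and cache it in marimo state so a
    # model is loaded at most once per session.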
    def ensure_nlp(language: str) -> spacy.language.Language:
        if language == "English":
            if get_nlp_en() is None:
                set_nlp_en(spacy.load("en_core_web_md"))
            return get_nlp_en()
        else:
            if get_nlp_ja() is None:
                set_nlp_ja(spacy.load("ja_core_news_md"))
            return get_nlp_ja()

    return (ensure_nlp,)


@app.cell
def _(mo):
    mo.md("""# Tokenization for English and Japanese""")
    return


@app.cell
def _(Callable, mo):
    # Central state for the text input content
    # Type the getter and setter
    get_text_content: Callable[[], str]
    set_text_content: Callable[[str], None]
    get_text_content, set_text_content = mo.state("")
    return get_text_content, set_text_content


@app.cell
def _(mo):
    # Placeholder texts
    en_placeholder = """
Mrs. Ferrars died on the night of the 16th–17th September—a Thursday. I was sent for at eight o’clock on the morning of Friday the 17th. There was nothing to be done. She had been dead some hours.
""".strip()

    ja_placeholder = """
吾輩は猫である。名前はまだ無い。
どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。
""".strip()

    # Create UI element for language selection
    language_selector: mo.ui.radio = mo.ui.radio(
        options=["English", "Japanese"], value="English", label="Language"
    )
    return en_placeholder, ja_placeholder, language_selector


@app.cell
def _(
    en_placeholder,
    get_text_content: "Callable[[], str]",
    ja_placeholder,
    language_selector: "mo.ui.radio",
    mo,
    set_text_content: "Callable[[str], None]",
):
    # Define text_input dynamically based on language
    current_placeholder: str = (
        en_placeholder if language_selector.value == "English" else ja_placeholder
    )
    text_input: mo.ui.text_area = mo.ui.text_area(
        value=get_text_content(),
        label="Enter text",
        placeholder=current_placeholder,
        full_width=True,
        on_change=lambda v: set_text_content(v),
    )
    return current_placeholder, text_input


@app.cell
def _(current_placeholder: str, mo, set_text_content: "Callable[[str], None]"):
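    # Clicking the button copies the language-appropriate placeholder into the
    # shared text state, which repopulates the text area above.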
    def apply_placeholder() -> None:
        set_text_content(current_placeholder)

    apply_placeholder_button: mo.ui.button = mo.ui.button(
        label="Use Placeholder Text", on_click=lambda _: apply_placeholder()
    )
    return (apply_placeholder_button,)


@app.cell
def _(
    apply_placeholder_button: "mo.ui.button",
    language_selector: "mo.ui.radio",
    mo,
    text_input: "mo.ui.text_area",
):
    mo.vstack(
        [
            text_input,
            mo.hstack([language_selector, apply_placeholder_button], justify="start"),
            mo.ui.button(label="Analyze"),
        ]
    )
    return


@app.cell
def _(
    ensure_nlp,
    get_text_content: "Callable[[], str]",
    language_selector: "mo.ui.radio",
    mo,
    spacy,
):
    # Analyze text using spaCy based on selected language
    mo.md("Note: Loading spaCy pipelines on first use may take a few seconds.").callout(
        kind="info"
    )

    current_text: str = get_text_content()
    nlp = ensure_nlp(language_selector.value)
    doc: spacy.tokens.Doc = nlp(current_text)
    model_name: str = nlp.meta["name"]

    tokenized_text: list[str] = [token.text for token in doc]
    token_count: int = len(tokenized_text)

    mo.md(
        f"**Tokenized Text using spaCy {'en_' if language_selector.value == 'English' else 'ja_'}{model_name}:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}"
    )
    return current_text, doc


@app.cell
def _(doc: "spacy.tokens.Doc", language_selector: "mo.ui.radio", mo, pl):
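    # Build a per-token table of spaCy annotations (lemma, POS, tag, morphology,
    # OOV flag, position, and sentence index) for interactive display.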
    token_data: pl.DataFrame = pl.DataFrame(
        {
            "Token": [token.text for token in doc],
            "Lemma": [token.lemma_ for token in doc],
            "POS": [token.pos_ for token in doc],
            "Tag": [token.tag_ for token in doc],
            "Morph": [str(token.morph) for token in doc],
            "OOV": [
                token.is_oov if language_selector.value == "English" else None
                for token in doc
            ],
            "Token Position": list(range(len(doc))),
            "Sentence Number": (
                [i for i, sent in enumerate(doc.sents) for _ in sent]
                if doc.has_annotation("SENT_START")
                else [0] * len(doc)
            ),
        }
    )

    mo.ui.dataframe(token_data, page_size=50)
    return (token_data,)


@app.cell
def _(mo):
    column_selector: mo.ui.dropdown = mo.ui.dropdown(
        options=["POS", "Tag", "Lemma", "Token", "Morph", "OOV"],
        value="POS",
        label="Select column to visualize",
    )
    column_selector
    return (column_selector,)


@app.cell
def _(
    alt,
    column_selector: "mo.ui.dropdown",
    mo,
    pl,
    token_data: "pl.DataFrame",
):
    mo.stop(token_data.is_empty(), "Please set input text.")

    selected_column: str = column_selector.value

    # Calculate value counts for the selected column
    counts_df: pl.DataFrame = (
        token_data[selected_column]
        .value_counts()
        .sort(by=["count", selected_column], descending=[True, False])
    )
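
    # Horizontal bar chart of the value counts; sort=None keeps the
    # frequency-sorted row order from counts_df.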
    chart: alt.Chart = (
        alt.Chart(counts_df)
        .mark_bar()
        .encode(
            x=alt.X("count", title="Frequency"),
            y=alt.Y(selected_column, title=selected_column, sort=None),
            tooltip=[selected_column, "count"],
        )
        .properties(title=f"{selected_column} Distribution")
        .interactive()
    )

    mo.ui.altair_chart(chart)
    return


@app.cell
def _(llm_model_choices: list[str], mo):
    llm_tokenizer_selector: mo.ui.dropdown = mo.ui.dropdown(
        options=llm_model_choices,
        value=llm_model_choices[0],
        label="Select LLM Tokenizer Model",
    )
    llm_tokenizer_selector
    return (llm_tokenizer_selector,)


@app.cell
def _(mo):
    add_special_tokens_switch = mo.ui.switch(
        label="Add special tokens (encode)", value=False
    )
    skip_special_tokens_on_decode_switch = mo.ui.switch(
        label="Skip special tokens in decoded view", value=False
    )
    representation_radio = mo.ui.radio(
        options=["Auto (recommended)", "Decoded strings", "Raw tokens"],
        value="Auto (recommended)",
        label="LLM token representation",
    )
    display_limit_slider = mo.ui.slider(
        100, 5000, value=1000, label="Display token limit"
    )
    color_by_radio = mo.ui.radio(
        options=["Token", "ID", "Category"],
        value="Token",
        label="Color by",
    )
    show_spaces_switch = mo.ui.switch(
        label="Show spaces as · (decoded view)", value=False
    )

    mo.vstack(
        [
            mo.hstack(
                [
                    add_special_tokens_switch,
                    skip_special_tokens_on_decode_switch,
                ]
            ),
            mo.hstack([representation_radio, display_limit_slider]),
            mo.hstack([color_by_radio, show_spaces_switch]),
            mo.accordion(
                {
                    "Tip": mo.md(
                        "Many GPT-style tokenizers are byte-level; their raw vocab strings can look garbled. Use Decoded strings or Auto."
                    ).callout(kind="info")
                }
            ),
        ]
    )
    return (
        add_special_tokens_switch,
        color_by_radio,
        display_limit_slider,
        representation_radio,
        show_spaces_switch,
        skip_special_tokens_on_decode_switch,
    )


@app.cell
def _(mo):
    get_tok_cache, set_tok_cache = mo.state({})
    return get_tok_cache, set_tok_cache


@app.cell
def _(
    AutoTokenizer,
    PreTrainedTokenizerBase,
    get_tok_cache,
    llm_tokenizer_selector: "mo.ui.dropdown",
    mo,
    set_tok_cache,
):
    # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
    selected_model_name: str = llm_tokenizer_selector.value
    key = selected_model_name
    cache = get_tok_cache()
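
    # Reuse a previously loaded tokenizer for this model if one is cached;
    # otherwise load it from the Hub and store it in the cache state.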
    if key in cache:
        tokenizer = cache[key]
    else:
        tokenizer: PreTrainedTokenizerBase = None
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                selected_model_name,
                use_fast=True,
                trust_remote_code=True,
            )
        except Exception as e:
            mo.md(f"Failed to load tokenizer '{selected_model_name}': {e}").callout(
                kind="error"
            )
            tokenizer = None
        if tokenizer is not None:
            set_tok_cache({**cache, key: tokenizer})
    return (tokenizer,)


@app.cell
def _(Union, math):
    TokenStatsDict = dict[str, dict[str, Union[int, float]]]

    def get_token_stats(tokens: list[str], original_text: str) -> TokenStatsDict:
        """Calculate enhanced statistics about the tokens."""
        if not tokens:
            # Return default structure matching TokenStatsDict
            return {
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0.0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0.0,
                },
                "length_stats": {
                    "avg_length": 0.0,
                    "std_dev": 0.0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0.0,
                },
            }

        total_tokens: int = len(tokens)
        unique_tokens: int = len(set(tokens))
        compression_ratio: float = (
            len(original_text) / total_tokens if total_tokens > 0 else 0.0
        )
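
        # Marker heuristics: "Ġ" (byte-level BPE) and "▁" (SentencePiece) denote
        # a leading space, while "Ċ" and "<0x0A>" are newline encodings.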
        space_tokens: int = sum(1 for t in tokens if t.startswith(("Ġ", "▁", " ")))
        newline_tokens: int = sum(
            1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
        )
        special_tokens: int = sum(
            1
            for t in tokens
            if (t.startswith("<") and t.endswith(">"))
            or (t.startswith("[") and t.endswith("]"))
        )
        punctuation_tokens: int = sum(
            1
            for t in tokens
            if len(t) == 1 and not t.isalnum() and t not in [" ", "\n", "Ġ", "Ċ"]
        )

        lengths: list[int] = [len(t) for t in tokens]
        if not lengths:  # Should not happen if tokens is not empty, but safe check
            return {  # Return default structure matching TokenStatsDict
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0.0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0.0,
                },
                "length_stats": {
                    "avg_length": 0.0,
                    "std_dev": 0.0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0.0,
                },
            }

        mean_length: float = sum(lengths) / len(lengths)
        variance: float = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
        std_dev: float = math.sqrt(variance)

        sorted_lengths: list[int] = sorted(lengths)
        n = len(lengths)
        if n % 2 == 1:
            median_length = float(sorted_lengths[n // 2])
        else:
            median_length = (sorted_lengths[n // 2 - 1] + sorted_lengths[n // 2]) / 2

        return {
            "basic_stats": {
                "total_tokens": total_tokens,
                "unique_tokens": unique_tokens,
                "compression_ratio": round(compression_ratio, 2),
                "space_tokens": space_tokens,
                "newline_tokens": newline_tokens,
                "special_tokens": special_tokens,
                "punctuation_tokens": punctuation_tokens,
                "unique_percentage": round(unique_tokens / total_tokens * 100, 1)
                if total_tokens > 0
                else 0.0,
            },
            "length_stats": {
                "avg_length": round(mean_length, 2),
                "std_dev": round(std_dev, 2),
                "min_length": min(lengths),
                "max_length": max(lengths),
                "median_length": median_length,
            },
        }

    return (get_token_stats,)


@app.cell
def _(hashlib):
    def get_varied_color(token: str) -> dict[str, str]:
        """Generate vibrant colors with HSL for better visual distinction."""
        token_hash: str = hashlib.md5(token.encode()).hexdigest()
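        # Slices of the MD5 hex digest deterministically choose hue, saturation,
        # and lightness, so the same token string always maps to the same color.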
        hue: int = int(token_hash[:3], 16) % 360
        saturation: int = 70 + (int(token_hash[3:5], 16) % 20)
        lightness: int = 80 + (int(token_hash[5:7], 16) % 10)
        text_lightness: int = 20
        return {
            "background": f"hsl({hue}, {saturation}%, {lightness}%)",
            "text": f"hsl({hue}, {saturation}%, {text_lightness}%)",
        }

    return (get_varied_color,)


@app.function
def fix_token(
    token: str, re
) -> str:  # re module type is complex, leave as Any implicitly or import types.ModuleType
    """Fix token for display, handling byte fallbacks and spaces."""
    # Check for byte fallback pattern <0xHH> using a full match
    byte_match = re.fullmatch(r"<0x([0-9A-Fa-f]{2})>", token)
    if byte_match:
        hex_value = byte_match.group(1).upper()
        # Return a clear representation indicating it's a byte
        return f"<0x{hex_value}>"

    # Replace SentencePiece space marker U+2581 ('▁') and BPE space marker 'Ġ' with a middle dot
    token = token.replace("▁", "·").replace("Ġ", "·")

    # Replace newline markers for display
    token = token.replace("Ċ", "↵\n")

    # Handle byte representation of newline AFTER the general byte check.
    # This specific check might become redundant if <0x0A> is caught by the
    # byte_match above; keep it for now as a fallback.
    token = token.replace("<0x0A>", "↵\n")

    return token


@app.cell
def _(Any, PreTrainedTokenizerBase):
    def get_tokenizer_info(
        tokenizer: PreTrainedTokenizerBase,
    ) -> dict[str, Any]:
        """
        Extract useful information from a tokenizer.
        Returns a dictionary with tokenizer details.
        """
        info: dict[str, Any] = {}
        try:
            if hasattr(tokenizer, "vocab_size"):
                info["vocab_size"] = tokenizer.vocab_size
            elif hasattr(tokenizer, "get_vocab"):
                info["vocab_size"] = len(tokenizer.get_vocab())

            if (
                hasattr(tokenizer, "model_max_length")
                and isinstance(tokenizer.model_max_length, int)
                and tokenizer.model_max_length < 1000000
            ):
                info["model_max_length"] = tokenizer.model_max_length
            else:
                info["model_max_length"] = "Not specified or very large"

            info["tokenizer_type"] = tokenizer.__class__.__name__

            special_tokens: dict[str, str] = {}
            special_token_attributes: list[str] = [
                "pad_token",
                "eos_token",
                "bos_token",
                "sep_token",
                "cls_token",
                "unk_token",
                "mask_token",
            ]

            # Keep track of processed tokens to avoid duplicates
            processed_tokens: set[str] = set()

            # Prefer all_special_tokens if available
            if hasattr(tokenizer, "all_special_tokens"):
                for token_value in tokenizer.all_special_tokens:
                    if (
                        not token_value
                        or not str(token_value).strip()
                        or str(token_value) in processed_tokens
                    ):
                        continue

                    token_name = "special_token"  # Default name
                    # Find the attribute name corresponding to the token value
                    for attr_name in special_token_attributes:
                        if (
                            hasattr(tokenizer, attr_name)
                            and getattr(tokenizer, attr_name) == token_value
                        ):
                            token_name = attr_name
                            break

                    token_str = str(token_value)
                    token_id = (
                        tokenizer.convert_tokens_to_ids(token_str)
                        if hasattr(tokenizer, "convert_tokens_to_ids")
                        else None
                    )
                    special_tokens[token_name] = token_str + (
                        f" (id {token_id})" if isinstance(token_id, int) else ""
                    )
                    processed_tokens.add(str(token_value))

            # Fallback/augment with individual attributes if not covered by all_special_tokens
            for token_name in special_token_attributes:
                if hasattr(tokenizer, token_name):
                    token_value = getattr(tokenizer, token_name)
                    if (
                        token_value
                        and str(token_value).strip()
                        and str(token_value) not in processed_tokens
                    ):
                        token_str = str(token_value)
                        token_id = (
                            tokenizer.convert_tokens_to_ids(token_str)
                            if hasattr(tokenizer, "convert_tokens_to_ids")
                            else None
                        )
                        special_tokens[token_name] = token_str + (
                            f" (id {token_id})" if isinstance(token_id, int) else ""
                        )
                        processed_tokens.add(str(token_value))

            info["special_tokens"] = special_tokens if special_tokens else "None found"
        except Exception as e:
            info["error"] = f"Error extracting tokenizer info: {str(e)}"
        return info

    return (get_tokenizer_info,)


@app.cell
def _(mo):
    show_ids_switch: mo.ui.switch = mo.ui.switch(
        label="Show token IDs instead of text", value=False
    )
    return (show_ids_switch,)


@app.cell
def _(
    Any,
    Optional,
    Union,
    add_special_tokens_switch,
    color_by_radio,
    current_text: str,
    display_limit_slider,
    get_token_stats,
    get_tokenizer_info,
    get_varied_color,
    llm_tokenizer_selector: "mo.ui.dropdown",
    mo,
    re,
    representation_radio,
    show_ids_switch: "mo.ui.switch",
    show_spaces_switch,
    skip_special_tokens_on_decode_switch,
    tokenizer,
):
    # Define the Unicode replacement character
    REPLACEMENT_CHARACTER = "\ufffd"

    mo.stop(tokenizer is None, "Please select a valid tokenizer model.")

    tokenizer_info: dict[str, Any] = get_tokenizer_info(tokenizer)

    # 1. Encode text to get token IDs first.
    token_ids: list[int] = tokenizer.encode(
        current_text, add_special_tokens=add_special_tokens_switch.value
    )

    # 2. Convert IDs to raw tokens and decode each individually
    raw_tokens: list[str] = tokenizer.convert_ids_to_tokens(token_ids)
    decoded_per_id: list[str] = [
        tokenizer.decode(
            [tid],
            skip_special_tokens=skip_special_tokens_on_decode_switch.value,
            clean_up_tokenization_spaces=False,
        )
        for tid in token_ids
    ]

    # 3. Get offset mapping for span information
    enc = tokenizer(
        current_text,
        add_special_tokens=add_special_tokens_switch.value,
        return_offsets_mapping=True,
    )
    offsets = (
        enc.get("offset_mapping")
        if isinstance(enc, dict)
        else getattr(enc, "offset_mapping", None)
    )
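
    # Offset mappings require a fast (Rust-backed) tokenizer; guard against a
    # missing or misaligned mapping before attaching character spans.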
    if offsets and len(offsets) == len(token_ids):
        records: list[dict[str, Union[int, str]]] = []
        for tid, raw, dec, (s, e) in zip(
            token_ids, raw_tokens, decoded_per_id, offsets
        ):
            substr = current_text[s:e] if (s is not None and e is not None) else ""
            records.append(
                {
                    "id": tid,
                    "raw": raw,
                    "dec": dec,
                    "start": s,
                    "end": e,
                    "substr": substr,
                }
            )
    else:
        records = [
            {
                "id": tid,
                "raw": raw,
                "dec": dec,
                "start": None,
                "end": None,
                "substr": "",
            }
            for tid, raw, dec in zip(token_ids, raw_tokens, decoded_per_id)
        ]

    def _is_byte_level(tok) -> bool:
        try:
            if getattr(tok, "is_fast", False):
                pre = tok.backend_tokenizer.pre_tokenizer
                types = [pre.__class__.__name__]
                if hasattr(pre, "pre_tokenizers"):
                    types = [p.__class__.__name__ for p in pre.pre_tokenizers]
                return "ByteLevel" in types
        except Exception:
            pass
        return False
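
    # Choose how to render tokens: byte-level vocabularies (GPT-2 style "Ġ"/"Ċ"
    # markers) are easier to read when each ID is decoded back to a string.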
    if representation_radio.value == "Auto (recommended)":
        use_decoded: bool = _is_byte_level(tokenizer) or any(
            ("Ġ" in r["raw"] or "Ċ" in r["raw"]) for r in records[:256]
        )
    elif representation_radio.value == "Decoded strings":
        use_decoded = True
    else:
        use_decoded = False

    if use_decoded:
        source_records = [r for r in records if r["dec"] != ""]
        stats_tokens_source: list[str] = [r["dec"] for r in records if r["dec"] != ""]
    else:
        source_records = records
        stats_tokens_source = [r["raw"] for r in records]

    total_token_count: int = len(source_records)
    display_limit: int = display_limit_slider.value
    display_records = source_records[:display_limit]
    display_limit_reached: bool = len(source_records) > display_limit

    # Generate data for visualization
    TokenVisData = dict[str, Union[str, int, bool, dict[str, str]]]
    llm_token_data: list[TokenVisData] = []
    for idx, r in enumerate(display_records):
        token_str: str = r["dec"] if use_decoded else r["raw"]

        # Apply space visualization in decoded view
        if use_decoded and show_spaces_switch.value:
            token_str = token_str.replace(" ", "·")

        is_invalid_utf8: bool = REPLACEMENT_CHARACTER in token_str
        fixed_token_display: str = (
            f"<0x{r['id']:X}>" if is_invalid_utf8 else fix_token(token_str, re)
        )

        # Choose color seed based on color_by_radio
        if color_by_radio.value == "ID":
            seed = f"id_{r['id']}"
        elif color_by_radio.value == "Category":
            probe = r["dec"] if use_decoded else r["raw"]
            if probe.startswith(("Ġ", "▁", " ")):
                cat = "space"
            elif ("\n" in probe) or ("Ċ" in probe):
                cat = "newline"
            elif (probe.startswith("<") and probe.endswith(">")) or (
                probe.startswith("[") and probe.endswith("]")
            ):
                cat = "special"
            else:
                cat = "text"
            seed = f"cat_{cat}"
        else:
            seed = token_str

        colors: dict[str, str] = get_varied_color(
            seed if not is_invalid_utf8 else f"invalid_{r['id']}"
        )

        llm_token_data.append(
            {
                "original": (
                    f"Vocab: {r['raw']}\n"
                    f"Decoded: {r['dec'] if r['dec'] != '' else '∅'}\n"
                    f"Span: [{r['start']}, {r['end']}]\n"
                    f"Text: {r['substr']}"
                ),
                "display": fixed_token_display,
                "colors": colors,
                "is_newline": "↵" in fixed_token_display,
                "token_id": r["id"],
                "token_index": idx,
                "is_invalid": is_invalid_utf8,
            }
        )

    token_stats: dict[str, dict[str, Union[int, float]]] = get_token_stats(
        stats_tokens_source,
        current_text,
    )
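
    # Render each token as a colored <span>; the hover tooltip shows the raw
    # vocab string, the decoded form, the character span, and the token ID.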
    html_parts: list[str] = [
        (
            lambda item: (
                style := f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
                # Add specific style for invalid tokens
                + (
                    " border: 1px solid red;"
                    if item.get("is_invalid")
                    else (
                        " border: 1px solid orange;"
                        if item["display"].startswith("<0x")
                        else ""
                    )
                ),
                # Modify title based on validity
                title := (
                    f"Original: {item['original']}\nID: {item['token_id']}"
                    + ("\n(Invalid UTF-8)" if item.get("is_invalid") else "")
                    + ("\n(Byte Token)" if item["display"].startswith("<0x") else "")
                ),
                aria_label := (
                    ("Token ID " + str(item["token_id"]) + ": " + item["original"])
                    .replace("\n", " ")
                    .replace('"', "&quot;")
                ),
                display_content := str(item["token_id"])
                if show_ids_switch.value
                else item["display"],
                f'<span style="{style}" title="{title}" aria-label="{aria_label}">{display_content}</span>',
            )[-1]  # Get the last element (the formatted string) from the lambda's tuple
        )(item)
        for item in llm_token_data
    ]

    token_viz_html: mo.Html = mo.Html(
        f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>'
    )

    # Optional: Add a warning if the display limit was reached
    limit_warning: Optional[mo.md] = None  # Use Optional type
    if display_limit_reached:
        limit_warning = mo.md(f"""**Warning:** Displaying only the first {display_limit:,} tokens out of {total_token_count:,}.
        Statistics are calculated on the full text.""").callout(kind="warn")

    representation_hint: Optional[mo.md] = None
    if representation_radio.value == "Raw tokens":
        try:
            if _is_byte_level(tokenizer):
                representation_hint = mo.md(
                    "This tokenizer uses byte-level BPE; raw vocab strings are not human-readable. Prefer Decoded strings or Auto."
                ).callout(kind="info")
        except Exception:
            pass

    # Use dict access safely with .get() for stats
    basic_stats: dict[str, Union[int, float]] = token_stats.get("basic_stats", {})
    length_stats: dict[str, Union[int, float]] = token_stats.get("length_stats", {})

    # Use list comprehensions for markdown generation (functional style)
    basic_stats_md: str = "**Basic Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in basic_stats.items()
    )
    length_stats_md: str = "**Length (Character) Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in length_stats.items()
    )

    # Build tokenizer info markdown parts
    tokenizer_info_md_parts: list[str] = [
        f"**Tokenizer Type:** `{tokenizer_info.get('tokenizer_type', 'N/A')}`"
    ]
    if vocab_size := tokenizer_info.get("vocab_size"):
        tokenizer_info_md_parts.append(f"**Vocab Size:** `{vocab_size:,}`")
    if max_len := tokenizer_info.get("model_max_length"):
        tokenizer_info_md_parts.append(f"**Model Max Length:** `{max_len}`")

    special_tokens_info = tokenizer_info.get("special_tokens")
    if isinstance(special_tokens_info, dict) and special_tokens_info:
        tokenizer_info_md_parts.append("**Special Tokens:**")
        tokenizer_info_md_parts.extend(
            f"  - `{name}`: `{str(val)}`" for name, val in special_tokens_info.items()
        )
    elif isinstance(special_tokens_info, str):  # Handle "None found" case
        tokenizer_info_md_parts.append(f"**Special Tokens:** `{special_tokens_info}`")

    if error_info := tokenizer_info.get("error"):
        tokenizer_info_md_parts.append(f"**Info Error:** `{error_info}`")

    tokenizer_info_md: str = "\n\n".join(tokenizer_info_md_parts)
    tokenizer_info_accordion = mo.accordion(
        {"Tokenizer Info": mo.md(tokenizer_info_md)}
    )

    mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}

{show_ids_switch}
{tokenizer_info_accordion}

## Tokenizer output

{limit_warning if limit_warning else ""}
{representation_hint if representation_hint else ""}
{mo.as_html(token_viz_html)}

## Token Statistics

(Calculated on full text if truncated above)

{basic_stats_md}

{length_stats_md}
""")
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()