# /// script
# [tool.marimo.runtime]
# auto_instantiate = false
# ///

import marimo

__generated_with = "0.13.0"
app = marimo.App(width="medium")


@app.cell
def _():
    import hashlib
    import math

    import altair as alt
    import marimo as mo
    import polars as pl
    import spacy
    from transformers import AutoTokenizer

    # Load spaCy models for English and Japanese
    nlp_en = spacy.load("en_core_web_md")
    nlp_ja = spacy.load("ja_core_news_md")

    # List of LLM tokenizer models to compare
    llm_model_choices = [
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "google/gemma-3-27b-it",
        "deepseek-ai/DeepSeek-R1",
        "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        "Qwen/Qwen2.5-72B-Instruct",
        "google-bert/bert-large-uncased",
        "openai-community/gpt2",
    ]
    return (
        AutoTokenizer,
        alt,
        hashlib,
        llm_model_choices,
        math,
        mo,
        nlp_en,
        nlp_ja,
        pl,
    )
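# NOTE: the spaCy models above must be installed separately
# (e.g. `python -m spacy download en_core_web_md`), and some of the listed
# tokenizers (e.g. the meta-llama and google/gemma models) are gated on the
# Hugging Face Hub, so loading them requires accepting the model license and
# authenticating with a Hugging Face token.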


@app.cell
def _(mo):
    mo.md("# Tokenization for English and Japanese")
    return


@app.cell
def _(mo):
    # Central state for the text input content
    get_text_content, set_text_content = mo.state("")
    return get_text_content, set_text_content
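# mo.state returns a (getter, setter) pair; any cell that reads the getter re-runs
# whenever the setter is called, which is what lets the "Use Placeholder Text"
# button further down push text into the text area.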


@app.cell
def _(mo):
    # Placeholder texts
    en_placeholder = """
Mrs. Ferrars died on the night of the 16th–17th September—a Thursday. I was sent for at eight o’clock on the morning of Friday the 17th. There was nothing to be done. She had been dead some hours.
""".strip()

    ja_placeholder = """
吾輩は猫である。名前はまだ無い。
どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。
""".strip()

    # Create UI element for language selection
    language_selector = mo.ui.radio(
        options=["English", "Japanese"], value="English", label="Language"
    )

    # Return selector and placeholders
    return en_placeholder, ja_placeholder, language_selector


@app.cell
def _(
    en_placeholder,
    get_text_content,
    ja_placeholder,
    language_selector,
    mo,
    set_text_content,
):
    # Define text_input dynamically based on the selected language
    current_placeholder = (
        en_placeholder if language_selector.value == "English" else ja_placeholder
    )
    text_input = mo.ui.text_area(
        # Read value from state
        value=get_text_content(),
        label="Enter text",
        placeholder=current_placeholder,
        full_width=True,
        # Update state on user input
        on_change=lambda v: set_text_content(v),
    )
    return current_placeholder, text_input


@app.cell
def _(current_placeholder, mo, set_text_content):
    def apply_placeholder():
        set_text_content(current_placeholder)

    apply_placeholder_button = mo.ui.button(
        label="Use Placeholder Text", on_click=lambda _: apply_placeholder()
    )
    return (apply_placeholder_button,)
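# Clicking the button writes the current placeholder into the shared state; the
# text_area cell (which reads get_text_content()) then re-runs and picks it up.
# Routing the value through mo.state like this is the usual marimo pattern for
# setting one UI element's contents from another element.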


@app.cell
def _(apply_placeholder_button, language_selector, mo, text_input):
    mo.vstack(
        [
            text_input,
            mo.hstack([language_selector, apply_placeholder_button], justify="start"),
        ]
    )
    return


@app.cell
def _(get_text_content, language_selector, mo, nlp_en, nlp_ja):
    # Analyze the text with spaCy, using the pipeline for the selected language
    # Read text from state
    current_text = get_text_content()
    if language_selector.value == "English":
        doc = nlp_en(current_text)
    else:
        doc = nlp_ja(current_text)

    # Tokenized version and count
    tokenized_text = [token.text for token in doc]
    token_count = len(tokenized_text)

    mo.md(
        f"**Tokenized Text:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}"
    )
    return current_text, doc
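# en_core_web_md segments text with spaCy's rule-based tokenizer, while
# ja_core_news_md delegates word segmentation to SudachiPy, so the two languages
# can differ noticeably in token granularity for comparable text.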


@app.cell
def _(doc, mo, pl):
    # Create a polars DataFrame with token attributes
    token_data = pl.DataFrame(
        {
            "Token": [token.text for token in doc],
            "Lemma": [token.lemma_ for token in doc],
            "POS": [token.pos_ for token in doc],
            "Tag": [token.tag_ for token in doc],
            "Morph": [
                str(token.morph) for token in doc
            ],  # To be more precise, this could be expanded into one column per feature via token.morph.to_dict()
            "Token Position": list(range(len(doc))),
            "Sentence Number": [
                i for i, sent in enumerate(doc.sents) for token in sent
            ],
        }
    )

    mo.ui.dataframe(token_data, page_size=50)
    return (token_data,)
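# token.morph stringifies to a feature bundle such as "Number=Sing|Person=3";
# token.morph.to_dict() returns the same information as a dict, e.g.
# {"Number": "Sing", "Person": "3"}, which is handy if you want one column per feature.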


@app.cell
def _(mo):
    # Create UI element for selecting the column to visualize
    column_selector = mo.ui.dropdown(
        options=["POS", "Tag", "Lemma", "Token", "Morph"],
        value="POS",
        label="Select column to visualize",
    )
    column_selector
    return (column_selector,)


@app.cell
def _(alt, column_selector, mo, token_data):
    mo.stop(token_data.is_empty(), "Please set input text.")

    selected_column = column_selector.value

    # Calculate value counts for the selected column
    counts_df = (
        token_data[selected_column]
        .value_counts()
        .sort(by=["count", selected_column], descending=[True, False])
    )

    chart = (
        alt.Chart(counts_df)
        .mark_bar()
        .encode(
            x=alt.X("count", title="Frequency"),
            y=alt.Y(selected_column, title=selected_column, sort=None),
            tooltip=[selected_column, "count"],
        )
        .properties(title=f"{selected_column} Distribution")
        .interactive()
    )

    mo.ui.altair_chart(chart)
    return
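# sort=None on the y-encoding keeps the row order of counts_df (most frequent first,
# ties broken alphabetically) instead of Altair's default alphabetical axis ordering.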


@app.cell
def _(llm_model_choices, mo):
    # UI for selecting the LLM tokenizer model
    llm_tokenizer_selector = mo.ui.dropdown(
        options=llm_model_choices,
        value=llm_model_choices[-1],  # Default to gpt2 for faster initial loading
        label="Select LLM Tokenizer Model",
    )
    llm_tokenizer_selector
    return (llm_tokenizer_selector,)


@app.cell
def _(AutoTokenizer, llm_tokenizer_selector):
    # Load the selected tokenizer
    # Adapted from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
    # This cell re-runs whenever llm_tokenizer_selector.value changes
    selected_model_name = llm_tokenizer_selector.value
    tokenizer = AutoTokenizer.from_pretrained(selected_model_name)
    return (tokenizer,)
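# AutoTokenizer.from_pretrained downloads the tokenizer files on first use and caches
# them on disk (under ~/.cache/huggingface by default), so switching back to a
# previously selected model is fast even though this cell re-runs on every change.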


@app.cell
def _(math):
    # Function to calculate token statistics
    def get_token_stats(tokens: list, original_text: str) -> dict:
        """Calculate enhanced statistics about the tokens."""
        if not tokens:
            return {  # Return the default structure even for empty input
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0,
                },
                "length_stats": {
                    "avg_length": 0,
                    "std_dev": 0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0,
                },
            }

        total_tokens = len(tokens)
        unique_tokens = len(set(tokens))

        # total_tokens > 0 is guaranteed by the early return above, but guard anyway
        avg_length = (
            sum(len(t) for t in tokens) / total_tokens if total_tokens > 0 else 0
        )
        compression_ratio = len(original_text) / total_tokens if total_tokens > 0 else 0

        # Token type analysis (note: these heuristics vary between tokenizers)
        # startswith(("Ġ", "▁")) covers the common space markers: BPE's "Ġ" and SentencePiece's "▁" (U+2581)
        space_tokens = sum(1 for t in tokens if t.startswith(("Ġ", "▁")))
        # Check for common newline representations
        newline_tokens = sum(
            1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
        )
        # A broad definition of special/control tokens based on common patterns
        special_tokens = sum(
            1
            for t in tokens
            if (t.startswith("<") and t.endswith(">"))
            or (t.startswith("[") and t.endswith("]"))
        )
        # Simple punctuation check (single-character, non-alphanumeric tokens; may overlap with other categories)
        punctuation_tokens = sum(
            1
            for t in tokens
            if len(t) == 1 and not t.isalnum() and t not in [" ", "\n", "Ġ", "Ċ"]
        )

        # Length distribution
        lengths = [len(t) for t in tokens]
        if not lengths:  # Should not happen if tokens is non-empty, but keep as a safety net
            return {
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0,
                },
                "length_stats": {
                    "avg_length": 0,
                    "std_dev": 0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0,
                },
            }

        mean_length = sum(lengths) / len(lengths)
        variance = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
        std_dev = math.sqrt(variance)
        sorted_lengths = sorted(lengths)
        median_length = sorted_lengths[len(lengths) // 2] if lengths else 0

        return {
            "basic_stats": {
                "total_tokens": total_tokens,
                "unique_tokens": unique_tokens,
                "compression_ratio": round(compression_ratio, 2),
                "space_tokens": space_tokens,
                "newline_tokens": newline_tokens,
                "special_tokens": special_tokens,
                "punctuation_tokens": punctuation_tokens,
                "unique_percentage": round(unique_tokens / total_tokens * 100, 1)
                if total_tokens > 0
                else 0,
            },
            "length_stats": {
                "avg_length": round(avg_length, 2),
                "std_dev": round(std_dev, 2),
                "min_length": min(lengths) if lengths else 0,
                "max_length": max(lengths) if lengths else 0,
                "median_length": median_length,
            },
        }

    return (get_token_stats,)
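# Illustrative example:
#   get_token_stats(["Hello", "Ġworld", "!"], "Hello world!")
# yields total_tokens=3, unique_tokens=3, compression_ratio=4.0, space_tokens=1,
# punctuation_tokens=1, avg_length=4.0 and median_length=5.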


@app.cell
def _(hashlib):
    def get_varied_color(token: str) -> dict:
        """Generate vibrant colors with HSL for better visual distinction."""
        # md5 of the token is deterministic, so the same token gets the same color on every run
        token_hash = hashlib.md5(token.encode()).hexdigest()
        hue = int(token_hash[:3], 16) % 360
        saturation = 70 + (int(token_hash[3:5], 16) % 20)  # Saturation between 70-90%
        lightness = 80 + (
            int(token_hash[5:7], 16) % 10
        )  # Lightness between 80-90% (light background)
        # Ensure the text color contrasts well with the light background
        text_lightness = 20  # Dark text for light background
        return {
            "background": f"hsl({hue}, {saturation}%, {lightness}%)",
            "text": f"hsl({hue}, {saturation}%, {text_lightness}%)",
        }

    return (get_varied_color,)


# Top-level helper registered with the app so cells can use it without passing it around
@app.function
def fix_token(token: str) -> str:
    """Fix token for display with improved space visualization."""
    # Replace the SentencePiece space marker (U+2581) with a middle dot
    token = token.replace("▁", "·")
    # Replace the BPE space marker "Ġ" with a middle dot
    if token.startswith("Ġ"):
        space_count = token.count("Ġ")
        return "·" * space_count + token[space_count:]
    # Replace newline markers with a visible symbol plus an actual newline
    token = token.replace("Ċ", "↵\n")
    token = token.replace("<0x0A>", "↵\n")  # Byte-level representation of newline
    return token
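# e.g. fix_token("Ġworld") -> "·world" and fix_token("▁世界") -> "·世界";
# newline markers become "↵" followed by a real line break.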


@app.function
def get_tokenizer_info(tokenizer):
    """
    Extract useful information from a tokenizer.
    Returns a dictionary with tokenizer details.
    """
    info = {}
    try:
        # Get vocabulary size
        if hasattr(tokenizer, "vocab_size"):
            info["vocab_size"] = tokenizer.vocab_size
        elif hasattr(tokenizer, "get_vocab"):
            info["vocab_size"] = len(tokenizer.get_vocab())

        # Get model max length if available
        if (
            hasattr(tokenizer, "model_max_length")
            and tokenizer.model_max_length < 1000000
        ):  # Sanity check for realistic values
            info["model_max_length"] = tokenizer.model_max_length
        else:
            info["model_max_length"] = "Not specified or very large"

        # Check tokenizer type
        info["tokenizer_type"] = tokenizer.__class__.__name__

        # Get special tokens using the recommended attributes/methods
        special_tokens = {}
        # Prefer all_special_tokens if available
        if hasattr(tokenizer, "all_special_tokens"):
            for token in tokenizer.all_special_tokens:
                # Try to find the attribute name corresponding to the token value
                token_name = "unknown_special_token"  # Default name
                for attr_name in [
                    "pad_token",
                    "eos_token",
                    "bos_token",
                    "sep_token",
                    "cls_token",
                    "unk_token",
                    "mask_token",
                ]:
                    if (
                        hasattr(tokenizer, attr_name)
                        and getattr(tokenizer, attr_name) == token
                    ):
                        token_name = attr_name
                        break
                if token and str(token).strip():
                    special_tokens[token_name] = str(token)
        else:
            # Fall back to checking the individual attributes
            for token_name in [
                "pad_token",
                "eos_token",
                "bos_token",
                "sep_token",
                "cls_token",
                "unk_token",
                "mask_token",
            ]:
                if (
                    hasattr(tokenizer, token_name)
                    and getattr(tokenizer, token_name) is not None
                ):
                    token_value = getattr(tokenizer, token_name)
                    if token_value and str(token_value).strip():
                        special_tokens[token_name] = str(token_value)

        info["special_tokens"] = special_tokens if special_tokens else "None found"
    except Exception as e:
        info["error"] = f"Error extracting tokenizer info: {str(e)}"

    return info
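# For example, the default gpt2 tokenizer typically yields something like:
#   {"vocab_size": 50257, "model_max_length": 1024,
#    "tokenizer_type": "GPT2TokenizerFast", "special_tokens": {"eos_token": "<|endoftext|>"}}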


@app.cell
def _(mo):
    show_ids_switch = mo.ui.switch(label="Show Token IDs instead of Text", value=False)
    return (show_ids_switch,)


@app.cell
def _(
    current_text,
    get_token_stats,
    get_varied_color,
    llm_tokenizer_selector,
    mo,
    show_ids_switch,
    tokenizer,
):
    # --- Tokenization and data preparation ---

    # Get tokenizer metadata
    tokenizer_info = get_tokenizer_info(tokenizer)

    # Tokenize the input text; tokenize() gives string tokens for analysis and display
    all_tokens = tokenizer.tokenize(current_text)
    total_token_count = len(all_tokens)

    # Limit the number of tokens rendered to avoid slowing down the browser
    display_limit = 1000
    display_tokens = all_tokens[:display_limit]
    display_limit_reached = total_token_count > display_limit

    # Generate data for visualization
    llm_token_data = []
    for idx, token in enumerate(display_tokens):
        colors = get_varied_color(token)
        fixed_token_display = fix_token(token)  # Apply display fixes
        # Handle potential errors during ID conversion (e.g., tokens unknown to the vocab)
        try:
            token_id = tokenizer.convert_tokens_to_ids(token)
        except KeyError:
            token_id = (
                tokenizer.unk_token_id if hasattr(tokenizer, "unk_token_id") else -1
            )  # Use the UNK id or -1

        llm_token_data.append(
            {
                "original": token,
                "display": fixed_token_display,
                "colors": colors,
                "is_newline": "↵"
                in fixed_token_display,  # Check if it represents a newline
                "token_id": token_id,
                "token_index": idx,
            }
        )

    # Calculate statistics using the full token list
    token_stats = get_token_stats(all_tokens, current_text)

    # Construct HTML for the colored tokens
    html_parts = []
    for item in llm_token_data:
        # Use pre-wrap to respect spaces and newlines within the token display
        style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
        # Add a title attribute for hover info (original token + ID)
        title = f"Original: {item['original']}\nID: {item['token_id']}"
        display_content = (
            str(item["token_id"]) if show_ids_switch.value else item["display"]
        )
        html_parts.append(
            f'<span style="{style}" title="{title}">{display_content}</span>'
        )

    token_viz_html = mo.Html(
        f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>'
    )

    basic_stats = token_stats["basic_stats"]
    length_stats = token_stats["length_stats"]

    basic_stats_md = "**Basic Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in basic_stats.items()
    )
    length_stats_md = "**Length (Character) Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in length_stats.items()
    )

    mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}

{show_ids_switch}

## Tokenizer output

{mo.as_html(token_viz_html)}

## Token Statistics

{basic_stats_md}

{length_stats_md}
""")
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()