# /// script
# [tool.marimo.runtime]
# auto_instantiate = false
# ///

import marimo

__generated_with = "0.13.0"
app = marimo.App(width="medium")


@app.cell
def _():
    import hashlib
    import math

    import altair as alt
    import marimo as mo
    import polars as pl
    import spacy
    from transformers import AutoTokenizer

    # Load spaCy models for English and Japanese
    nlp_en = spacy.load("en_core_web_md")
    nlp_ja = spacy.load("ja_core_news_md")

    # List of LLM tokenizer models to compare
    llm_model_choices = [
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "google/gemma-3-27b-it",
        "deepseek-ai/DeepSeek-R1",
        "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        "Qwen/Qwen2.5-72B-Instruct",
        "google-bert/bert-large-uncased",
        "openai-community/gpt2",
    ]
    return (
        AutoTokenizer,
        alt,
        hashlib,
        llm_model_choices,
        math,
        mo,
        nlp_en,
        nlp_ja,
        pl,
    )
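# NOTE: the spaCy models above must be installed separately
# (e.g. `python -m spacy download en_core_web_md`), and some of the listed
# tokenizers (e.g. the meta-llama and google/gemma models) are gated on the
# Hugging Face Hub, so loading them requires accepting the model license and
# authenticating with a Hugging Face token.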


@app.cell
def _(mo):
    mo.md("# Tokenization for English and Japanese")
    return


@app.cell
def _(mo):
    # Central state for the text input content
    get_text_content, set_text_content = mo.state("")
    return get_text_content, set_text_content
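# mo.state returns a (getter, setter) pair; any cell that reads the getter re-runs
# whenever the setter is called, which is what lets the "Use Placeholder Text"
# button further down push text into the text area.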


@app.cell
def _(mo):
    # Placeholder texts
    en_placeholder = """
Mrs. Ferrars died on the night of the 16th–17th September—a Thursday. I was sent for at eight o’clock on the morning of Friday the 17th. There was nothing to be done. She had been dead some hours.
""".strip()

    ja_placeholder = """
吾輩は猫である。名前はまだ無い。
どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。
""".strip()

    # Create UI element for language selection
    language_selector = mo.ui.radio(
        options=["English", "Japanese"], value="English", label="Language"
    )

    # Return selector and placeholders
    return en_placeholder, ja_placeholder, language_selector


@app.cell
def _(
    en_placeholder,
    get_text_content,
    ja_placeholder,
    language_selector,
    mo,
    set_text_content,
):
    # Define text_input dynamically based on the selected language
    current_placeholder = (
        en_placeholder if language_selector.value == "English" else ja_placeholder
    )
    text_input = mo.ui.text_area(
        # Read value from state
        value=get_text_content(),
        label="Enter text",
        placeholder=current_placeholder,
        full_width=True,
        # Update state on user input
        on_change=lambda v: set_text_content(v),
    )
    return current_placeholder, text_input


@app.cell
def _(current_placeholder, mo, set_text_content):
    def apply_placeholder():
        set_text_content(current_placeholder)

    apply_placeholder_button = mo.ui.button(
        label="Use Placeholder Text", on_click=lambda _: apply_placeholder()
    )
    return (apply_placeholder_button,)
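# Clicking the button writes the current placeholder into the shared state; the
# text_area cell (which reads get_text_content()) then re-runs and picks it up.
# Routing the value through mo.state like this is the usual marimo pattern for
# setting one UI element's contents from another element.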


@app.cell
def _(apply_placeholder_button, language_selector, mo, text_input):
    mo.vstack(
        [
            text_input,
            mo.hstack([language_selector, apply_placeholder_button], justify="start"),
        ]
    )
    return


@app.cell
def _(get_text_content, language_selector, mo, nlp_en, nlp_ja):
    # Analyze the text with spaCy, using the pipeline for the selected language
    # Read text from state
    current_text = get_text_content()
    if language_selector.value == "English":
        doc = nlp_en(current_text)
    else:
        doc = nlp_ja(current_text)

    # Tokenized version and count
    tokenized_text = [token.text for token in doc]
    token_count = len(tokenized_text)

    mo.md(
        f"**Tokenized Text:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}"
    )
    return current_text, doc
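# en_core_web_md segments text with spaCy's rule-based tokenizer, while
# ja_core_news_md delegates word segmentation to SudachiPy, so the two languages
# can differ noticeably in token granularity for comparable text.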


@app.cell
def _(doc, mo, pl):
    # Create a polars DataFrame with token attributes
    token_data = pl.DataFrame(
        {
            "Token": [token.text for token in doc],
            "Lemma": [token.lemma_ for token in doc],
            "POS": [token.pos_ for token in doc],
            "Tag": [token.tag_ for token in doc],
            "Morph": [
                str(token.morph) for token in doc
            ],  # To be more precise, this could be expanded into one column per feature via token.morph.to_dict()
            "Token Position": list(range(len(doc))),
            "Sentence Number": [
                i for i, sent in enumerate(doc.sents) for token in sent
            ],
        }
    )

    mo.ui.dataframe(token_data, page_size=50)
    return (token_data,)
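# token.morph stringifies to a feature bundle such as "Number=Sing|Person=3";
# token.morph.to_dict() returns the same information as a dict, e.g.
# {"Number": "Sing", "Person": "3"}, which is handy if you want one column per feature.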


@app.cell
def _(mo):
    # Create UI element for selecting the column to visualize
    column_selector = mo.ui.dropdown(
        options=["POS", "Tag", "Lemma", "Token", "Morph"],
        value="POS",
        label="Select column to visualize",
    )
    column_selector
    return (column_selector,)


@app.cell
def _(alt, column_selector, mo, token_data):
    mo.stop(token_data.is_empty(), "Please set input text.")

    selected_column = column_selector.value

    # Calculate value counts for the selected column
    counts_df = (
        token_data[selected_column]
        .value_counts()
        .sort(by=["count", selected_column], descending=[True, False])
    )

    chart = (
        alt.Chart(counts_df)
        .mark_bar()
        .encode(
            x=alt.X("count", title="Frequency"),
            y=alt.Y(selected_column, title=selected_column, sort=None),
            tooltip=[selected_column, "count"],
        )
        .properties(title=f"{selected_column} Distribution")
        .interactive()
    )

    mo.ui.altair_chart(chart)
    return
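# sort=None on the y-encoding keeps the row order of counts_df (most frequent first,
# ties broken alphabetically) instead of Altair's default alphabetical axis ordering.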


@app.cell
def _(llm_model_choices, mo):
    # UI for selecting the LLM tokenizer model
    llm_tokenizer_selector = mo.ui.dropdown(
        options=llm_model_choices,
        value=llm_model_choices[-1],  # Default to gpt2 for faster initial loading
        label="Select LLM Tokenizer Model",
    )
    llm_tokenizer_selector
    return (llm_tokenizer_selector,)


@app.cell
def _(AutoTokenizer, llm_tokenizer_selector):
    # Load the selected tokenizer
    # Adapted from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py
    # This cell re-runs whenever llm_tokenizer_selector.value changes
    selected_model_name = llm_tokenizer_selector.value
    tokenizer = AutoTokenizer.from_pretrained(selected_model_name)
    return (tokenizer,)
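# AutoTokenizer.from_pretrained downloads the tokenizer files on first use and caches
# them on disk (under ~/.cache/huggingface by default), so switching back to a
# previously selected model is fast even though this cell re-runs on every change.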


@app.cell
def _(math):
    # Function to calculate token statistics
    def get_token_stats(tokens: list, original_text: str) -> dict:
        """Calculate enhanced statistics about the tokens."""
        if not tokens:
            return {  # Return the default structure even for empty input
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0,
                },
                "length_stats": {
                    "avg_length": 0,
                    "std_dev": 0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0,
                },
            }

        total_tokens = len(tokens)
        unique_tokens = len(set(tokens))

        # total_tokens > 0 is guaranteed by the early return above, but guard anyway
        avg_length = (
            sum(len(t) for t in tokens) / total_tokens if total_tokens > 0 else 0
        )
        compression_ratio = len(original_text) / total_tokens if total_tokens > 0 else 0

        # Token type analysis (note: these heuristics vary between tokenizers)
        # startswith(("Ġ", "▁")) covers the common space markers: BPE's "Ġ" and SentencePiece's "▁" (U+2581)
        space_tokens = sum(1 for t in tokens if t.startswith(("Ġ", "▁")))
        # Check for common newline representations
        newline_tokens = sum(
            1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>"
        )
        # A broad definition of special/control tokens based on common patterns
        special_tokens = sum(
            1
            for t in tokens
            if (t.startswith("<") and t.endswith(">"))
            or (t.startswith("[") and t.endswith("]"))
        )
        # Simple punctuation check (single-character, non-alphanumeric tokens; may overlap with other categories)
        punctuation_tokens = sum(
            1
            for t in tokens
            if len(t) == 1 and not t.isalnum() and t not in [" ", "\n", "Ġ", "Ċ"]
        )

        # Length distribution
        lengths = [len(t) for t in tokens]
        if not lengths:  # Should not happen if tokens is non-empty, but keep as a safety net
            return {
                "basic_stats": {
                    "total_tokens": 0,
                    "unique_tokens": 0,
                    "compression_ratio": 0,
                    "space_tokens": 0,
                    "newline_tokens": 0,
                    "special_tokens": 0,
                    "punctuation_tokens": 0,
                    "unique_percentage": 0,
                },
                "length_stats": {
                    "avg_length": 0,
                    "std_dev": 0,
                    "min_length": 0,
                    "max_length": 0,
                    "median_length": 0,
                },
            }

        mean_length = sum(lengths) / len(lengths)
        variance = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
        std_dev = math.sqrt(variance)
        sorted_lengths = sorted(lengths)
        median_length = sorted_lengths[len(lengths) // 2] if lengths else 0

        return {
            "basic_stats": {
                "total_tokens": total_tokens,
                "unique_tokens": unique_tokens,
                "compression_ratio": round(compression_ratio, 2),
                "space_tokens": space_tokens,
                "newline_tokens": newline_tokens,
                "special_tokens": special_tokens,
                "punctuation_tokens": punctuation_tokens,
                "unique_percentage": round(unique_tokens / total_tokens * 100, 1)
                if total_tokens > 0
                else 0,
            },
            "length_stats": {
                "avg_length": round(avg_length, 2),
                "std_dev": round(std_dev, 2),
                "min_length": min(lengths) if lengths else 0,
                "max_length": max(lengths) if lengths else 0,
                "median_length": median_length,
            },
        }

    return (get_token_stats,)
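# Illustrative example:
#   get_token_stats(["Hello", "Ġworld", "!"], "Hello world!")
# yields total_tokens=3, unique_tokens=3, compression_ratio=4.0, space_tokens=1,
# punctuation_tokens=1, avg_length=4.0 and median_length=5.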


@app.cell
def _(hashlib):
    def get_varied_color(token: str) -> dict:
        """Generate vibrant colors with HSL for better visual distinction."""
        # md5 of the token is deterministic, so the same token gets the same color on every run
        token_hash = hashlib.md5(token.encode()).hexdigest()
        hue = int(token_hash[:3], 16) % 360
        saturation = 70 + (int(token_hash[3:5], 16) % 20)  # Saturation between 70-90%
        lightness = 80 + (
            int(token_hash[5:7], 16) % 10
        )  # Lightness between 80-90% (light background)
        # Ensure the text color contrasts well with the light background
        text_lightness = 20  # Dark text for light background
        return {
            "background": f"hsl({hue}, {saturation}%, {lightness}%)",
            "text": f"hsl({hue}, {saturation}%, {text_lightness}%)",
        }

    return (get_varied_color,)


# Top-level helper registered with the app so cells can use it without passing it around
@app.function
def fix_token(token: str) -> str:
    """Fix token for display with improved space visualization."""
    # Replace the SentencePiece space marker (U+2581) with a middle dot
    token = token.replace("▁", "·")
    # Replace the BPE space marker "Ġ" with a middle dot
    if token.startswith("Ġ"):
        space_count = token.count("Ġ")
        return "·" * space_count + token[space_count:]
    # Replace newline markers with a visible symbol plus an actual newline
    token = token.replace("Ċ", "↵\n")
    token = token.replace("<0x0A>", "↵\n")  # Byte-level representation of newline
    return token
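# e.g. fix_token("Ġworld") -> "·world" and fix_token("▁世界") -> "·世界";
# newline markers become "↵" followed by a real line break.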


@app.function
def get_tokenizer_info(tokenizer):
    """
    Extract useful information from a tokenizer.
    Returns a dictionary with tokenizer details.
    """
    info = {}
    try:
        # Get vocabulary size
        if hasattr(tokenizer, "vocab_size"):
            info["vocab_size"] = tokenizer.vocab_size
        elif hasattr(tokenizer, "get_vocab"):
            info["vocab_size"] = len(tokenizer.get_vocab())

        # Get model max length if available
        if (
            hasattr(tokenizer, "model_max_length")
            and tokenizer.model_max_length < 1000000
        ):  # Sanity check for realistic values
            info["model_max_length"] = tokenizer.model_max_length
        else:
            info["model_max_length"] = "Not specified or very large"

        # Check tokenizer type
        info["tokenizer_type"] = tokenizer.__class__.__name__

        # Get special tokens using the recommended attributes/methods
        special_tokens = {}
        # Prefer all_special_tokens if available
        if hasattr(tokenizer, "all_special_tokens"):
            for token in tokenizer.all_special_tokens:
                # Try to find the attribute name corresponding to the token value
                token_name = "unknown_special_token"  # Default name
                for attr_name in [
                    "pad_token",
                    "eos_token",
                    "bos_token",
                    "sep_token",
                    "cls_token",
                    "unk_token",
                    "mask_token",
                ]:
                    if (
                        hasattr(tokenizer, attr_name)
                        and getattr(tokenizer, attr_name) == token
                    ):
                        token_name = attr_name
                        break
                if token and str(token).strip():
                    special_tokens[token_name] = str(token)
        else:
            # Fall back to checking the individual attributes
            for token_name in [
                "pad_token",
                "eos_token",
                "bos_token",
                "sep_token",
                "cls_token",
                "unk_token",
                "mask_token",
            ]:
                if (
                    hasattr(tokenizer, token_name)
                    and getattr(tokenizer, token_name) is not None
                ):
                    token_value = getattr(tokenizer, token_name)
                    if token_value and str(token_value).strip():
                        special_tokens[token_name] = str(token_value)

        info["special_tokens"] = special_tokens if special_tokens else "None found"
    except Exception as e:
        info["error"] = f"Error extracting tokenizer info: {str(e)}"

    return info
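# For example, the default gpt2 tokenizer typically yields something like:
#   {"vocab_size": 50257, "model_max_length": 1024,
#    "tokenizer_type": "GPT2TokenizerFast", "special_tokens": {"eos_token": "<|endoftext|>"}}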


@app.cell
def _(mo):
    show_ids_switch = mo.ui.switch(label="Show Token IDs instead of Text", value=False)
    return (show_ids_switch,)


@app.cell
def _(
    current_text,
    get_token_stats,
    get_varied_color,
    llm_tokenizer_selector,
    mo,
    show_ids_switch,
    tokenizer,
):
    # --- Tokenization and data preparation ---

    # Get tokenizer metadata
    tokenizer_info = get_tokenizer_info(tokenizer)

    # Tokenize the input text; tokenize() gives string tokens for analysis and display
    all_tokens = tokenizer.tokenize(current_text)
    total_token_count = len(all_tokens)

    # Limit the number of tokens rendered to avoid slowing down the browser
    display_limit = 1000
    display_tokens = all_tokens[:display_limit]
    display_limit_reached = total_token_count > display_limit

    # Generate data for visualization
    llm_token_data = []
    for idx, token in enumerate(display_tokens):
        colors = get_varied_color(token)
        fixed_token_display = fix_token(token)  # Apply display fixes
        # Handle potential errors during ID conversion (e.g., tokens unknown to the vocab)
        try:
            token_id = tokenizer.convert_tokens_to_ids(token)
        except KeyError:
            token_id = (
                tokenizer.unk_token_id if hasattr(tokenizer, "unk_token_id") else -1
            )  # Use the UNK id or -1

        llm_token_data.append(
            {
                "original": token,
                "display": fixed_token_display,
                "colors": colors,
                "is_newline": "↵"
                in fixed_token_display,  # Check if it represents a newline
                "token_id": token_id,
                "token_index": idx,
            }
        )

    # Calculate statistics using the full token list
    token_stats = get_token_stats(all_tokens, current_text)

    # Construct HTML for the colored tokens
    html_parts = []
    for item in llm_token_data:
        # Use pre-wrap to respect spaces and newlines within the token display
        style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
        # Add a title attribute for hover info (original token + ID)
        title = f"Original: {item['original']}\nID: {item['token_id']}"
        display_content = (
            str(item["token_id"]) if show_ids_switch.value else item["display"]
        )
        html_parts.append(
            f'<span style="{style}" title="{title}">{display_content}</span>'
        )

    token_viz_html = mo.Html(
        f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>'
    )

    basic_stats = token_stats["basic_stats"]
    length_stats = token_stats["length_stats"]

    basic_stats_md = "**Basic Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in basic_stats.items()
    )
    length_stats_md = "**Length (Character) Stats:**\n\n" + "\n".join(
        f"- **{key.replace('_', ' ').title()}:** `{value}`"
        for key, value in length_stats.items()
    )

    mo.md(f"""# LLM tokenizer: {llm_tokenizer_selector.value}

{show_ids_switch}

## Tokenizer output

{mo.as_html(token_viz_html)}

## Token Statistics

{basic_stats_md}

{length_stats_md}
""")
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()