| """ | |
| Evaluate compression ratio of the tokenizer. | |
| """ | |
| from nanochat.tokenizer import get_tokenizer, RustBPETokenizer | |
| from nanochat.dataset import parquets_iter_batched | |
| # Random text I got from a random website this morning | |
| news_text = r""" | |
(Washington, D.C., July 9, 2025)- Yesterday, Mexico's National Service of Agro-Alimentary Health, Safety, and Quality (SENASICA) reported a new case of New World Screwworm (NWS) in Ixhuatlan de Madero, Veracruz in Mexico, which is approximately 160 miles northward of the current sterile fly dispersal grid, on the eastern side of the country and 370 miles south of the U.S./Mexico border. This new northward detection comes approximately two months after northern detections were reported in Oaxaca and Veracruz, less than 700 miles away from the U.S. border, which triggered the closure of our ports to Mexican cattle, bison, and horses on May 11, 2025.
While USDA announced a risk-based phased port re-opening strategy for cattle, bison, and equine from Mexico beginning as early as July 7, 2025, this newly reported NWS case raises significant concern about the previously reported information shared by Mexican officials and severely compromises the outlined port reopening schedule of five ports from July 7-September 15. Therefore, in order to protect American livestock and our nation's food supply, Secretary Rollins has ordered the closure of livestock trade through southern ports of entry effective immediately.
"The United States has promised to be vigilant - and after detecting this new NWS case, we are pausing the planned port reopening's to further quarantine and target this deadly pest in Mexico. We must see additional progress combatting NWS in Veracruz and other nearby Mexican states in order to reopen livestock ports along the Southern border," said U.S. Secretary of Agriculture Brooke L. Rollins. "Thanks to the aggressive monitoring by USDA staff in the U.S. and in Mexico, we have been able to take quick and decisive action to respond to the spread of this deadly pest."
""".strip()

# Random Korean text (to test non-English compression)
korean_text = r"""
정직한 사실 앞에, 공정한 시선을 더하다
Herald Korea Times
헤럴드코리아타임즈는 정치, 경제, 사회, 문화 등 한국 사회 전반의 주요 이슈를 심도 있게 다루는 종합 온라인 신문사입니다.
우리는 단순히 뉴스를 전달하는 것이 아니라, 사실(Fact)에 기반한 양측의 시각을 균형 있게 조명하며, 독자 여러분이 스스로 판단할 수 있는 '정보의 균형'을 제공합니다.
한국 언론의 오랜 문제로 지적되어 온 정치적 편향, 이념적 왜곡에서 벗어나
오직 정직함과 공정함을 원칙으로 삼는 언론을 지향합니다.
어느 한쪽의 주장만을 확대하거나 감추지 않고,
**모든 의제에 대해 '무엇이 쟁점인지', '누가 무엇을 주장하는지', '사실은 무엇인지'**를 명확히 전달하는 데 집중합니다.
""".strip()

# Random piece of code
code_text = r"""
class BasicTokenizer(Tokenizer):
    def __init__(self):
        super().__init__()
    def train(self, text, vocab_size, verbose=False):
        assert vocab_size >= 256
        num_merges = vocab_size - 256
        # input text preprocessing
        text_bytes = text.encode("utf-8") # raw bytes
        ids = list(text_bytes) # list of integers in range 0..255
        # iteratively merge the most common pairs to create new tokens
        merges = {} # (int, int) -> int
        vocab = {idx: bytes([idx]) for idx in range(256)} # int -> bytes
        for i in range(num_merges):
            # count up the number of times every consecutive pair appears
            stats = get_stats(ids)
            # find the pair with the highest count
            pair = max(stats, key=stats.get)
            # mint a new token: assign it the next available id
            idx = 256 + i
            # replace all occurrences of pair in ids with idx
            ids = merge(ids, pair, idx)
            # save the merge
            merges[pair] = idx
            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
            # prints
            if verbose:
                print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
""".strip()

math_text = r"""
\documentclass[12pt]{article}
\usepackage{amsmath,amsthm,amssymb}
\usepackage[margin=1in]{geometry}
\newtheorem{theorem}{Theorem}
\newtheorem*{remark}{Remark}
\begin{document}
\begin{center}
{\Large A Cute Identity: The Sum of Cubes is a Square}
\end{center}
\begin{theorem}
For every integer $n \ge 1$,
\[
\sum_{k=1}^{n} k^{3} \;=\; \left(\frac{n(n+1)}{2}\right)^{2}.
\]
\end{theorem}
\begin{proof}[Proof 1 (Induction)]
Let $S(n) = \sum_{k=1}^{n} k^3$. For $n=1$, $S(1)=1=(1\cdot 2/2)^2$, so the base case holds.
Assume $S(n)=\big(\tfrac{n(n+1)}{2}\big)^2$ for some $n\ge 1$.
Then
\[
S(n+1)
= S(n) + (n+1)^3
= \left(\frac{n(n+1)}{2}\right)^2 + (n+1)^3.
\]
Factor out $(n+1)^2$:
\[
S(n+1)
= (n+1)^2\left( \frac{n^2}{4} + (n+1) \right)
= (n+1)^2\left( \frac{n^2 + 4n + 4}{4} \right)
= (n+1)^2\left( \frac{(n+2)^2}{4} \right).
\]
Thus
\[
S(n+1)=\left(\frac{(n+1)(n+2)}{2}\right)^2,
\]
which matches the claimed formula with $n$ replaced by $n+1$. By induction, the identity holds for all $n\ge 1$.
\end{proof}
\begin{proof}[Proof 2 (Algebraic telescoping)]
Recall the binomial identity
\[
(k+1)^4 - k^4 = 4k^3 + 6k^2 + 4k + 1.
\]
Summing both sides from $k=0$ to $n$ telescopes:
\[
(n+1)^4 - 0^4
= \sum_{k=0}^{n}\big(4k^3 + 6k^2 + 4k + 1\big)
= 4\sum_{k=1}^{n}k^3 + 6\sum_{k=1}^{n}k^2 + 4\sum_{k=1}^{n}k + (n+1).
\]
Using the standard sums
\[
\sum_{k=1}^{n}k = \frac{n(n+1)}{2}
\quad\text{and}\quad
\sum_{k=1}^{n}k^2 = \frac{n(n+1)(2n+1)}{6},
\]
solve for $\sum_{k=1}^{n}k^3$ to get
\[
\sum_{k=1}^{n}k^3 = \left(\frac{n(n+1)}{2}\right)^2.
\]
\end{proof}
\begin{remark}
Geometrically, the identity says: ``adding up $1^3,2^3,\dots,n^3$ builds a perfect square''---namely the square of the $n$th triangular number. This is why one sometimes calls it the \emph{sum-of-cubes is a square} phenomenon.
\end{remark}
\end{document}
""".strip()

science_text = r"""
Photosynthesis is a photochemical energy transduction process in which light-harvesting pigment–protein complexes within the thylakoid membranes of oxygenic phototrophs absorb photons and initiate charge separation at the reaction center, driving the linear electron transport chain from water to NADP⁺ via photosystem II, the cytochrome b₆f complex, and photosystem I, concomitantly generating a trans-thylakoid proton motive force utilized by chloroplastic ATP synthase. The light-dependent reactions produce ATP and NADPH, which fuel the Calvin–Benson–Bassham cycle in the stroma, wherein ribulose-1,5-bisphosphate is carboxylated by ribulose-1,5-bisphosphate carboxylase/oxygenase (RuBisCO) to form 3-phosphoglycerate, subsequently reduced and regenerated through a series of enzymatic steps, enabling net assimilation of CO₂ into triose phosphates and ultimately carbohydrates. This process is tightly regulated by photoprotective mechanisms, redox feedback, and metabolite flux, representing a central biochemical pathway coupling solar energy capture to the biosphere's primary productivity.
""".strip()

# The tokenizer was trained on data from earlier shards, so it has seen this data
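# (only the first batch of documents is drawn from each split here)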
train_docs = next(parquets_iter_batched(split="train"))
train_text = "\n".join(train_docs)
val_docs = next(parquets_iter_batched(split="val"))
val_text = "\n".join(val_docs)

all_text = [
    ("news", news_text),
    ("korean", korean_text),
    ("code", code_text),
    ("math", math_text),
    ("science", science_text),
    ("fwe-train", train_text),
]
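# the val split can be empty, so only include it when there is text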
if val_text:
    all_text.append(("fwe-val", val_text))

# Try out current default compared to GPT-2 and GPT-4 tokenizers
tokenizer_results = {}
vocab_sizes = {}

for tokenizer_name in ["gpt2", "gpt4", "ours"]:
    if tokenizer_name == "gpt2":
        tokenizer = RustBPETokenizer.from_pretrained("gpt2") # gpt-2 base model tokenizer
    elif tokenizer_name == "gpt4":
        tokenizer = RustBPETokenizer.from_pretrained("cl100k_base") # gpt-4 base model tokenizer
    else:
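        # "ours": the tokenizer trained by this repo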
        tokenizer = get_tokenizer()
    vocab_sizes[tokenizer_name] = tokenizer.get_vocab_size()
    tokenizer_results[tokenizer_name] = {}
    for name, text in all_text:
        encoded = tokenizer.encode(text)
        decoded = tokenizer.decode(encoded)
        assert decoded == text
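        # compression ratio = UTF-8 bytes per token (higher means better compression)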
        encoded_bytes = text.encode('utf-8')
        ratio = len(encoded_bytes) / len(encoded)
        tokenizer_results[tokenizer_name][name] = {
            'bytes': len(encoded_bytes),
            'tokens': len(encoded),
            'ratio': ratio
        }

# ANSI color codes
GREEN = '\033[92m'
RED = '\033[91m'
RESET = '\033[0m'

# Print vocab sizes
print(f"\nVocab sizes:")
print(f"GPT-2: {vocab_sizes['gpt2']}")
print(f"GPT-4: {vocab_sizes['gpt4']}")
print(f"Ours: {vocab_sizes['ours']}")

def print_comparison(baseline_name, baseline_results, ours_results, all_text):
    """Print comparison table between baseline tokenizer and ours."""
    print(f"\nComparison with {baseline_name}:")
    print("=" * 95)
    print(f"{'Text Type':<10} {'Bytes':<8} {baseline_name:<15} {'Ours':<15} {'Relative':<12} {'Better':<10}")
    print(f"{'':10} {'':8} {'Tokens':<7} {'Ratio':<7} {'Tokens':<7} {'Ratio':<7} {'Diff %':<12}")
    print("-" * 95)
    for name, text in all_text:
        baseline_data = baseline_results[name]
        ours_data = ours_results[name]
        # Calculate relative difference (positive means ours is better, negative means worse)
        # Using tokens: fewer tokens is better, so we calculate (baseline_tokens - ours_tokens) / baseline_tokens
        relative_diff = ((baseline_data['tokens'] - ours_data['tokens']) / baseline_data['tokens']) * 100
        # Determine which has better compression (higher ratio = better)
        if baseline_data['ratio'] > ours_data['ratio']:
            baseline_color, ours_color = GREEN, RED
            better = baseline_name
            diff_color = RED
        elif ours_data['ratio'] > baseline_data['ratio']:
            baseline_color, ours_color = RED, GREEN
            better = "Ours"
            diff_color = GREEN
        else:
            baseline_color, ours_color = "", ""
            better = "Tie"
            diff_color = ""
        print(f"{name:<10} {baseline_data['bytes']:<8} "
              f"{baseline_color}{baseline_data['tokens']:<7}{RESET} "
              f"{baseline_color}{baseline_data['ratio']:<7.2f}{RESET} "
              f"{ours_color}{ours_data['tokens']:<7}{RESET} "
              f"{ours_color}{ours_data['ratio']:<7.2f}{RESET} "
              f"{diff_color}{relative_diff:+7.1f}%{RESET} "
              f"{better:<10}")

# Print comparisons
print_comparison("GPT-2", tokenizer_results['gpt2'], tokenizer_results['ours'], all_text)
print_comparison("GPT-4", tokenizer_results['gpt4'], tokenizer_results['ours'], all_text)

# Log to report
from nanochat.report import get_report
lines = []
for baseline_name in ["GPT-2", "GPT-4"]:
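    # map the display name to the results key, e.g. "GPT-2" -> "gpt2"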
    baseline_key = baseline_name.lower().replace('-', '')
    baseline_results = tokenizer_results[baseline_key]
    ours_results = tokenizer_results['ours']
    lines.append(f"### Comparison with {baseline_name}")
    lines.append("")
    lines.append("| Text Type | Bytes | " + baseline_name + " Tokens | " + baseline_name + " Ratio | Ours Tokens | Ours Ratio | Relative Diff % |")
    lines.append("|-----------|-------|--------------|--------------|-------------|------------|-----------------|")
    for name, text in all_text:
        baseline_data = baseline_results[name]
        ours_data = ours_results[name]
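        # positive means ours used fewer tokens than the baseline on this text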
        relative_diff = ((baseline_data['tokens'] - ours_data['tokens']) / baseline_data['tokens']) * 100
        lines.append(f"| {name} | {baseline_data['bytes']} | {baseline_data['tokens']} | {baseline_data['ratio']:.2f} | {ours_data['tokens']} | {ours_data['ratio']:.2f} | {relative_diff:+.1f}% |")
    lines.append("")
report_markdown = "\n".join(lines)
get_report().log(section="Tokenizer evaluation", data=[
    report_markdown,
])