import tiktoken
from collections import Counter
from transformers import AutoTokenizer

def analyze_tokens_detailed(text, model):
    """
    For a given text and model, returns a list of dicts with details for each token:
    - token string
    - token id
    - decoded value
    - token length
    - NSL value (token length / max token length in the sequence)
    - subword fertility (number of tokens per word)
    Also returns the decoded output for the entire sequence.
    """
    # Pick the tokenizer: tiktoken for OpenAI GPT models, Hugging Face otherwise.
    if 'gpt' in model:
        tokenizer = tiktoken.encoding_for_model(model)
        token_ids = tokenizer.encode(text)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model)
        token_ids = tokenizer.encode(text, add_special_tokens=False)

    # Decode each id individually to get per-token strings, plus the full sequence.
    tokens = [tokenizer.decode([tid]) for tid in token_ids]
    decoded_output = tokenizer.decode(token_ids)

    # NSL: each token's length normalized by the longest token in the sequence.
    token_lengths = [len(t) for t in tokens]
    max_token_length = max(token_lengths) if token_lengths else 1
    nsl_values = [length / max_token_length for length in token_lengths]

    # Subword fertility: map each token back to the whitespace-separated word it
    # starts in, then count how many tokens each word was split into. This assumes
    # the decoded tokens concatenate back to the original text (true for plain BPE
    # tokenizers; tokenizers that normalize the input may drift slightly).
    words = text.split()
    if len(words) > 0:
        # Character span (start, end) of every word in the original text.
        word_spans = []
        cursor = 0
        for word in words:
            start = text.index(word, cursor)
            word_spans.append((start, start + len(word)))
            cursor = start + len(word)

        # Record the index of the word each token starts in (-1 for tokens that
        # contribute only whitespace or fall outside every word span).
        token_word_map = []
        text_pointer = 0
        for token in tokens:
            stripped = token.lstrip()
            if stripped:
                # Skip any leading whitespace the decoded token carries
                # (e.g. " world") to find its first visible character.
                char_pos = text_pointer + (len(token) - len(stripped))
                word_idx = next(
                    (i for i, (start, end) in enumerate(word_spans) if start <= char_pos < end),
                    -1,
                )
            else:
                # Purely-whitespace token: belongs to no word.
                word_idx = -1
            token_word_map.append(word_idx)
            text_pointer += len(token)

        # Tokens per word, then the fertility of the word each token belongs to.
        fertility_counter = Counter(token_word_map)
        subword_fertility = [fertility_counter[i] for i in range(len(words))]
        token_fertility = [subword_fertility[idx] if idx >= 0 else 0 for idx in token_word_map]
    else:
        token_fertility = [1 for _ in tokens]

    # Build the per-token detail table. The decoded value equals the token string
    # here because each token string was produced by decoding its id individually.
    table = []
    for token, tid, length, nsl, fert in zip(tokens, token_ids, token_lengths, nsl_values, token_fertility):
        table.append({
            'token': token,
            'token_id': tid,
            'decoded': token,
            'token_length': length,
            'nsl': nsl,
            'subword_fertility': fert
        })
    return {
        'model': model,
        'decoded_output': decoded_output,
        'tokens': table
    }
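
# A minimal usage sketch, not part of the original listing. Assumptions: the model
# name below is illustrative ("bert-base-uncased" is fetched from the Hugging Face
# Hub on first use; a name containing "gpt", e.g. "gpt-4o", would route through
# tiktoken instead).
if __name__ == "__main__":
    report = analyze_tokens_detailed("Tokenization is unbelievably fun", "bert-base-uncased")
    print(report['model'], '->', report['decoded_output'])
    for row in report['tokens']:
        # One line per token: id, surface form, normalized length, tokens-per-word.
        print(row['token_id'], repr(row['token']), round(row['nsl'], 3), row['subword_fertility'])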