import tiktoken
from collections import Counter
from transformers import AutoTokenizer

def analyze_tokens_detailed(text, model):
    """
    For a given text and model, returns a list of dicts with details for each token:
    - token string
    - token id
    - decoded value
    - token length
    - NSL value (token length / max token length in the sequence)
    - subword fertility (number of tokens per word)
    Also returns the decoded output for the entire sequence.
    """
    # Pick the tokenizer: tiktoken for OpenAI GPT models, Hugging Face otherwise.
    if 'gpt' in model:
        tokenizer = tiktoken.encoding_for_model(model)
        token_ids = tokenizer.encode(text)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model)
        token_ids = tokenizer.encode(text, add_special_tokens=False)

    # Decode each id individually to get per-token strings, plus the full sequence.
    tokens = [tokenizer.decode([tid]) for tid in token_ids]
    decoded_output = tokenizer.decode(token_ids)

    # NSL: each token's length normalized by the longest token in the sequence.
    token_lengths = [len(t) for t in tokens]
    max_token_length = max(token_lengths) if token_lengths else 1
    nsl_values = [length / max_token_length for length in token_lengths]

    # Subword fertility: map each token back to the whitespace-separated word it
    # starts in, then count how many tokens each word was split into. This assumes
    # the decoded tokens concatenate back to the original text (true for plain BPE
    # tokenizers; tokenizers that normalize the input may drift slightly).
    words = text.split()
    if len(words) > 0:
        # Character span (start, end) of every word in the original text.
        word_spans = []
        cursor = 0
        for word in words:
            start = text.index(word, cursor)
            word_spans.append((start, start + len(word)))
            cursor = start + len(word)

        # Record the index of the word each token starts in (-1 for tokens that
        # contribute only whitespace or fall outside every word span).
        token_word_map = []
        text_pointer = 0
        for token in tokens:
            stripped = token.lstrip()
            if stripped:
                # Skip any leading whitespace the decoded token carries
                # (e.g. " world") to find its first visible character.
                char_pos = text_pointer + (len(token) - len(stripped))
                word_idx = next(
                    (i for i, (start, end) in enumerate(word_spans) if start <= char_pos < end),
                    -1,
                )
            else:
                # Purely-whitespace token: belongs to no word.
                word_idx = -1
            token_word_map.append(word_idx)
            text_pointer += len(token)

        # Tokens per word, then the fertility of the word each token belongs to.
        fertility_counter = Counter(token_word_map)
        subword_fertility = [fertility_counter[i] for i in range(len(words))]
        token_fertility = [subword_fertility[idx] if idx >= 0 else 0 for idx in token_word_map]
    else:
        token_fertility = [1 for _ in tokens]

    # Build the per-token detail table. The decoded value equals the token string
    # here because each token string was produced by decoding its id individually.
    table = []
    for token, tid, length, nsl, fert in zip(tokens, token_ids, token_lengths, nsl_values, token_fertility):
        table.append({
            'token': token,
            'token_id': tid,
            'decoded': token,
            'token_length': length,
            'nsl': nsl,
            'subword_fertility': fert
        })
    return {
        'model': model,
        'decoded_output': decoded_output,
        'tokens': table
    }
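
# A minimal usage sketch, not part of the original listing. Assumptions: the model
# name below is illustrative ("bert-base-uncased" is fetched from the Hugging Face
# Hub on first use; a name containing "gpt", e.g. "gpt-4o", would route through
# tiktoken instead).
if __name__ == "__main__":
    report = analyze_tokens_detailed("Tokenization is unbelievably fun", "bert-base-uncased")
    print(report['model'], '->', report['decoded_output'])
    for row in report['tokens']:
        # One line per token: id, surface form, normalized length, tokens-per-word.
        print(row['token_id'], repr(row['token']), round(row['nsl'], 3), row['subword_fertility'])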