Spaces:
Running
Running
File size: 654 Bytes
e49dd9d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
"""Implement dictionary-based LID for lemmatizer"""
from lemmatizer.utils import Idiom
def _get_counts(toks: list, fast_dict: set):
t = 0
for tok in toks:
if tok.lower() in fast_dict:
t += 1
try:
out = t / len(toks)
except ZeroDivisionError:
out = 0.0
return out
def get_scores(toks, in_voc: dict[Idiom, set]) -> dict[Idiom, float]:
    """Score a tokenized document against the vocabulary of every idiom.

    For each key of ``in_voc``, the result holds the proportion of ``toks``
    found in that key's vocabulary set, as computed by ``_get_counts``.
    """
    return {
        idiom: _get_counts(toks, vocab)
        for idiom, vocab in in_voc.items()
    }
|