File size: 654 Bytes
e49dd9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
"""Implement dictionary-based LID for lemmatizer"""

from lemmatizer.utils import Idiom


def _get_counts(toks: list, fast_dict: set):
    """Return the share of tokens whose lowercased form is in ``fast_dict``.

    The result is the number of matching tokens divided by ``len(toks)``.
    An empty ``toks`` yields ``0.0`` instead of raising.
    """
    # Membership is tested on the lowercased token only.
    hits = sum(1 for token in toks if token.lower() in fast_dict)
    total = len(toks)

    # Guard the division explicitly: with no tokens there is nothing to score.
    return hits / total if total else 0.0


def get_scores(toks, in_voc: dict[Idiom, set]) -> dict[Idiom, float]:
    """Score a tokenized document against the vocabulary of every idiom.

    For each key of ``in_voc``, the associated vocabulary set is passed to
    ``_get_counts`` together with ``toks``; the returned proportion of
    in-vocabulary tokens is stored under that same key.
    """
    return {
        idiom: _get_counts(toks, vocabulary)
        for idiom, vocabulary in in_voc.items()
    }