13Aluminium committed · verified
Commit b8a2acb · 1 Parent(s): e1189d2

Update app.py

Files changed (1): app.py (+23 −23)

app.py CHANGED
@@ -29,51 +29,51 @@ class SanskritMorphemeTokenizer:
        - Then minimizes total pieces
        - Then prefers longer known morphemes
        """
        if not word:
            return [self.unk_token]

        from functools import lru_cache

        # optional: cap how far we look ahead; adjust if your morphemes are very long
        max_morph_len = min(30, len(word))

        @lru_cache(None)
        def best(i: int):
            # returns (unk_count, pieces_count, -avg_known_len, pieces_list)
            if i == len(word):
                return (0, 0, 0.0, [])

            best_tuple = (10**9, 10**9, 0.0, [self.unk_token])  # big sentinel

            # try all prefixes starting at i
            for j in range(i + 1, min(len(word), i + max_morph_len) + 1):
                piece = word[i:j]
                is_known = piece in self.morpheme_vocab

                # cost for this piece
                piece_unk = 0 if is_known else 1

                # recurse for the remainder
                tail = best(j)
                unk_count = piece_unk + tail[0]
                pieces_count = 1 + tail[1]

                # score tiebreak: prefer longer known pieces
                known_len = len(piece) if is_known else 0
                # for averaging, combine with tail's average (stored as negative)
                # to keep scoring monotonic, we'll compute a simple total-known-len
                total_known_len = known_len + (-tail[2]) * max(1, tail[1])  # invert back

                # pack a comparable tuple:
                # 1) fewer UNKs, 2) fewer pieces, 3) longer total known length
                candidate = (unk_count, pieces_count, -(total_known_len / pieces_count), [piece] + tail[3])

                if candidate < best_tuple:
                    best_tuple = candidate

            return best_tuple

        return best(0)[3]


    def tokenize(self, text: str):
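For reference, here is a minimal standalone sketch of the segmentation logic in this hunk, handy for trying the scoring outside the app. The function name segment_word, its signature, and the romanized example morphemes below are assumptions chosen only for illustration; in app.py the logic lives on SanskritMorphemeTokenizer and reads self.morpheme_vocab and self.unk_token.

# Standalone sketch (assumed names; not the API exposed by app.py).
from functools import lru_cache

def segment_word(word, morpheme_vocab, unk_token="[UNK]"):
    if not word:
        return [unk_token]
    max_morph_len = min(30, len(word))

    @lru_cache(None)
    def best(i):
        # (unk_count, pieces_count, -avg_known_len, pieces)
        if i == len(word):
            return (0, 0, 0.0, [])
        best_tuple = (10**9, 10**9, 0.0, [unk_token])
        for j in range(i + 1, min(len(word), i + max_morph_len) + 1):
            piece = word[i:j]
            is_known = piece in morpheme_vocab
            tail = best(j)
            unk_count = (0 if is_known else 1) + tail[0]
            pieces_count = 1 + tail[1]
            known_len = len(piece) if is_known else 0
            total_known_len = known_len + (-tail[2]) * max(1, tail[1])
            candidate = (unk_count, pieces_count,
                         -(total_known_len / pieces_count), [piece] + tail[3])
            if candidate < best_tuple:
                best_tuple = candidate
        return best_tuple

    return best(0)[3]

# Toy, made-up romanized vocabulary:
vocab = {"rama", "asya", "ram", "a", "gacch", "ati", "gaccha"}
print(segment_word("ramaasya", vocab))  # ['rama', 'asya']  (fewest pieces among all-known splits)
print(segment_word("gacchati", vocab))  # ['gacch', 'ati']  (0 UNKs beats 'gaccha' + unknown 'ti')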