Spaces:

13Aluminium
/

token

Sleeping

App Files Files Community

13Aluminium committed on Sep 4

Commit

b8a2acb

verified ·

1 Parent(s): e1189d2

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -23

app.py CHANGED Viewed

@@ -29,51 +29,51 @@ class SanskritMorphemeTokenizer:
     - Then minimizes total pieces
     - Then prefers longer known morphemes
     """
-    if not word:
-        return [self.unk_token]
-    from functools import lru_cache
     # optional: cap how far we look ahead; adjust if your morphemes are very long
-    max_morph_len = min(30, len(word))
-    @lru_cache(None)
-    def best(i: int):
         # returns (unk_count, pieces_count, -avg_known_len, pieces_list)
-        if i == len(word):
-            return (0, 0, 0.0, [])
-        best_tuple = (10**9, 10**9, 0.0, [self.unk_token])  # big sentinel
         # try all prefixes starting at i
-        for j in range(i + 1, min(len(word), i + max_morph_len) + 1):
-            piece = word[i:j]
-            is_known = piece in self.morpheme_vocab
             # cost for this piece
-            piece_unk = 0 if is_known else 1
             # recurse for the remainder
-            tail = best(j)
-            unk_count = piece_unk + tail[0]
-            pieces_count = 1 + tail[1]
             # score tiebreak: prefer longer known pieces
-            known_len = len(piece) if is_known else 0
             # for averaging, combine with tail's average (stored as negative)
             # to keep scoring monotonic, we’ll compute a simple total-known-len
-            total_known_len = known_len + (-tail[2]) * max(1, tail[1])  # invert back
             # pack a comparable tuple:
             # 1) fewer UNKs, 2) fewer pieces, 3) longer total known length
-            candidate = (unk_count, pieces_count, - (total_known_len / pieces_count), [piece] + tail[3])
-            if candidate < best_tuple:
-                best_tuple = candidate
-        return best_tuple
-    return best(0)[3]
     def tokenize(self, text: str):

     - Then minimizes total pieces
     - Then prefers longer known morphemes
     """
+        if not word:
+            return [self.unk_token]
+        from functools import lru_cache
     # optional: cap how far we look ahead; adjust if your morphemes are very long
+        max_morph_len = min(30, len(word))
+        @lru_cache(None)
+        def best(i: int):
         # returns (unk_count, pieces_count, -avg_known_len, pieces_list)
+            if i == len(word):
+                return (0, 0, 0.0, [])
+            best_tuple = (10**9, 10**9, 0.0, [self.unk_token])  # big sentinel
         # try all prefixes starting at i
+            for j in range(i + 1, min(len(word), i + max_morph_len) + 1):
+                piece = word[i:j]
+                is_known = piece in self.morpheme_vocab
             # cost for this piece
+                piece_unk = 0 if is_known else 1
             # recurse for the remainder
+                tail = best(j)
+                unk_count = piece_unk + tail[0]
+                pieces_count = 1 + tail[1]
             # score tiebreak: prefer longer known pieces
+                known_len = len(piece) if is_known else 0
             # for averaging, combine with tail's average (stored as negative)
             # to keep scoring monotonic, we’ll compute a simple total-known-len
+                total_known_len = known_len + (-tail[2]) * max(1, tail[1])  # invert back
             # pack a comparable tuple:
             # 1) fewer UNKs, 2) fewer pieces, 3) longer total known length
+                candidate = (unk_count, pieces_count, - (total_known_len / pieces_count), [piece] + tail[3])
+                if candidate < best_tuple:
+                    best_tuple = candidate
+            return best_tuple
+        return best(0)[3]
     def tokenize(self, text: str):