Spaces:

13Aluminium
/

token

Sleeping

13Aluminium committed on Sep 4

Commit

5439334

verified ·

1 Parent(s): 2b6a295

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -29,6 +29,16 @@ class SanskritMorphemeTokenizer:
         """
         if not word:
             return [self.unk_token]
         from functools import lru_cache

         """
         if not word:
             return [self.unk_token]
+        if word in self.morpheme_vocab:
+            return [word]
+        for i in range(len(word), 0, -1):
+            prefix = word[:i]
+            if prefix in self.morpheme_vocab:
+                remaining = word[i:]
+                return [prefix] + (self._segment_word(remaining) if remaining else [])
+        return [self.unk_token]
         from functools import lru_cache