import json
import pickle
import re
import unicodedata
from collections import Counter
from functools import lru_cache

import gradio as gr
from huggingface_hub import hf_hub_download

REPO_ID = "snskrt/sanskrit-morpheme-tokenizer"  # change this if you host the files under a different repo id

class SanskritMorphemeTokenizer:
    def __init__(self):
        self.token_to_id = {}
        self.id_to_token = {}
        self.morpheme_vocab = set()
        self.morpheme_freq = Counter()
        self.unk_token = "[UNK]"

    def clean_token(self, token: str):
        # Normalize to NFC and strip asterisks, danda/double danda, and
        # Devanagari or ASCII digits before any vocabulary lookup.
        token = unicodedata.normalize("NFC", token)
        token = re.sub(r'[*।॥०-९\d]+', '', token)
        token = token.strip()
        return token if token else None
    def _segment_word(self, word: str):
        """
        DP-based segmenter:
          - Minimizes #UNK pieces
          - Then minimizes total pieces
          - Then prefers longer known morphemes
        """
        if not word:
            return [self.unk_token]
        if word in self.morpheme_vocab:
            return [word]
        # optional: cap how far we look ahead; adjust if your morphemes are very long
        max_morph_len = min(30, len(word))

        @lru_cache(None)
        def best(i: int):
            # returns (unk_count, pieces_count, -avg_known_len, pieces_list)
            if i == len(word):
                return (0, 0, 0.0, [])
            best_tuple = (10**9, 10**9, 0.0, [self.unk_token])  # big sentinel
            # try all prefixes starting at i
            for j in range(i + 1, min(len(word), i + max_morph_len) + 1):
                piece = word[i:j]
                is_known = piece in self.morpheme_vocab
                # cost for this piece
                piece_unk = 0 if is_known else 1
                # recurse for the remainder
                tail = best(j)
                unk_count = piece_unk + tail[0]
                pieces_count = 1 + tail[1]
                # score tiebreak: prefer longer known pieces
                known_len = len(piece) if is_known else 0
                # for averaging, combine with tail's average (stored as negative);
                # to keep scoring monotonic, we compute a simple total-known-len
                total_known_len = known_len + (-tail[2]) * max(1, tail[1])  # invert back
                # pack a comparable tuple:
                # 1) fewer UNKs, 2) fewer pieces, 3) longer total known length
                candidate = (unk_count, pieces_count, -(total_known_len / pieces_count), [piece] + tail[3])
                if candidate < best_tuple:
                    best_tuple = candidate
            return best_tuple

        return best(0)[3]
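
    # Worked illustration (hypothetical vocabulary, not the published one): with
    # morpheme_vocab = {"rama", "sya"}, _segment_word("ramasya") compares, e.g.,
    #   ["rama", "sya"]  -> (0 UNKs, 2 pieces, avg known length 3.5)
    #   ["ramas", "ya"]  -> (2 UNKs, 2 pieces, avg known length 0.0)
    # and returns ["rama", "sya"], since fewer UNK pieces wins before any tiebreak.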

    def tokenize(self, text: str):
        tokens = []
        for w in text.split():
            cw = self.clean_token(w)
            if not cw:
                continue
            if cw in self.morpheme_vocab:
                tokens.append(cw)
            else:
                tokens.extend(self._segment_word(cw))
        return tokens

    def encode(self, text: str):
        return [self.token_to_id.get(t, self.token_to_id.get(self.unk_token)) for t in self.tokenize(text)]

    def decode(self, ids):
        return " ".join(self.id_to_token.get(i, self.unk_token) for i in ids)

    def load_from_hub(self, repo_id):
        vocab_fp = hf_hub_download(repo_id, "vocab.json")
        freq_fp = hf_hub_download(repo_id, "morpheme_freq.pkl")
        cfg_fp = hf_hub_download(repo_id, "config.json")
        with open(vocab_fp, "r", encoding="utf-8") as f:
            self.token_to_id = json.load(f)
        self.id_to_token = {int(i): tok for tok, i in self.token_to_id.items()}
        self.morpheme_vocab = set(self.token_to_id.keys())
        with open(freq_fp, "rb") as f:
            self.morpheme_freq = Counter(pickle.load(f))
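
# Files expected in REPO_ID (inferred from load_from_hub above): vocab.json maps
# token -> id, morpheme_freq.pkl holds a pickled frequency mapping, and
# config.json is downloaded alongside them but not read by this demo.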

tokenizer = SanskritMorphemeTokenizer()
tokenizer.load_from_hub(REPO_ID)


def run(text):
    tokens = tokenizer.tokenize(text)
    ids = tokenizer.encode(text)
    decoded = tokenizer.decode(ids)
    return tokens, ids, decoded
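
# Quick sanity check (commented out; the actual pieces depend on the vocabulary
# published in REPO_ID):
# print(run("रामः वनं गच्छति"))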

demo = gr.Interface(
    fn=run,
    inputs=gr.Textbox(label="Input Sanskrit Text", lines=2),
    outputs=[gr.JSON(label="Tokens"), gr.JSON(label="Token IDs"), gr.Textbox(label="Decoded")],
)

demo.launch()