13Aluminium committed on
Commit a7f7a5d · verified · 1 Parent(s): b8a2acb

Update app.py

Files changed (1)
  1. app.py +121 -123
app.py CHANGED
@@ -1,123 +1,121 @@
-import gradio as gr
-import json, pickle, re
-from collections import Counter
-from huggingface_hub import hf_hub_download
-import unicodedata
-
-REPO_ID = "13Aluminium/sanskrit-morpheme-tokenizer"  # change this
-
-class SanskritMorphemeTokenizer:
-    def __init__(self):
-        self.token_to_id = {}
-        self.id_to_token = {}
-        self.morpheme_vocab = set()
-        self.morpheme_freq = Counter()
-        self.unk_token = "[UNK]"
-
-
-    def clean_token(self, token: str):
-        token = unicodedata.normalize("NFC", token)
-        token = re.sub(r'[*।॥०-९\d]+', '', token)
-        token = token.strip()
-        return token if token else None
-
-
-    def _segment_word(self, word: str):
-        """
-        DP-based segmenter:
-        - Minimizes #UNK pieces
-        - Then minimizes total pieces
-        - Then prefers longer known morphemes
-        """
-        if not word:
-            return [self.unk_token]
-
-        from functools import lru_cache
-
-        # optional: cap how far we look ahead; adjust if your morphemes are very long
-        max_morph_len = min(30, len(word))
-
-        @lru_cache(None)
-        def best(i: int):
-            # returns (unk_count, pieces_count, -avg_known_len, pieces_list)
-            if i == len(word):
-                return (0, 0, 0.0, [])
-
-            best_tuple = (10**9, 10**9, 0.0, [self.unk_token])  # big sentinel
-
-            # try all prefixes starting at i
-            for j in range(i + 1, min(len(word), i + max_morph_len) + 1):
-                piece = word[i:j]
-                is_known = piece in self.morpheme_vocab
-
-                # cost for this piece
-                piece_unk = 0 if is_known else 1
-
-                # recurse for the remainder
-                tail = best(j)
-                unk_count = piece_unk + tail[0]
-                pieces_count = 1 + tail[1]
-
-                # score tiebreak: prefer longer known pieces
-                known_len = len(piece) if is_known else 0
-                # for averaging, combine with tail's average (stored as negative)
-                # to keep scoring monotonic, we’ll compute a simple total-known-len
-                total_known_len = known_len + (-tail[2]) * max(1, tail[1])  # invert back
-
-                # pack a comparable tuple:
-                # 1) fewer UNKs, 2) fewer pieces, 3) longer total known length
-                candidate = (unk_count, pieces_count, -(total_known_len / pieces_count), [piece] + tail[3])
-
-                if candidate < best_tuple:
-                    best_tuple = candidate
-
-            return best_tuple
-
-        return best(0)[3]
-
-
-    def tokenize(self, text: str):
-        tokens = []
-        for w in text.split():
-            cw = self.clean_token(w)
-            if not cw: continue
-            if cw in self.morpheme_vocab:
-                tokens.append(cw)
-            else:
-                tokens.extend(self._segment_word(cw))
-        return tokens
-
-    def encode(self, text: str):
-        return [self.token_to_id.get(t, self.token_to_id.get(self.unk_token)) for t in self.tokenize(text)]
-
-    def decode(self, ids):
-        return " ".join(self.id_to_token.get(i, self.unk_token) for i in ids)
-
-    def load_from_hub(self, repo_id):
-        vocab_fp = hf_hub_download(repo_id, "vocab.json")
-        freq_fp = hf_hub_download(repo_id, "morpheme_freq.pkl")
-        cfg_fp = hf_hub_download(repo_id, "config.json")
-
-        with open(vocab_fp, "r", encoding="utf-8") as f:
-            self.token_to_id = json.load(f)
-        self.id_to_token = {int(i): tok for tok, i in self.token_to_id.items()}
-        self.morpheme_vocab = set(self.token_to_id.keys())
-        with open(freq_fp, "rb") as f:
-            self.morpheme_freq = Counter(pickle.load(f))
-
-tokenizer = SanskritMorphemeTokenizer()
-tokenizer.load_from_hub(REPO_ID)
-
-def run(text):
-    tokens = tokenizer.tokenize(text)
-    ids = tokenizer.encode(text)
-    decoded = tokenizer.decode(ids)
-    return tokens, ids, decoded
-
-demo = gr.Interface(
-    fn=run,
-    inputs=gr.Textbox(label="Input Sanskrit Text", lines=2),
-    outputs=[gr.JSON(label="Tokens"), gr.JSON(label="Token IDs"), gr.Textbox(label="Decoded")]
-)
-
-demo.launch()
+import gradio as gr
+import json, pickle, re
+from collections import Counter
+from huggingface_hub import hf_hub_download
+import unicodedata
+
+REPO_ID = "13Aluminium/sanskrit-morpheme-tokenizer"  # change this
+
+class SanskritMorphemeTokenizer:
+    def __init__(self):
+        self.token_to_id = {}
+        self.id_to_token = {}
+        self.morpheme_vocab = set()
+        self.morpheme_freq = Counter()
+        self.unk_token = "[UNK]"
+
+    def clean_token(self, token: str):
+        token = unicodedata.normalize("NFC", token)
+        token = re.sub(r'[*।॥०-९\d]+', '', token)
+        token = token.strip()
+        return token if token else None
+
+    def _segment_word(self, word: str):
+        """
+        DP-based segmenter:
+        - Minimizes #UNK pieces
+        - Then minimizes total pieces
+        - Then prefers longer known morphemes
+        """
+        if not word:
+            return [self.unk_token]
+
+        from functools import lru_cache
+
+        # optional: cap how far we look ahead; adjust if your morphemes are very long
+        max_morph_len = min(30, len(word))
+
+        @lru_cache(None)
+        def best(i: int):
+            # returns (unk_count, pieces_count, -avg_known_len, pieces_list)
+            if i == len(word):
+                return (0, 0, 0.0, [])
+
+            best_tuple = (10**9, 10**9, 0.0, [self.unk_token])  # big sentinel
+
+            # try all prefixes starting at i
+            for j in range(i + 1, min(len(word), i + max_morph_len) + 1):
+                piece = word[i:j]
+                is_known = piece in self.morpheme_vocab
+
+                # cost for this piece
+                piece_unk = 0 if is_known else 1
+
+                # recurse for the remainder
+                tail = best(j)
+                unk_count = piece_unk + tail[0]
+                pieces_count = 1 + tail[1]
+
+                # score tiebreak: prefer longer known pieces
+                known_len = len(piece) if is_known else 0
+                # for averaging, combine with tail's average (stored as negative)
+                # to keep scoring monotonic, we’ll compute a simple total-known-len
+                total_known_len = known_len + (-tail[2]) * max(1, tail[1])  # invert back
+
+                # pack a comparable tuple:
+                # 1) fewer UNKs, 2) fewer pieces, 3) longer total known length
+                candidate = (unk_count, pieces_count, -(total_known_len / pieces_count), [piece] + tail[3])
+
+                if candidate < best_tuple:
+                    best_tuple = candidate
+
+            return best_tuple
+
+        return best(0)[3]
+
+    def tokenize(self, text: str):
+        tokens = []
+        for w in text.split():
+            cw = self.clean_token(w)
+            if not cw:
+                continue
+            if cw in self.morpheme_vocab:
+                tokens.append(cw)
+            else:
+                tokens.extend(self._segment_word(cw))
+        return tokens
+
+    def encode(self, text: str):
+        return [self.token_to_id.get(t, self.token_to_id.get(self.unk_token)) for t in self.tokenize(text)]
+
+    def decode(self, ids):
+        return " ".join(self.id_to_token.get(i, self.unk_token) for i in ids)
+
+    def load_from_hub(self, repo_id):
+        vocab_fp = hf_hub_download(repo_id, "vocab.json")
+        freq_fp = hf_hub_download(repo_id, "morpheme_freq.pkl")
+        cfg_fp = hf_hub_download(repo_id, "config.json")
+
+        with open(vocab_fp, "r", encoding="utf-8") as f:
+            self.token_to_id = json.load(f)
+        self.id_to_token = {int(i): tok for tok, i in self.token_to_id.items()}
+        self.morpheme_vocab = set(self.token_to_id.keys())
+        with open(freq_fp, "rb") as f:
+            self.morpheme_freq = Counter(pickle.load(f))
+
+tokenizer = SanskritMorphemeTokenizer()
+tokenizer.load_from_hub(REPO_ID)
+
+def run(text):
+    tokens = tokenizer.tokenize(text)
+    ids = tokenizer.encode(text)
+    decoded = tokenizer.decode(ids)
+    return tokens, ids, decoded
+
+demo = gr.Interface(
+    fn=run,
+    inputs=gr.Textbox(label="Input Sanskrit Text", lines=2),
+    outputs=[gr.JSON(label="Tokens"), gr.JSON(label="Token IDs"), gr.Textbox(label="Decoded")]
+)
+
+demo.launch()
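
For quick testing outside the Gradio UI, a minimal sketch along these lines should work with the class defined above; it assumes vocab.json and morpheme_freq.pkl are present in the Hub repo referenced by REPO_ID, and the sample sentence is only illustrative:

# minimal usage sketch (assumes the definitions from app.py above are in scope)
tok = SanskritMorphemeTokenizer()
tok.load_from_hub(REPO_ID)

sample = "रामः वनं गच्छति"  # illustrative input, not taken from the repo
tokens = tok.tokenize(sample)   # morpheme pieces, with [UNK] for unmatched spans
ids = tok.encode(sample)        # vocabulary ids for those pieces
print(tokens)
print(ids)
print(tok.decode(ids))          # space-joined round trip of the pieces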