13Aluminium committed on
Commit
43c3670
·
verified ·
1 Parent(s): 66e8adb

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +47 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import PreTrainedTokenizerFast
3
+
4
# Load the published Sanskrit tokenizer once at import time; infer_shloka
# reuses this module-level instance on every call.
tok = PreTrainedTokenizerFast.from_pretrained(
    "snskrt/Sanskrit_Tokenizer"
)
6
+
7
def infer_shloka(text: str) -> str:
    """Tokenize *text* and return a Markdown report of the result.

    The report shows the input IDs, the tokens they map to, and a "decoded"
    string rebuilt by gluing every token that starts with ``##`` onto the
    piece before it and joining the resulting pieces with single spaces.

    Args:
        text: The string to encode with the module-level ``tok`` tokenizer.

    Returns:
        A Markdown-formatted string with three sections: input IDs, tokens,
        and the re-joined text.
    """
    # Encode without adding special tokens.
    ids = tok(text, add_special_tokens=False)["input_ids"]
    toks = tok.convert_ids_to_tokens(ids)

    # Manual merge: strip the "##" continuation prefix and re-join.
    words: list[str] = []
    for piece in toks:
        if not piece.startswith("##"):
            words.append(piece)
        elif words:
            words[-1] += piece[2:]
        else:
            # A "##" piece with nothing before it has no word to attach to;
            # indexing words[-1] here would raise IndexError, so the stripped
            # piece starts a new word instead.
            words.append(piece[2:])
    dec = " ".join(words)

    # Format output
    return (
        f"**Input IDs:**\n{ids}\n\n"
        f"**Tokens:**\n{toks}\n\n"
        f"**Decoded:**\n{dec}\n"
    )
28
+
29
# Three sample ślokas offered as clickable examples. Each row is a one-item
# list because the interface takes a single text input.
examples = [
    [verse]
    for verse in (
        "ॐ सर्वे भवन्तु सुखिनः ॥",
        "धर्मो रक्षति रक्षितः ॥",
        "यथा दीपः निवातस्थः प्रवर्तमानः ॥",
    )
]
35
+
36
# Single-input demo UI: a text box feeds infer_shloka and the returned
# Markdown report is rendered as the output.
# NOTE(review): newer Gradio releases appear to rename `allow_flagging` to
# `flagging_mode`; confirm against the Gradio version this app runs on.
iface = gr.Interface(
    title="Sanskrit-BPE Tokenizer Demo",
    description=(
        "Encode a Devanāgarī śloka with your custom BPE tokenizer, "
        "view IDs, subtokens, and detokenized output."
    ),
    fn=infer_shloka,
    inputs=gr.Textbox(
        lines=2,
        placeholder="Enter a Sanskrit śloka here…",
    ),
    outputs=gr.Markdown(),
    examples=examples,
    allow_flagging="never",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ huggingface_hub==0.25.2
2
+ gradio
3
+ transformers