Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import PreTrainedTokenizerFast | |
# Load the tokenizer once at import time; infer_shloka reads this module-level
# instance on every call instead of reloading it.
tok = PreTrainedTokenizerFast.from_pretrained(
    "snskrt/Sanskrit_Tokenizer",
)
def _merge_subword_tokens(tokens):
    """Join subword tokens into a single space-separated string.

    A token that starts with "##" is treated as a continuation piece: the
    two-character marker is stripped and the remainder is glued onto the
    previous word. All other tokens start a new word.
    """
    words = []
    for piece in tokens:
        if piece.startswith("##") and words:
            words[-1] += piece[2:]
        else:
            # Also reached by a "##" piece that has no word before it: keep it
            # verbatim rather than indexing into an empty list (IndexError).
            words.append(piece)
    return " ".join(words)


def infer_shloka(text: str) -> str:
    """Tokenize *text* and describe the result as a Markdown string.

    The text is encoded with the module-level ``tok`` without special tokens,
    the ids are mapped back to their token strings, and the tokens are merged
    into a space-separated "decoded" string.

    Args:
        text: The input text to tokenize.

    Returns:
        A Markdown-formatted string with three sections: the input ids, the
        token strings, and the merged (detokenized) text.
    """
    # Encode
    enc = tok(text, add_special_tokens=False)
    ids = enc["input_ids"]
    toks = tok.convert_ids_to_tokens(ids)

    # Manual merge: strip "##" continuation markers and re-join
    dec = _merge_subword_tokens(toks)

    # Format output
    return (
        f"**Input IDs:**\n{ids}\n\n"
        f"**Tokens:**\n{toks}\n\n"
        f"**Decoded:**\n{dec}\n"
    )
# Sample ślokas offered as ready-made inputs; each one is wrapped in its own
# single-element list.
examples = [
    [shloka]
    for shloka in (
        "ॐ सर्वे भवन्तु सुखिनः ॥",
        "धर्मो रक्षति रक्षितः ॥",
        "यथा दीपः निवातस्थः प्रवर्तमानः ॥",
    )
]
# Input: a two-line text box. Output: a Markdown pane, matching the Markdown
# string that infer_shloka returns.
shloka_input = gr.Textbox(lines=2, placeholder="Enter a Sanskrit śloka here…")
markdown_output = gr.Markdown()

# NOTE(review): newer Gradio releases appear to rename `allow_flagging` to
# `flagging_mode` — confirm against the Gradio version this app is pinned to.
iface = gr.Interface(
    fn=infer_shloka,
    inputs=shloka_input,
    outputs=markdown_output,
    examples=examples,
    title="Sanskrit-BPE Tokenizer Demo",
    description="Encode a Devanāgarī śloka with your custom BPE tokenizer, view IDs, subtokens, and detokenized output.",
    allow_flagging="never",
)

# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()