Spaces:
Running
Running
| import streamlit as st | |
| import awesome_streamlit as ast | |
| from .preprocess import ( | |
| ArabertPreprocessor, | |
| white_spaced_back_quotation_regex, | |
| white_spaced_double_quotation_regex, | |
| white_spaced_em_dash, | |
| white_spaced_single_quotation_regex, | |
| left_and_right_spaced_chars, | |
| left_spaced_chars, | |
| right_spaced_chars, | |
| ) | |
| import re | |
# Choices offered by the sidebar model selector in write(). The literal string
# "None" is a real option (not the None object): selecting it makes write()
# show the individual pre-processing checkboxes instead of relying on a model name.
MODELS_to_SELECT = [
    "None",
    "bert-base-arabertv01",
    "bert-base-arabert",
    "bert-base-arabertv02",
    "bert-base-arabertv2",
    "bert-large-arabertv02",
    "bert-large-arabertv2",
    "araelectra-base",
    "araelectra-base-discriminator",
    "araelectra-base-generator",
    "araelectra-base-artydiqa",
    "aragpt2-base",
    "aragpt2-medium",
    "aragpt2-large",
    "aragpt2-mega",
]
def unpreprocess(text: str) -> str:
    """Re-format the text to a classic format where punctuations, brackets and
    parentheses are not separated by whitespaces.

    The objective is to make the generated text of any model appear natural
    and not preprocessed. Farasa segmentation markers are always removed first
    (via :func:`desegment`), then the spacing introduced around quotation
    marks, em dashes, dots and the characters matched by the imported
    ``*_spaced_chars`` patterns is undone.

    Args:
        text (:obj:`str`): input text to be un-preprocessed

    Returns:
        str: The unpreprocessed and Farasa-desegmented text.
    """
    text = desegment(text)

    # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple
    # https://stackoverflow.com/a/53436792/5381220
    text = re.sub(white_spaced_double_quotation_regex, r'"\1"', text)
    text = re.sub(white_spaced_single_quotation_regex, r"'\1'", text)
    # The delimiter must NOT be backslash-escaped in the replacement template:
    # `re` leaves unknown non-letter escapes untouched, so the backslash itself
    # would be written into the output text.
    text = re.sub(white_spaced_back_quotation_regex, r"`\1`", text)
    # Em dashes have their own pattern; re-using the back-quote pattern here
    # would never match anything after the substitution above.
    # NOTE(review): assumes white_spaced_em_dash captures the inner text as
    # group 1, like the three quotation patterns -- confirm in .preprocess.
    text = re.sub(white_spaced_em_dash, r"—\1—", text)

    # during generation, sometimes the models don't put a space after the dot, this handles it
    text = text.replace(".", " . ")
    # collapse every run of whitespace into a single space
    text = " ".join(text.split())

    # handle decimals: undo the spacing added above inside numbers (1 . 5 --> 1.5)
    text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
    text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

    # re-attach the characters that preprocessing padded with spaces
    text = re.sub(left_and_right_spaced_chars, r"\1", text)
    text = re.sub(left_spaced_chars, r"\1", text)
    text = re.sub(right_spaced_chars, r"\1", text)

    return text
def desegment(text: str) -> str:
    """Undo Farasa segmentation on a whole sentence.

    Use this function if sentence tokenization was done using
    `from arabert.preprocess_arabert import preprocess` with Farasa enabled.
    AraBERT segmentation using Farasa adds a space after the '+' for prefixes,
    and before the '+' for suffixes. Those spaces are removed first so every
    segmented word becomes a single space-free token, then each token is
    de-segmented on its own.

    Example:
        >>> desegment('ال+ دراس +ات')
        'الدراسات'
    """
    # Glue prefixes ("+ ") and suffixes (" +") back onto their stem.
    glued = text.replace("+ ", "+").replace(" +", "+")
    # Split on single spaces only, so the original spacing is kept on re-join.
    return " ".join(map(_desegmentword, glued.split(" ")))
| def _desegmentword(orig_word: str) -> str: | |
| """ | |
| Word segmentor that takes a Farasa Segmented Word and removes the '+' signs | |
| Example: | |
| >>> _desegmentword("ال+يومي+ة") | |
| اليومية | |
| """ | |
| word = orig_word.replace("ل+ال+", "لل") | |
| if "ال+ال" not in orig_word: | |
| word = word.replace("ل+ال", "لل") | |
| word = word.replace("+", "") | |
| word = word.replace("للل", "لل") | |
| return word | |
def write():
    """Render the "Arabic Text Pre-Processor" Streamlit page.

    The page has two parts:

    * a text input plus a "Run Pre-Processor" button that builds an
      ``ArabertPreprocessor`` (from the sidebar model choice, or from the
      individual sidebar checkboxes when the model choice is "None") and
      shows the result of its ``_preprocess_v3`` method;
    * a second text input plus a "Run Un-Pre-Processor" button that shows the
      result of ``unpreprocess`` on that input.
    """
    # Page title, followed by a CSS override that right-aligns the widgets.
    st.markdown(
        """
        <h1 style="text-align:left;">Arabic Text Pre-Processor</h1>
        """,
        unsafe_allow_html=True,
    )
    st.markdown(
        """
        <style>
        p, div, input, label {
        text-align: right;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    input_text = st.text_input(
        "Text to Pre-Process",
        value="ولن نبالغ إذا قلنا: إن 'هاتف' أو 'كمبيوتر المكتب' في زمننا هذا ضروري",
    )

    sidebar = st.sidebar
    sidebar.title("Model Selector")
    model_selector = sidebar.selectbox(
        "Select None to enable further filters", options=MODELS_to_SELECT, index=3
    )

    # The fine-grained switches are only offered (and only defined) when no
    # model is selected; they are read again only under the same condition.
    manual_mode = model_selector == "None"
    if manual_mode:
        keep_emojis = sidebar.checkbox("Keep emojis", False)
        remove_html_markup = sidebar.checkbox("Remove html markup", True)
        strip_tashkeel = sidebar.checkbox("Strip tashkeel", True)
        replace_urls_emails_mentions = sidebar.checkbox(
            "Replace urls and emails", True
        )
        strip_tatweel = sidebar.checkbox("Strip tatweel", True)
        insert_white_spaces = sidebar.checkbox("Insert white spaces", True)
        remove_non_digit_repetition = sidebar.checkbox(
            "Remove non-digit repetition", True
        )
        replace_slash_with_dash = sidebar.checkbox("Replace slash with dash", None)
        map_hindi_numbers_to_arabic = sidebar.checkbox(
            "Map hindi numbers to arabic", None
        )
        apply_farasa_segmentation = sidebar.checkbox(
            "Apply farasa segmentation", None
        )

    # prep_text stays None unless the first button was pressed on this run.
    prep_text = None
    if st.button("Run Pre-Processor"):
        if manual_mode:
            # Positional order differs from the checkbox order above
            # (urls/emails comes before tashkeel) -- keep it as is.
            arabert_preprocessor = ArabertPreprocessor(
                model_selector,
                keep_emojis,
                remove_html_markup,
                replace_urls_emails_mentions,
                strip_tashkeel,
                strip_tatweel,
                insert_white_spaces,
                remove_non_digit_repetition,
                replace_slash_with_dash,
                map_hindi_numbers_to_arabic,
                apply_farasa_segmentation,
            )
        else:
            arabert_preprocessor = ArabertPreprocessor(model_name=model_selector)
        prep_text = arabert_preprocessor._preprocess_v3(input_text)
        st.write(prep_text)

    st.write("-----")

    # Pre-fill the second box with the fresh result when there is one,
    # otherwise with a canned, already-segmented example sentence.
    fallback_example = "و+ لن نبالغ إذا قل +نا : إن ' هاتف ' أو ' كمبيوتر ال+ مكتب ' في زمن +نا هذا ضروري"
    input_text_unprep = st.text_input(
        "Text to Undo the Pre-Processing",
        value=prep_text or fallback_example,
    )

    if st.button("Run Un-Pre-Processor"):
        st.write(unpreprocess(input_text_unprep))