diff --git a/.gitattributes b/.gitattributes index 957b2579c6ef20995a09efd9a17f8fd90606f5ed..ed96f8a80fb2d088cbd2247ab146383664f57c28 100644 --- a/.gitattributes +++ b/.gitattributes @@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zstandard filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +*.json filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e2915adf89870c95283705ac56f9e3a3fe96578d --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*cpython-39.pyc +.DS_Store diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..9843adf90a7825684ca9ab05b4baea8823573310 --- /dev/null +++ b/LICENSE @@ -0,0 +1,204 @@ +------------- LICENSE FOR Bigscience code -------------- + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [2021] [Bigscience] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md index 5862195ec9bbb4043fc6ec1626402ef2c696efa7..afc159b8e250a60e5faa9000a337b110235aa65d 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ --- -title: Text Data Filtering 2 -emoji: 📈 -colorFrom: green -colorTo: yellow +title: Text Data Filtering +emoji: 👁 +colorFrom: blue +colorTo: pink sdk: streamlit app_file: app.py pinned: false @@ -10,36 +10,28 @@ pinned: false # Configuration -`title`: _string_ +`title`: _string_ Display title for the Space -`emoji`: _string_ +`emoji`: _string_ Space emoji (emoji-only character allowed) -`colorFrom`: _string_ +`colorFrom`: _string_ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray) -`colorTo`: _string_ +`colorTo`: _string_ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray) -`sdk`: _string_ -Can be either `gradio`, `streamlit`, or `static` +`sdk`: _string_ +Can be either `gradio` or `streamlit` -`sdk_version` : _string_ +`sdk_version` : _string_ Only applicable for `streamlit` SDK. See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions. -`app_file`: _string_ -Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code). +`app_file`: _string_ +Path to your main application file (which contains either `gradio` or `streamlit` Python code). 
Path is relative to the root of the repository. -`models`: _List[string]_ -HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space. -Will be parsed automatically from your code if not specified here. - -`datasets`: _List[string]_ -HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space. -Will be parsed automatically from your code if not specified here. - -`pinned`: _boolean_ +`pinned`: _boolean_ Whether the Space stays on top of your list. diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..98597a78499ff628a96217d0cbea07aefde4df0b --- /dev/null +++ b/app.py @@ -0,0 +1,916 @@ +# Run with: streamlit run visualization.py + +import streamlit as st + +import os + +from io import StringIO +import base64 +import json +import pandas as pd + +pd.options.mode.chained_assignment = None + +import numpy as np + +import matplotlib.pyplot as plt + +from filtering import LoadParameters, ModifyingDocuments, Filtering +from languages_id import langs_id + + +class Visualization_for_lang: + def __init__( + self, + path_data, + lang, + num_docs, + num_docs_for_words, + max_len_text_display, + lang_dataset_id, + path_fasttext_model, + path_sentencepiece_model, + path_kenlm_model, + ): + self.path_data = path_data + self.lang = lang + self.num_docs = num_docs + self.num_docs_for_words = num_docs_for_words + self.max_len_text_display = max_len_text_display + + self.lang_dataset_id = lang_dataset_id + self.param = LoadParameters.load_parameters(lang_dataset_id) + self.stopwords = LoadParameters.load_stopwords(lang_dataset_id) + self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id) + self.model_lang_id = LoadParameters.load_model_lang_id( + lang_dataset_id, path_fasttext_model + ) + self.sentencepiece_model = LoadParameters.load_sentencepiece_model( + lang_dataset_id, path_sentencepiece_model + ) + self.sentencepiece_model_tok = ( + self.sentencepiece_model if 
self.param["tokenization"] else None + ) + self.kenlm_model = LoadParameters.load_kenlm_model( + lang_dataset_id, path_kenlm_model + ) + + def set_title(self): + st.title(f"Filtering visualization for {self.lang}") + + def open_data(self): + with open(self.path_data) as json_file: + data = json.load(json_file) + + self.num_docs = min(self.num_docs, len(data)) + self.num_docs_for_words = min(self.num_docs_for_words, len(data)) + + if "words" in data[0]: + words = [doc["words"] for doc in data[: self.num_docs_for_words]] + words = [word for doc in words for word in doc] + self.words = pd.DataFrame(words) + else: + self.words = None + + docs = data[: self.num_docs] + for doc in docs: + if not (self.words is None): + del doc["words"] + if len(doc["text"]) > self.max_len_text_display: + doc["text"] = ( + doc["text"][: self.max_len_text_display] + + " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]" + ) + self.docs_checkpoint = pd.DataFrame(docs) + self.docs = self.docs_checkpoint + + @staticmethod + def print_discarded_by_cond(cond): + st.caption( + f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter." 
+ ) + + @staticmethod + def plot_hist(dataframe, key, num_bins=50): + checkbox = st.checkbox( + "Diplay distribution", value=True, key=f"display_distribution_{key[0]}" + ) + if checkbox: + fig, ax = plt.subplots() + val = dataframe[key[0]].values + if np.median(val) != 0: + val = val[ + abs(val - np.median(val)) + < 9 * np.median(np.absolute(val - np.median(val))) + ] + ax.hist(val, bins=num_bins, density=True) + ax.set_title(" ".join(key[0].split("_"))) + ax.axvline(x=key[1], color="r", linestyle="dashed") + st.pyplot(fig) + + @staticmethod + def display_dataset(dataframe, cond, description, type_of_examples): + displayed_examples = dataframe.loc[cond] + st.subheader( + f"{description}: {len(displayed_examples)} {type_of_examples} ({len(displayed_examples) / len(dataframe.index) * 100:.2f}%)" + ) + st.markdown( + "Click on a column to sort by it, place the cursor on the text to display it." + ) + st.dataframe(displayed_examples) + + def filtering_of_docs(self): + def set_sliders(): + columns = list(self.docs) + keys = [] + conds = {} + + def get_cond(key, cutoff, max_cutoff): + if max_cutoff: + return self.docs[key] <= cutoff + return self.docs[key] >= cutoff + + if "number_words" in columns: + with st.sidebar.expander("Number of words"): + cutoff_def = "If the number of words of a document is lower than this number, the document is removed." + max_nb_words = int(np.max(self.docs["number_words"])) + 1 + cutoff_min_number_words = st.slider( + cutoff_def, 0, min(max_nb_words, 500), 0 + ) + new_key = ("number_words", cutoff_min_number_words, False) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond_1 = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond_1) + + cutoff_def = "If the number of words of a document is higher than this number, the document is removed." 
+ cutoff_max_number_words = st.slider( + cutoff_def, 0, max_nb_words, max_nb_words + ) + new_key = ("number_words", cutoff_max_number_words, True) + keys.append(new_key) + cond_2 = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond_2) + + conds["number_words"] = [cond_1, cond_2] + + if "character_repetition_ratio" in columns: + with st.sidebar.expander("Character repetition ratio"): + val_repetitions_lengths = list( + self.docs["character_repetition_ratio"].iloc[0].keys() + ) + default_index = ( + val_repetitions_lengths.index("10") + if "10" in val_repetitions_lengths + else 0 + ) + label_selectbox = "Length of repetitions in characters (that will influence the character repetition ratio)." + repetitions_length = st.selectbox( + label=label_selectbox, + options=val_repetitions_lengths, + index=default_index, + ) + st.caption( + "Choosing a higher or lower number does not mean that the filtering " + "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) " + "tends to associate a high character repetition ratio to very long documents (like book chapters), but with " + "few or no repetitions, simply because their length gives them more diversity, and we do " + "not want to discard such documents. It is generally better to increase this number, so that false " + "positives are very short documents (which we want to delete anyway) rather than long ones. However, " + "a low number can be useful for Chinese, where a character can designate a whole word." + ) + self.docs["character_repetition_ratio"] = self.docs_checkpoint[ + "character_repetition_ratio" + ] + for i in range(len(self.docs["character_repetition_ratio"])): + self.docs["character_repetition_ratio"].iloc[i] = self.docs[ + "character_repetition_ratio" + ].iloc[i][repetitions_length] + + cutoff_def = "If the character repetition ratio of a document is higher than this number, the document is removed." 
+ cutoff_character_repetition_ratio = st.slider( + cutoff_def, 0.0, 1.0, 1.0, step=0.01 + ) + new_key = ( + "character_repetition_ratio", + cutoff_character_repetition_ratio, + True, + repetitions_length, + ) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["character_repetition_ratio"] = [cond] + + if "word_repetition_ratio" in columns: + with st.sidebar.expander("Word repetition ratio"): + val_repetitions_lengths = list( + self.docs["word_repetition_ratio"].iloc[0].keys() + ) + default_index = ( + val_repetitions_lengths.index("5") + if "5" in val_repetitions_lengths + else 0 + ) + label_selectbox = "Length of repetitions in words (that will influence the word repetition ratio)." + repetitions_length = st.selectbox( + label=label_selectbox, + options=val_repetitions_lengths, + index=default_index, + ) + st.caption( + "Choosing a higher or lower number does not mean that the filtering " + "is stronger or weaker. Be careful, choosing a low number (like 3) could " + "tend to associate a high word repetition ratio to very long documents (like book chapters), but with " + "few or no repetitions, simply because their length gives them more diversity, and we do " + "not want to discard such documents. It is generally better to increase a bit this number, so that false " + "positives are very short documents (which we want to delete anyway) rather than long ones." + ) + self.docs["word_repetition_ratio"] = self.docs_checkpoint[ + "word_repetition_ratio" + ] + for i in range(len(self.docs["word_repetition_ratio"])): + self.docs["word_repetition_ratio"].iloc[i] = self.docs[ + "word_repetition_ratio" + ].iloc[i][repetitions_length] + + cutoff_def = "If the word repetition ratio of a document is higher than this number, the document is removed." 
+ cutoff_word_repetition_ratio = st.slider( + cutoff_def, 0.0, 1.0, 1.0, step=0.01 + ) + new_key = ( + "word_repetition_ratio", + cutoff_word_repetition_ratio, + True, + repetitions_length, + ) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["word_repetition_ratio"] = [cond] + + if "special_characters_ratio" in columns: + with st.sidebar.expander("Special characters ratio"): + cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed." + cutoff_special_characters_ratio = st.slider( + cutoff_def, 0.0, 1.0, 1.0, step=0.01 + ) + new_key = ( + "special_characters_ratio", + cutoff_special_characters_ratio, + True, + ) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["special_characters_ratio"] = [cond] + + if "stopwords_ratio" in columns: + with st.sidebar.expander("Stop words ratio"): + stopwords_file = st.file_uploader( + "Upload your own list of stop words (one per line). If there is none, the default one is used." 
+ ) + if stopwords_file: + new_stopwords = StringIO( + stopwords_file.getvalue().decode("utf-8") + ).read() + new_stopwords = set(new_stopwords.split("\n")) + self.docs["stopwords_ratio"] = self.docs_checkpoint[ + "stopwords_ratio" + ] + for i in range(len(self.docs["stopwords_ratio"])): + self.docs["stopwords_ratio"].iloc[ + i + ] = Filtering.compute_stopwords_ratio( + self.docs["text"].iloc[i], + self.sentencepiece_model_tok, + self.param["strip_characters"], + self.param["cond_words_augmentation"], + self.param["words_augmentation_group_sizes"], + self.param["words_augmentation_join_char"], + new_stopwords, + ) + cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed." + cutoff_stopwords_ratio = st.slider( + cutoff_def, 0.0, 1.0, 0.0, step=0.01 + ) + new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["stopwords_ratio"] = [cond] + + if "flagged_words_ratio" in columns: + with st.sidebar.expander("Flagged words ratio"): + flagged_words_file = st.file_uploader( + "Upload your own list of flagged words (one per line). If there is none, the default one is used." 
+ ) + if flagged_words_file: + new_flagged_words = StringIO( + flagged_words_file.getvalue().decode("utf-8") + ).read() + new_flagged_words = set(new_flagged_words.split("\n")) + self.docs["flagged_words_ratio"] = self.docs_checkpoint[ + "flagged_words_ratio" + ] + for i in range(len(self.docs["flagged_words_ratio"])): + self.docs["flagged_words_ratio"].iloc[ + i + ] = Filtering.compute_flagged_words_ratio( + self.docs["text"].iloc[i], + self.sentencepiece_model_tok, + self.param["strip_characters"], + self.param["cond_words_augmentation"], + self.param["words_augmentation_group_sizes"], + self.param["words_augmentation_join_char"], + new_flagged_words, + ) + cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed." + max_fwr = np.max(self.docs["flagged_words_ratio"]) + max_fwr = np.ceil(max_fwr * 1000) / 1000 + max_fwr = float(max_fwr) + cutoff_flagged_words_ratio = st.slider( + cutoff_def, + 0.000, + max_fwr, + max_fwr, + step=0.001, + format="%f", + ) + new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["flagged_words_ratio"] = [cond] + + if "lang_id_score" in columns: + with st.sidebar.expander("Language ID confidence score"): + cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed." 
+ cutoff_lang_id_score = st.slider( + cutoff_def, 0.0, 1.0, 0.0, step=0.01 + ) + new_key = ("lang_id_score", cutoff_lang_id_score, False) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["lang_id_score"] = [cond] + + if "perplexity_score" in columns: + with st.sidebar.expander("Perplexity score"): + cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed." + max_pp = int(np.max(self.docs["perplexity_score"])) + 1 + cutoff_perplexity_score = st.slider(cutoff_def, 0, max_pp, max_pp) + new_key = ("perplexity_score", cutoff_perplexity_score, True) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["perplexity_score"] = [cond] + + return keys, conds + + with st.expander( + f"Filtering on documents, for {self.num_docs} {self.lang} documents" + ): + st.header( + f"Filtering on documents, for {self.num_docs} {self.lang} documents" + ) + + if "labels" in list(self.docs): + chosen_label = st.selectbox( + label="Consider only documents that include the following label", + options=[ + "All", + "NA: Narrative", + "IN: Informational Description", + "OP: Opinion", + "ID: Interactive Discussion", + "HI: How-to/Instruction", + "IP: Informational Persuasion", + "LY: Lyrical", + "SP: Spoken", + ], + ) + chosen_label = chosen_label.split(":")[0] + if chosen_label != "All": + cond_label = list( + self.docs["labels"].apply( + lambda x: True if chosen_label in x else False + ) + ) + self.docs = self.docs[cond_label] + + if self.docs.empty: + st.markdown( + "No document to display, please try to select a different label." 
+ ) + self.keys = [] + self.parameters = [] + + else: + st.sidebar.subheader("Parameters of the filtering on documents") + self.keys, conds = set_sliders() + self.parameters = self.keys * 1 + + all_conds = [ + subcond for cond in list(conds.values()) for subcond in cond + ] + all_conds = np.all(all_conds, axis=0) + + Visualization_for_lang.display_dataset( + self.docs, np.invert(all_conds), "Discarded documents", "docs" + ) + + # st.subheader("Display discarded documents by filter") + display_discarded_documents_by_filter = st.checkbox( + "Display discarded documents by filter" + ) + + if display_discarded_documents_by_filter: + columns = list(self.docs) + + if "number_words" in columns: + cond_filter = np.invert(np.all(conds["number_words"], axis=0)) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the number of words", + "docs", + ) + + if "character_repetition_ratio" in columns: + cond_filter = np.invert( + np.all(conds["character_repetition_ratio"], axis=0) + ) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the character repetition ratio", + "docs", + ) + + if "word_repetition_ratio" in columns: + cond_filter = np.invert( + np.all(conds["word_repetition_ratio"], axis=0) + ) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the word repetition ratio", + "docs", + ) + + if "special_characters_ratio" in columns: + cond_filter = np.invert( + np.all(conds["special_characters_ratio"], axis=0) + ) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the special characters ratio", + "docs", + ) + + if "stopwords_ratio" in columns: + cond_filter = np.invert( + np.all(conds["stopwords_ratio"], axis=0) + ) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the stop words 
ratio", + "docs", + ) + + if "flagged_words_ratio" in columns: + cond_filter = np.invert( + np.all(conds["flagged_words_ratio"], axis=0) + ) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the flagged words ratio", + "docs", + ) + + if "lang_id_score" in columns: + cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0)) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the language identification confidence score", + "docs", + ) + + if "perplexity_score" in columns: + cond_filter = np.invert( + np.all(conds["perplexity_score"], axis=0) + ) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the perplexity score", + "docs", + ) + + Visualization_for_lang.display_dataset( + self.docs, all_conds, "Retained documents", "docs" + ) + + st.header("Download data") + + with open(self.path_data) as json_file: + btn = st.download_button( + label="Download data as json", + data=json_file, + file_name="data.json", + ) + + def filtering_of_words(self): + if not (self.words is None): + columns = list(self.words) + + st.sidebar.subheader("Parameter of the filtering on words") + + conds_words = {} + + if "len_word" in columns: + with st.sidebar.expander("Length of words"): + cutoff_def = "If the length of a word is higher than this number, the word is removed." 
+ max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200) + cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word) + new_key = ("len_word", cutoff_word, True) + self.parameters.append(new_key) + Visualization_for_lang.plot_hist(self.words, new_key) + cond_len_words = self.words["len_word"] <= cutoff_word + Visualization_for_lang.print_discarded_by_cond(cond_len_words) + conds_words["len_word"] = cond_len_words + + if "incorrect_substrings" in columns: + with st.sidebar.expander("Words with incorrect substrings"): + incorrect_substrings = st.checkbox( + "Remove words with incorrect substrings." + ) + self.parameters.append( + ("incorrect_substrings", incorrect_substrings) + ) + + checkbox = st.checkbox( + "Diplay distribution", + value=True, + key="display_distribution_incorrect_substrings", + ) + if checkbox: + incor_sub = np.array(self.words["incorrect_substrings"]) * 1 + with_incor_sub = np.sum(incor_sub) + without_incor_sub = len(incor_sub) - with_incor_sub + st.markdown( + f"Number of words with incorrect substrings: {with_incor_sub}" + ) + st.markdown( + f"Number of words without incorrect substrings: {without_incor_sub}" + ) + + if incorrect_substrings: + cond_incorrect_substrings = np.invert( + self.words["incorrect_substrings"] + ) + else: + cond_incorrect_substrings = np.array( + [ + True + for i in range(len(self.words["incorrect_substrings"])) + ] + ) + Visualization_for_lang.print_discarded_by_cond( + cond_incorrect_substrings + ) + conds_words["incorrect_substrings"] = cond_incorrect_substrings + + all_conds_words = np.all(list(conds_words.values()), axis=0) + + with st.expander( + f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents" + ): + st.header( + f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents" + ) + + st.markdown( + f"Since the number of words is way larger than the number of documents, " + f"we consider in this section words for only {self.num_docs_for_words} documents." 
+ ) + + Visualization_for_lang.display_dataset( + self.words, np.invert(all_conds_words), "Discarded words", "words" + ) + + # st.subheader("Display discarded words by filter") + display_discarded_words_by_filter = st.checkbox( + "Display discarded words by filter" + ) + + if display_discarded_words_by_filter: + + if "len_word" in columns: + cond_filter = np.invert(conds_words["len_word"]) + Visualization_for_lang.display_dataset( + self.words, + cond_filter, + "Discarded words for the filter on length", + "words", + ) + + if "incorrect_substrings" in columns: + cond_filter = np.invert(conds_words["incorrect_substrings"]) + Visualization_for_lang.display_dataset( + self.words, + cond_filter, + "Discarded words for the filter on incorrect substrings", + "words", + ) + + Visualization_for_lang.display_dataset( + self.words, all_conds_words, "Retained words", "words" + ) + + def download_parameters(self): + st.sidebar.subheader("Download parameters") + btn = st.sidebar.download_button( + label="Download current parameters as json", + data=json.dumps(self.parameters), + file_name=f"parameters_{self.lang_dataset_id}.json", + ) + + """ + def plot_zipf_law(self): + if not (self.words is None): + st.header("Zipf's Law") + + display_zipf_law = st.checkbox("Display Zipf's Law") + + if display_zipf_law: + + freq_words = {} + for _, row in self.words.iterrows(): + freq_words[row["word"]] = freq_words.get(row["word"], 0) + 1 + freq_words = np.array(list(freq_words.values())) + freq_words = -np.sort(-freq_words) + + fig, ax = plt.subplots() + ax.loglog(freq_words) + ax.set_title("Zipf's Law") + ax.set_xlabel("$i$-th most frequent word") + ax.set_ylabel("frequency in the documents") + st.pyplot(fig) + """ + + def analyse_personal_doc(self): + with st.expander("Analyse your own document"): + st.header("Analyse your own document") + + personal_doc = st.text_area( + label="Paste here the document you want to analyse", + value="", + max_chars=10000, + ) + + is_discarded = False + + 
def is_doc_discarded(key, score): + if key[2]: # max cutoff + return score > key[1] + else: + return score < key[1] + + if personal_doc: + + st.markdown("Statistics of the document:") + + for key in self.keys: + if key[0] == "number_words": + words = ModifyingDocuments.get_words_from_document( + personal_doc, + self.sentencepiece_model_tok, + lower_case=False, + strip_characters=self.param["strip_characters"], + ) + if key[2]: + st.markdown(f"Number of words: {len(words)}") + if is_doc_discarded(key, len(words)): + is_discarded = True + + elif key[0] == "character_repetition_ratio": + character_repetition_ratio = ( + Filtering.compute_character_repetition_ratio( + personal_doc, int(key[3]) + ) + ) + character_repetition_ratio = round( + character_repetition_ratio, 3 + ) + st.markdown( + f"Character repetition ratio: {character_repetition_ratio}" + ) + if is_doc_discarded(key, character_repetition_ratio): + is_discarded = True + + elif key[0] == "word_repetition_ratio": + word_repetition_ratio = Filtering.compute_word_repetition_ratio( + personal_doc, + self.sentencepiece_model_tok, + self.param["strip_characters"], + int(key[3]), + ) + word_repetition_ratio = round(word_repetition_ratio, 3) + st.markdown(f"Word repetition ratio: {word_repetition_ratio}") + if is_doc_discarded(key, word_repetition_ratio): + is_discarded = True + + elif key[0] == "special_characters_ratio": + special_characters_ratio = ( + Filtering.compute_special_characters_ratio( + personal_doc, self.param["special_characters"] + ) + ) + special_characters_ratio = round(special_characters_ratio, 3) + st.markdown( + f"Special characters ratio: {special_characters_ratio}" + ) + if is_doc_discarded(key, special_characters_ratio): + is_discarded = True + + elif key[0] == "stopwords_ratio": + stopwords_ratio = Filtering.compute_stopwords_ratio( + personal_doc, + self.sentencepiece_model_tok, + self.param["strip_characters"], + self.param["cond_words_augmentation"], + 
self.param["words_augmentation_group_sizes"], + self.param["words_augmentation_join_char"], + self.stopwords, + ) + stopwords_ratio = round(stopwords_ratio, 3) + st.markdown(f"Stop words ratio: {stopwords_ratio}") + if is_doc_discarded(key, stopwords_ratio): + is_discarded = True + + elif key[0] == "flagged_words_ratio": + flagged_words_ratio = Filtering.compute_flagged_words_ratio( + personal_doc, + self.sentencepiece_model_tok, + self.param["strip_characters"], + self.param["cond_words_augmentation"], + self.param["words_augmentation_group_sizes"], + self.param["words_augmentation_join_char"], + self.flagged_words, + ) + flagged_words_ratio = round(flagged_words_ratio, 3) + st.markdown(f"Flagged words ratio: {flagged_words_ratio}") + if is_doc_discarded(key, flagged_words_ratio): + is_discarded = True + + elif key[0] == "lang_id_score": + ( + lang_pred_dataset_id, + lang_id_score, + ) = Filtering.compute_lang_id_pred_score( + personal_doc, self.model_lang_id + ) + lang_id_score = round(lang_id_score, 3) + st.markdown( + f"Language identification confidence score: {lang_id_score}" + ) + if is_doc_discarded(key, lang_id_score) or ( + self.lang_dataset_id != lang_pred_dataset_id + ): + is_discarded = True + + elif key[0] == "perplexity_score": + perplexity_score = Filtering.compute_perplexity_score( + personal_doc, + self.sentencepiece_model, + self.kenlm_model, + ) + perplexity_score = round(perplexity_score, 3) + st.markdown(f"Perplexity score: {perplexity_score}") + if is_doc_discarded(key, perplexity_score): + is_discarded = True + + is_discarded = "" if is_discarded else "not " + st.markdown( + f"With the current filtering parameters, this document **is {is_discarded}discarded**."
+ ) + + def visualization_for_lang(self): + self.set_title() + self.open_data() + self.filtering_of_docs() + self.filtering_of_words() + self.download_parameters() + self.analyse_personal_doc() + + +class Visualization: + def __init__(self, path_instructions, param_visu_langs): + self.path_instructions = path_instructions + self.param_visu_langs = param_visu_langs + + def preamble(self): + def get_binary_file_downloader_html(bin_file, file_label="File"): + with open(bin_file, "rb") as f: + data = f.read() + bin_str = base64.b64encode(data).decode() + href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{bin_file}">{file_label}</a>' + return href + + st.markdown( + "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail in this " + + get_binary_file_downloader_html( + self.path_instructions, + "pdf", + ) + + ".", + unsafe_allow_html=True, + ) + + def warning_preamble(self): + st.markdown( + "This demo can be a little slow, and only allows you to process up to 5000 documents " + "for a decent speed. If you want to display up to three times more documents and have " + "a faster visualization, we invite you to run this " + "[code](https://github.com/bigscience-workshop/data_tooling/tree/master/ac_dc/visualization) " + "on your computer."
+ ) + + def choose_lang(self): + options = [ + self.param_visu_langs[lang_dataset_id]["lang"] + for lang_dataset_id in self.param_visu_langs + ] + index = options.index("English") if ("English" in options) else 0 + lang_chosen = st.selectbox( + label="Select the language for visualization", + options=options, + index=index, + ) + if lang_chosen != "None": + lang_chosen_dataset_id = langs_id.loc[ + langs_id["lang"] == lang_chosen, "dataset_id" + ].iloc[0] + visualization_for_lang = Visualization_for_lang( + path_data=self.param_visu_langs[lang_chosen_dataset_id]["path_data"], + lang=self.param_visu_langs[lang_chosen_dataset_id]["lang"], + num_docs=self.param_visu_langs[lang_chosen_dataset_id]["num_docs"], + num_docs_for_words=self.param_visu_langs[lang_chosen_dataset_id][ + "num_docs_for_words" + ], + max_len_text_display=self.param_visu_langs[lang_chosen_dataset_id][ + "max_len_text_display" + ], + lang_dataset_id=self.param_visu_langs[lang_chosen_dataset_id][ + "lang_dataset_id" + ], + path_fasttext_model=self.param_visu_langs[lang_chosen_dataset_id][ + "path_fasttext_model" + ], + path_sentencepiece_model=self.param_visu_langs[lang_chosen_dataset_id][ + "path_sentencepiece_model" + ], + path_kenlm_model=self.param_visu_langs[lang_chosen_dataset_id][ + "path_kenlm_model" + ], + ) + visualization_for_lang.visualization_for_lang() + + def visualization(self): + self.preamble() + self.warning_preamble() + self.choose_lang() + + +path_instructions = "./explanation_filtering_pipeline.pdf" + +param_visu_langs = { + lang_dataset_id: { + "path_data": f"./{lang_dataset_id}_examples_with_stats.json", + "lang": langs_id.loc[langs_id["dataset_id"] == lang_dataset_id, "lang"].iloc[0], + "num_docs": 5000, + "num_docs_for_words": 500, + "max_len_text_display": 10000, + "lang_dataset_id": lang_dataset_id, + "path_fasttext_model": "./lid.176.bin", + "path_sentencepiece_model": f"./{lang_dataset_id}.sp.model", + "path_kenlm_model": f"./{lang_dataset_id}.arpa.bin", + } + for 
lang_dataset_id in ["en", "pt"] +} + +visualization = Visualization(path_instructions, param_visu_langs) +visualization.visualization() diff --git a/en.arpa.bin b/en.arpa.bin new file mode 100644 index 0000000000000000000000000000000000000000..b74834cdda59ef28c35b172721256d427086ddff --- /dev/null +++ b/en.arpa.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04923fccbb4e63005c40f01d66112659416de01accd80d16e366a592289ee07a +size 4444690658 diff --git a/en.sp.model b/en.sp.model new file mode 100644 index 0000000000000000000000000000000000000000..d5cd3c4f88420f22d0a8a7123311ce894baec8ac --- /dev/null +++ b/en.sp.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf8147a573770b4e6c0d4df1dcb75453baa88190706dab406be7711b84f059de +size 931348 diff --git a/en_examples_with_stats.json b/en_examples_with_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..ec7242ca7b49a8c9faf23267418c60bfaeaad5a9 --- /dev/null +++ b/en_examples_with_stats.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dccf03710e9dc7ec68c676175e711be815bc29a50260f5d334156b03fe2e6d1 +size 241408394 diff --git a/explanation_filtering_pipeline.pdf b/explanation_filtering_pipeline.pdf new file mode 100644 index 0000000000000000000000000000000000000000..d8319eaec32419a168e1ede890fa1d5fc9547076 Binary files /dev/null and b/explanation_filtering_pipeline.pdf differ diff --git a/filtering.py b/filtering.py new file mode 100644 index 0000000000000000000000000000000000000000..eb2f4358b0a520098dd41a678a51250b4be88176 --- /dev/null +++ b/filtering.py @@ -0,0 +1,957 @@ +import re + +import numpy as np + +import fasttext + +import sentencepiece +import kenlm + +import pathlib + +from languages_id import langs_id +from parameters_filtering import parameters_filtering +from normalization import normalization +from stopwords import stopwords +from flagged_words import flagged_words + + +class LoadParameters: + 
@staticmethod + def load_parameters(lang_dataset_id): + if lang_dataset_id in parameters_filtering: + param = parameters_filtering[lang_dataset_id] + else: + param = parameters_filtering["default"] + return param + + @staticmethod + def load_stopwords(lang_dataset_id): + stopwords_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "stopwords_id" + ].iloc[0] + if stopwords_lang_id: + stopwords_lang = set(stopwords[stopwords_lang_id]) + else: + stopwords_lang = None + return stopwords_lang + + @staticmethod + def load_flagged_words(lang_dataset_id): + flagged_words_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "flagged_words_id" + ].iloc[0] + if flagged_words_lang_id: + flagged_words_lang = set(flagged_words[flagged_words_lang_id]) + else: + flagged_words_lang = None + return flagged_words_lang + + @staticmethod + def load_model_lang_id(lang_dataset_id, path_fasttext_model): + fasttext_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "fasttext_id" + ].iloc[0] + if fasttext_lang_id: + model_lang_id = fasttext.load_model(path_fasttext_model) + else: + model_lang_id = None + return model_lang_id + + @staticmethod + def load_sentencepiece_model(lang_dataset_id, path_sentencepiece_model): + sentencepiece_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "sentencepiece_id" + ].iloc[0] + if sentencepiece_lang_id: + sentencepiece_model = sentencepiece.SentencePieceProcessor() + sentencepiece_model.load(path_sentencepiece_model) + else: + sentencepiece_model = None + return sentencepiece_model + + @staticmethod + def load_kenlm_model(lang_dataset_id, path_kenlm_model): + kenlm_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "kenlm_id" + ].iloc[0] + if kenlm_lang_id: + kenlm_model = kenlm.Model(path_kenlm_model) + else: + kenlm_model = None + return kenlm_model + + +class ModifyingDocuments: + @staticmethod + def remove_empty_el_from_list(list_): + return [el for el in list_ if 
el] + + @staticmethod + def remove_non_printing_characters(document, non_printing_characters_re): + return non_printing_characters_re.sub("", document) + + @staticmethod + def uniform_whitespace( + document, + whitespace=[ + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + "", + "„", + ], + ): + """There are different whitespace characters.""" + whitespace = set(whitespace) + document = "".join( + [char if char not in whitespace else " " for char in document] + ) + return document + + @staticmethod + def replace_digits_with_zeros(document, digits_re): + return digits_re.sub("0", document) + + @staticmethod + def replace_unicode_punctuation(document, unicode_punctuation): + return "".join(unicode_punctuation.get(c, c) for c in document) + + @staticmethod + def normalization( + document, + remove_non_printing_characters, + strip, + lower_case, + uniform_whitespace, + replace_digits_with_zeros, + replace_unicode_punctuation, + non_printing_characters_re=normalization["non_printing_characters_re"], + digits_re=normalization["digits_re"], + unicode_punctuation=normalization["unicode_punctuation"], + ): + if remove_non_printing_characters: + document = ModifyingDocuments.remove_non_printing_characters( + document, non_printing_characters_re + ) + if strip: + document = document.strip() + if not document: + return document + if lower_case: + document = document.lower() + if uniform_whitespace: + document = ModifyingDocuments.uniform_whitespace(document) + if replace_digits_with_zeros: + document = ModifyingDocuments.replace_digits_with_zeros(document, digits_re) + if replace_unicode_punctuation: + document = ModifyingDocuments.replace_unicode_punctuation( + document, unicode_punctuation + ) + return document + + @staticmethod + def tokenization(document, sentencepiece_model, join_on_whitespace): + document_tokenized = sentencepiece_model.encode_as_pieces(document) + if join_on_whitespace: + document_tokenized = " ".join(document_tokenized) + return 
document_tokenized + + @staticmethod + def split_on_whitespace( + document, + new_line=False, + tab=False, + ): + """This method also removes concatenated spaces.""" + sep = [" "] + new_line * ["\n"] + tab * ["\t"] + sep = "|".join(sep) + split_document = re.split(sep, document) + split_document = ModifyingDocuments.remove_empty_el_from_list(split_document) + return split_document + + @staticmethod + def strip(document, strip_characters): + """Way faster than document.strip(strip_characters) + since strip_characters is now a set instead of a str, + and it contains a lot of elements (all the emojis).""" + if not document: + return document + beg_ind = 0 + end_ind = len(document) + for i in range(len(document)): + if document[i] in strip_characters: + beg_ind += 1 + else: + break + for i in range(1, len(document) + 1): + if document[-i] in strip_characters: + end_ind -= 1 + else: + break + document_stripped = document[beg_ind:end_ind] + return document_stripped + + @staticmethod + def get_words_from_document( + document, sentencepiece_model_tok, lower_case, strip_characters + ): + """Get words from a document. Non reversible since the document + is split on multiple characters, words are stripped of + special characters and characters are converted to lower case. 
+ Useful to compute ratios, like the stopwords ratio.""" + if sentencepiece_model_tok: + document_normalized = ModifyingDocuments.normalization( + document=document, + remove_non_printing_characters=True, + strip=True, + lower_case=True, + uniform_whitespace=True, + replace_digits_with_zeros=True, + replace_unicode_punctuation=True, + ) + words = ModifyingDocuments.tokenization( + document_normalized, sentencepiece_model_tok, join_on_whitespace=False + ) + else: + words = ModifyingDocuments.split_on_whitespace( + document, new_line=True, tab=True + ) + if lower_case: + words = [word.lower() for word in words] + if strip_characters: + words = [ModifyingDocuments.strip(word, strip_characters) for word in words] + words = ModifyingDocuments.remove_empty_el_from_list(words) + return words + + @staticmethod + def words_augmentation(words, group_size, join_char): + """Augment words, especially for Chinese (without a space between words) + and Vietnamese (with a space between syllables).""" + augmentation = [ + join_char.join(words[i : i + group_size]) + for i in range(len(words) - group_size + 1) + ] + return augmentation + + @staticmethod + def split_on_newline_tab_whitespace(document): + """First split on "\n", then on "\t", then on " ".""" + sentences = document.split("\n") + sentences = [sentence.split("\t") for sentence in sentences] + sentences = [ + [ + ModifyingDocuments.split_on_whitespace(subsentence) + for subsentence in sentence + ] + for sentence in sentences + ] + return sentences + + @staticmethod + def merge_on_whitespace_tab_newline(sentences): + """Invert the method split_on_newline_tab_whitespace. 
+ Removes concatenated separators.""" + sentences = [ + [" ".join(subsentence) for subsentence in sentence if subsentence] + for sentence in sentences + ] + sentences = ["\t".join(sentence) for sentence in sentences if sentence] + if not sentences: + return "" + document = "\n".join(sentences) + return document + + @staticmethod + def should_keep_word_with_incorrect_substrings( + word, strip_characters, incorrect_word_substrings + ): + word = ModifyingDocuments.strip(word, strip_characters) + should_keep = all( + [(i_substr not in word) for i_substr in incorrect_word_substrings] + ) + return should_keep + + @staticmethod + def remove_words_with_incorrect_substrings( + document, + strip_characters, + incorrect_word_substrings, + ): + sentences = ModifyingDocuments.split_on_newline_tab_whitespace(document) + sentences = [ + [ + [ + word + for word in subsentence + if ModifyingDocuments.should_keep_word_with_incorrect_substrings( + word, strip_characters, incorrect_word_substrings + ) + ] + for subsentence in sentence + ] + for sentence in sentences + ] + document = ModifyingDocuments.merge_on_whitespace_tab_newline(sentences) + return document + + @staticmethod + def should_keep_long_word(word, strip_characters, length_word_max_cutoff): + """If the word is too long but it contains only one + special character, it might be a concatenation of one word, + a punctuation, and another word, with no space between them. 
+ In this case, we give the word a pass.""" + if len(word) <= length_word_max_cutoff: + return True + word = ModifyingDocuments.strip(word, strip_characters) + if not word: # The word consisted only of strip characters + return False + if len(word) <= length_word_max_cutoff: + return True + return False + + def remove_long_words( + document, + strip_characters, + length_word_max_cutoff, + ): + sentences = ModifyingDocuments.split_on_newline_tab_whitespace(document) + sentences = [ + [ + [ + word + for word in subsentence + if ModifyingDocuments.should_keep_long_word( + word, + strip_characters, + length_word_max_cutoff, + ) + ] + for subsentence in sentence + ] + for sentence in sentences + ] + document = ModifyingDocuments.merge_on_whitespace_tab_newline(sentences) + return document + + @staticmethod + def modifying_documents( + document, + cond_uniform_whitespace, + cond_replace_unicode_punctuation, + cond_remove_words_with_incorrect_substrings, + strip_characters, + incorrect_word_substrings, + cond_remove_long_words, + length_word_max_cutoff, + ): + document = ModifyingDocuments.normalization( + document=document, + remove_non_printing_characters=False, + strip=True, + lower_case=False, + uniform_whitespace=cond_uniform_whitespace, + replace_digits_with_zeros=False, + replace_unicode_punctuation=cond_replace_unicode_punctuation, + ) + if cond_remove_words_with_incorrect_substrings: + document = ModifyingDocuments.remove_words_with_incorrect_substrings( + document, + strip_characters, + incorrect_word_substrings, + ) + if cond_remove_long_words: + document = ModifyingDocuments.remove_long_words( + document, + strip_characters, + length_word_max_cutoff, + ) + return document + + +class FunctionDatasetModifyingDocuments: + def __init__(self, lang_dataset_id): + self.lang_dataset_id = lang_dataset_id + self.param = LoadParameters.load_parameters(lang_dataset_id) + + def __call__(self, example): + example["text"] = ModifyingDocuments.modifying_documents( + 
document=example["text"], + cond_uniform_whitespace=self.param["cond_uniform_whitespace"], + cond_replace_unicode_punctuation=self.param[ + "cond_replace_unicode_punctuation" + ], + cond_remove_words_with_incorrect_substrings=self.param[ + "cond_remove_words_with_incorrect_substrings" + ], + strip_characters=self.param["strip_characters"], + incorrect_word_substrings=self.param["incorrect_word_substrings"], + cond_remove_long_words=self.param["cond_remove_long_words"], + length_word_max_cutoff=self.param["length_word_max_cutoff"], + ) + return example + + def __reduce__(self): + return (self.__class__, (self.lang_dataset_id,)) + + +class Filtering: + @staticmethod + def check_number_words( + document, + sentencepiece_model_tok, + strip_characters, + number_words_min_cutoff, + number_words_max_cutoff, + ): + words = ModifyingDocuments.get_words_from_document( + document, + sentencepiece_model_tok, + lower_case=False, + strip_characters=strip_characters, + ) + cond = (len(words) >= number_words_min_cutoff) and ( + len(words) <= number_words_max_cutoff + ) + return cond + + @staticmethod + def compute_character_repetition_ratio(document, character_repetition_length): + def get_freq_character_ngrams(document, n): + character_ngrams = [ + document[i : i + n] for i in range(len(document) - n + 1) + ] + freq_character_ngrams = {} + for character_ngram in character_ngrams: + freq_character_ngrams[character_ngram] = ( + freq_character_ngrams.get(character_ngram, 0) + 1 + ) + return freq_character_ngrams + + freq_character_ngrams = get_freq_character_ngrams( + document, character_repetition_length + ) + if len(freq_character_ngrams) == 0: + return 0 + freq_character_ngrams = list(freq_character_ngrams.values()) + freq_character_ngrams = sorted(freq_character_ngrams, reverse=True) + val_less_than_one = len([el for el in freq_character_ngrams if el > 1]) + num_rep_character_ngrams = min( + int(np.sqrt(len(freq_character_ngrams))), + len(freq_character_ngrams) - 
val_less_than_one, + ) + character_repetition_ratio = sum( + freq_character_ngrams[:num_rep_character_ngrams] + ) / sum(freq_character_ngrams) + return character_repetition_ratio + + @staticmethod + def check_character_repetition_removal( + document, + character_repetition_length, + character_repetition_max_cutoff, + ): + character_repetition_ratio = Filtering.compute_character_repetition_ratio( + document, character_repetition_length + ) + cond = character_repetition_ratio <= character_repetition_max_cutoff + return cond + + @staticmethod + def compute_word_repetition_ratio( + document, sentencepiece_model_tok, strip_characters, word_repetition_length + ): + def get_freq_word_ngrams( + document, sentencepiece_model_tok, strip_characters, n + ): + words = ModifyingDocuments.get_words_from_document( + document, + sentencepiece_model_tok, + lower_case=True, + strip_characters=strip_characters, + ) + word_ngrams = [ + " ".join(words[i : i + n]) for i in range(len(words) - n + 1) + ] + freq_word_ngrams = {} + for word_ngram in word_ngrams: + freq_word_ngrams[word_ngram] = freq_word_ngrams.get(word_ngram, 0) + 1 + return freq_word_ngrams + + freq_word_ngrams = get_freq_word_ngrams( + document, sentencepiece_model_tok, strip_characters, word_repetition_length + ) + if len(freq_word_ngrams) == 0: + return 0 + freq_word_ngrams = list(freq_word_ngrams.values()) + word_repetition_ratio = sum( + freq for freq in freq_word_ngrams if freq > 1 + ) / sum(freq_word_ngrams) + return word_repetition_ratio + + @staticmethod + def check_word_repetition_removal( + document, + sentencepiece_model_tok, + strip_characters, + word_repetition_length, + word_repetition_max_cutoff, + ): + word_repetition_ratio = Filtering.compute_word_repetition_ratio( + document, sentencepiece_model_tok, strip_characters, word_repetition_length + ) + cond = word_repetition_ratio <= word_repetition_max_cutoff + return cond + + @staticmethod + def compute_special_characters_ratio(document, special_characters): 
+ if len(document) == 0: + return 0 + special_characters_ratio = len( + [char for char in document if char in special_characters] + ) / len(document) + return special_characters_ratio + + @staticmethod + def check_special_characters( + document, + special_characters, + special_characters_max_cutoff, + ): + special_characters_ratio = Filtering.compute_special_characters_ratio( + document, special_characters + ) + cond = special_characters_ratio <= special_characters_max_cutoff + return cond + + @staticmethod + def compute_stopwords_ratio( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + stopwords, + ): + words = ModifyingDocuments.get_words_from_document( + document, + sentencepiece_model_tok, + lower_case=True, + strip_characters=strip_characters, + ) + if not words: + return 0 + augmentation = [] + if cond_words_augmentation: + augmentation = [ + ModifyingDocuments.words_augmentation( + words, group_size, words_augmentation_join_char + ) + for group_size in words_augmentation_group_sizes + ] + augmentation = [word for augm in augmentation for word in augm] + stopwords_ratio = len( + [word for word in words + augmentation if word in stopwords] + ) / len(words) + if stopwords_ratio > 1.0: + stopwords_ratio = 1.0 + return stopwords_ratio + + @staticmethod + def check_stopwords( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + stopwords, + stopwords_min_cutoff, + ): + cond = True + if stopwords: + stopwords_ratio = Filtering.compute_stopwords_ratio( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + stopwords, + ) + cond = stopwords_ratio >= stopwords_min_cutoff + return cond + + @staticmethod + def compute_flagged_words_ratio( + document, + 
sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + flagged_words, + ): + words = ModifyingDocuments.get_words_from_document( + document, + sentencepiece_model_tok, + lower_case=True, + strip_characters=strip_characters, + ) + if not words: + return 0 + augmentation = [] + if cond_words_augmentation: + augmentation = [ + ModifyingDocuments.words_augmentation( + words, group_size, words_augmentation_join_char + ) + for group_size in words_augmentation_group_sizes + ] + augmentation = [word for augm in augmentation for word in augm] + flagged_words_ratio = len( + [word for word in words + augmentation if word in flagged_words] + ) / len(words) + if flagged_words_ratio > 1.0: + flagged_words_ratio = 1.0 + return flagged_words_ratio + + @staticmethod + def check_flagged_words( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + flagged_words, + flagged_words_max_cutoff, + ): + cond = True + if flagged_words: + flagged_words_ratio = Filtering.compute_flagged_words_ratio( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + flagged_words, + ) + cond = flagged_words_ratio <= flagged_words_max_cutoff + return cond + + @staticmethod + def compute_lang_id_pred_score(document, model_lang_id): + document = document.lower().replace("\n", " ") + pred = model_lang_id.predict(document) + lang_pred_fasttext_id = pred[0][0].replace("__label__", "") + score_pred = pred[1][0] + lang_pred_dataset_id = langs_id.loc[ + langs_id["fasttext_id"] == lang_pred_fasttext_id, "dataset_id" + ] + if len(lang_pred_dataset_id) > 0: + lang_pred_dataset_id = lang_pred_dataset_id.iloc[0] + else: + lang_pred_dataset_id = "unknown" + return lang_pred_dataset_id, score_pred + + @staticmethod + def check_lang_id( + 
document, + lang_dataset_id, + model_lang_id, + lang_id_min_cutoff, + ): + cond = True + if model_lang_id: + lang_pred_dataset_id, score_pred = Filtering.compute_lang_id_pred_score( + document, model_lang_id + ) + cond = (lang_pred_dataset_id == lang_dataset_id) and ( + score_pred >= lang_id_min_cutoff + ) + return cond + + @staticmethod + def compute_perplexity_score(document, sentencepiece_model, kenlm_model): + document = ModifyingDocuments.normalization( + document=document, + remove_non_printing_characters=True, + strip=True, + lower_case=False, + uniform_whitespace=True, + replace_digits_with_zeros=True, + replace_unicode_punctuation=True, + ) + document = ModifyingDocuments.tokenization( + document, sentencepiece_model, join_on_whitespace=True + ) + doc_log_score, doc_length = 0, 0 + for line in document.split("\n"): + log_score = kenlm_model.score(line) + length = len(line.split()) + 1 + doc_log_score += log_score + doc_length += length + pp_score = 10.0 ** (-doc_log_score / doc_length) + pp_score = round(pp_score, 1) + return pp_score + + @staticmethod + def check_perplexity( + document, + sentencepiece_model, + kenlm_model, + perplexity_max_cutoff, + ): + cond = True + if kenlm_model: + score = Filtering.compute_perplexity_score( + document, sentencepiece_model, kenlm_model + ) + cond = score <= perplexity_max_cutoff + return cond + + @staticmethod + def filtering( + document, + cond_check_number_words, + sentencepiece_model_tok, + strip_characters, + number_words_min_cutoff, + number_words_max_cutoff, + cond_check_character_repetition_removal, + character_repetition_length, + character_repetition_max_cutoff, + cond_check_word_repetition_removal, + word_repetition_length, + word_repetition_max_cutoff, + cond_check_special_characters, + special_characters, + special_characters_max_cutoff, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + cond_check_stopwords, + stopwords, + stopwords_min_cutoff, + 
cond_check_flagged_words, + flagged_words, + flagged_words_max_cutoff, + cond_check_lang_id, + lang_dataset_id, + model_lang_id, + lang_id_min_cutoff, + cond_check_perplexity, + sentencepiece_model, + kenlm_model, + perplexity_max_cutoff, + ): + if cond_check_number_words: + if not Filtering.check_number_words( + document, + sentencepiece_model_tok, + strip_characters, + number_words_min_cutoff, + number_words_max_cutoff, + ): + return False + if cond_check_character_repetition_removal: + if not Filtering.check_character_repetition_removal( + document, + character_repetition_length, + character_repetition_max_cutoff, + ): + return False + if cond_check_word_repetition_removal: + if not Filtering.check_word_repetition_removal( + document, + sentencepiece_model_tok, + strip_characters, + word_repetition_length, + word_repetition_max_cutoff, + ): + return False + if cond_check_special_characters: + if not Filtering.check_special_characters( + document, + special_characters, + special_characters_max_cutoff, + ): + return False + if cond_check_stopwords: + if not Filtering.check_stopwords( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + stopwords, + stopwords_min_cutoff, + ): + return False + if cond_check_flagged_words: + if not Filtering.check_flagged_words( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + flagged_words, + flagged_words_max_cutoff, + ): + return False + if cond_check_lang_id: + if not Filtering.check_lang_id( + document, + lang_dataset_id, + model_lang_id, + lang_id_min_cutoff, + ): + return False + if cond_check_perplexity: + if not Filtering.check_perplexity( + document, + sentencepiece_model, + kenlm_model, + perplexity_max_cutoff, + ): + return False + return True + + +class FunctionDatasetFiltering: + def __init__( + self, + 
lang_dataset_id, + path_fasttext_model, + path_sentencepiece_model, + path_kenlm_model, + ): + self.lang_dataset_id = lang_dataset_id + self.path_fasttext_model = path_fasttext_model + self.path_sentencepiece_model = path_sentencepiece_model + self.path_kenlm_model = path_kenlm_model + + self.param = LoadParameters.load_parameters(lang_dataset_id) + self.stopwords = LoadParameters.load_stopwords(lang_dataset_id) + self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id) + self.model_lang_id = LoadParameters.load_model_lang_id( + lang_dataset_id, path_fasttext_model + ) + self.sentencepiece_model = LoadParameters.load_sentencepiece_model( + lang_dataset_id, path_sentencepiece_model + ) + self.sentencepiece_model_tok = ( + self.sentencepiece_model if self.param["tokenization"] else None + ) + self.kenlm_model = LoadParameters.load_kenlm_model( + lang_dataset_id, path_kenlm_model + ) + + def __call__(self, example): + keep_example = Filtering.filtering( + document=example["text"], + cond_check_number_words=self.param["cond_check_number_words"], + sentencepiece_model_tok=self.sentencepiece_model_tok, + strip_characters=self.param["strip_characters"], + number_words_min_cutoff=self.param["number_words_min_cutoff"], + number_words_max_cutoff=self.param["number_words_max_cutoff"], + cond_check_character_repetition_removal=self.param[ + "cond_check_character_repetition_removal" + ], + character_repetition_length=self.param["character_repetition_length"], + character_repetition_max_cutoff=self.param[ + "character_repetition_max_cutoff" + ], + cond_check_word_repetition_removal=self.param[ + "cond_check_word_repetition_removal" + ], + word_repetition_length=self.param["word_repetition_length"], + word_repetition_max_cutoff=self.param["word_repetition_max_cutoff"], + cond_check_special_characters=self.param["cond_check_special_characters"], + special_characters=self.param["special_characters"], + 
special_characters_max_cutoff=self.param["special_characters_max_cutoff"], + cond_words_augmentation=self.param["cond_words_augmentation"], + words_augmentation_group_sizes=self.param["words_augmentation_group_sizes"], + words_augmentation_join_char=self.param["words_augmentation_join_char"], + cond_check_stopwords=self.param["cond_check_stopwords"], + stopwords=self.stopwords, + stopwords_min_cutoff=self.param["stopwords_min_cutoff"], + cond_check_flagged_words=self.param["cond_check_flagged_words"], + flagged_words=self.flagged_words, + flagged_words_max_cutoff=self.param["flagged_words_max_cutoff"], + cond_check_lang_id=self.param["cond_check_lang_id"], + lang_dataset_id=self.lang_dataset_id, + model_lang_id=self.model_lang_id, + lang_id_min_cutoff=self.param["lang_id_min_cutoff"], + cond_check_perplexity=self.param["cond_check_perplexity"], + sentencepiece_model=self.sentencepiece_model, + kenlm_model=self.kenlm_model, + perplexity_max_cutoff=self.param["perplexity_max_cutoff"], + ) + return keep_example + + def __reduce__(self): + return ( + self.__class__, + ( + self.lang_dataset_id, + self.path_fasttext_model, + self.path_sentencepiece_model, + self.path_kenlm_model, + ), + ) + + +class DatasetFiltering: + def __init__( + self, + dataset, + lang_dataset_id, + path_fasttext_model, + path_sentencepiece_model, + path_kenlm_model, + num_proc, + path_dir_save_dataset, + ): + self.ds = dataset + self.lang_dataset_id = lang_dataset_id + self.path_fasttext_model = path_fasttext_model + self.path_sentencepiece_model = path_sentencepiece_model + self.path_kenlm_model = path_kenlm_model + self.num_proc = num_proc + self.path_dir_save_dataset = path_dir_save_dataset + + def modifying_documents(self): + func_dataset_modifying_documents = FunctionDatasetModifyingDocuments( + self.lang_dataset_id + ) + self.ds = self.ds.map(func_dataset_modifying_documents, num_proc=self.num_proc) + + def filtering(self): + func_dataset_filtering = FunctionDatasetFiltering( + 
self.lang_dataset_id, + self.path_fasttext_model, + self.path_sentencepiece_model, + self.path_kenlm_model, + ) + self.ds = self.ds.filter(func_dataset_filtering, num_proc=self.num_proc) + + def save_dataset(self): + pathlib.Path(self.path_dir_save_dataset).mkdir(parents=True, exist_ok=True) + path_dir_save_dataset = pathlib.PurePath( + self.path_dir_save_dataset, self.lang_dataset_id + ) + pathlib.Path(path_dir_save_dataset).mkdir(parents=True, exist_ok=True) + self.ds.save_to_disk(path_dir_save_dataset) diff --git a/flagged_words.py b/flagged_words.py new file mode 100644 index 0000000000000000000000000000000000000000..dcec0e8243ecc8c9b9e65014d05812d8ac8fdc4b --- /dev/null +++ b/flagged_words.py @@ -0,0 +1,1055 @@ +# Merge +# https://github.com/zacanger/profane-words +# and +# https://github.com/thisandagain/washyourmouthoutwithsoap/blob/develop/data/build.json +# and +# https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words + + +english_flagged_words = [ + "anal", + "bareback", + "bbw", + "bdsm", + "blowjob", + "blowjobs", + "brazzers", + "bukkake", + "camgirl", + "camwhore", + "cocksucking", + "cougar", + "creampie", + "cuckold", + "cum", + "cumming", + "cums", + "cumshot", + "cumshots", + "cumslut", + "cunnilingus", + "deepthroat", + "deepthroating", + "dildo", + "dildos", + "dogging", + "doggystyle", + "dominatrix", + "erotic", + "fellatio", + "femdom", + "fingering", + "fisting", + "footjob", + "gangbang", + "handjob", + "hentai", + "horney", + "horniest", + "horny", + "jism", + "jizz", + "masterbating", + "masturbate", + "masturbating", + "masturbation", + "milf", + "orgies", + "orgy", + "pegging", + "porn", + "pornhub", + "porno", + "pornos", + "pornstar", + "pornstars", + "redtube", + "rimming", + "slutty", + "squirting", + "strapon", + "threesome", + "vibrator", + "xhamster", + "xnxx", + "xvideos", + "xxx", + "youporn", +] + + +flagged_words = { + "ar": english_flagged_words + + [ + "إباحي", + "احتلام", + "است", + "استمناء", + 
"اغتصاب", + "أورغازم", + "إيروتيك", + "أير", + "بز", + "بزاز", + "بظر", + "بورن", + "بيضان", + "مص", + "ثدي", + "جماع", + "جنس", + "حلمة", + "خلاعة", + "خنثي", + "خول", + "دعارة", + "زب", + "سحاق", + "سحاقية", + "سكس", + "سيكس", + "شاذ", + "شبق", + "شرج", + "شرموطة", + "شهواني", + "شهوة", + "طيز", + "عادة السرية", + "عاهرة", + "عرص", + "فاسقة", + "فرج", + "قحبة", + "قضيب", + "كس", + "لحس", + "لعق", + "لواط", + "لوطي", + "مؤخرة", + "متناك", + "متناكة", + "مومس", + "مثير", + "مص", + "مضاجعة", + "مفلقسة", + "مني", + "مهتاج", + "نشوة", + "نكاح", + "نيك", + ], + "ca": english_flagged_words + + [ + "cagarro", + "cagarros", + "cipote", + "cipotes", + "collons", + "colló", + "consolador", + "consoladors", + "cony", + "conys", + "corre's", + "corre't", + "corregut", + "cunillingus", + "córrer-se", + "escorreguda", + "escorregudes", + "escorregut", + "escrot", + "escrots", + "escórre's", + "escórre't", + "escórrer-se", + "mamada", + "mamadera", + "mamaderes", + "mamades", + "masturba", + "masturbacions", + "masturbació", + "masturbant", + "masturbar", + "masturbar-se", + "masturbat", + "masturbats", + "masturbes", + "orgasme", + "orgasmes", + "ou", + "ous", + "palla", + "palles", + "pornografia", + "semen", + "semens", + "verga", + "vergues", + "xxx", + ], + "en": english_flagged_words, + "es": english_flagged_words + + [ + "chupar el coño", + "chupar la concha", + "chupar la polla", + "chupar la verga", + "comer el coño", + "comer la concha", + "comer la polla", + "comer la verga", + "coprofagía", + "correrse", + "cunillingus", + "fagging", + "felación", + "felching", + "follada", + "follador de culo", + "folladores", + "fudge packer", + "hacer una paja", + "hacerse una paja", + "hore", + "kock", + "macizorra", + "madre folladora", + "mamada", + "perro follador", + "pisser", + "pornografía", + "sado", + "sadomasoquismo", + "sadomasoquista", + "sexo anal", + "skank", + "smegma", + "x clasificado", + ], + "eu": english_flagged_words + [], + "fr": english_flagged_words + + [ + 
"baiseurs", + "baiseur", + "baiseuse", + "baiseuses", + "branlette", + "branlettes", + "branleuse", + "branleuses", + "cunillingus", + "cunilingus", + "enculée", + "enculées", + "enculation", + "enculations", + "enculement", + "enculements", + "fellation", + "fellations", + "porno", + "pornos", + "pornographie", + "pornographique", + "pornographiques", + "salope", + "salopes", + "suceuse", + "suceuses", + "xxx", + ], + "ha": english_flagged_words + + [ + "bf", + "bura", + "burar", + "cin gindi", + "duri", + "durin", + "gindi", + "gindin", + "guntsu", + "guntsun", + "karuwa", + "karuwai", + "karuwar", + "maniyyi", + "maɗigo", + "maɗugo", + "nonuwa", + "shan gindi", + "tsuliya", + "tsuliyariskanci", + "ɗuwaiwai", + "ɗuwaiwan", + "ɗuwawu", + "ɗuwawun", + ], + "hi": english_flagged_words + + [ + "अंडकोश की थैली", + "एक्स रेटेड", + "ओगाज़्म", + "कामोद्दीपक चित्र", + "कालीन का चूरा", + "कून", + "कॉक", + "गेंद का थैला", + "चाकलेट का रंग", + "चूची", + "चूतड़", + "झटका बंद", + "ठगना पैकर", + "डिल्डो", + "नितंब", + "पिछाड़ी", + "पीड़न कामुक", + "पॉर्न", + "फटना", + "फूहड़", + "बट", + "बहुत मदहोश", + "बेल अंत", + "भगवान-शापित", + "भगशेफ", + "माँ कमीने", + "मुखमैथुन", + "मुर्गा चूसने वाला", + "रक्तरंजित", + "लेबिया", + "वहशी", + "वहशीता", + "वैंग", + "शिश्नमल", + "संभोग सुख", + "सह शॉट", + "सींग का बना हुआ", + "होर", + "घपा घप", + "चुदाई", + "चुदक्कड़", + ], + "id": english_flagged_words + + [ + "bokep", + "coli", + "colmek", + "grepe", + "horni", + "janda", + "jembut", + "jilat memek", + "jilmek", + "kontol", + "masturbasi", + "memek", + "ngentot", + "ngewe", + "peju", + "pepek", + "pornografi", + "sange", + "sepong", + "tusbol", + ], + "kn": english_flagged_words + + [ + "ಗರ್ಭಪಾತ", + "ಗುದ", + "ಗುದದ್ವಾರ", + "ಕತ್ತೆ", + "ಆಶ್-ಫಕರ್", + "ಅಸ್ಹೋಲ್", + "ಅಸೋಲೆಸ್", + "ಬಾಲ್ಬಾಗ್", + "ಚೆಂಡುಗಳು", + "ಬಾಸ್ಟರ್ಡ್", + "ಬೆಲೆಂಡ್", + "ಮೃದ್ವಂಗಿ", + "ಪ್ರಾಣಿಜನ್ಯತೆ", + "ಬಿಚ್", + "ಬಿಟ್ಚಿಸ್", + "ಬೆಚಿಂಗ್", + "ರಕ್ತಸಿಕ್ತ", + "ಬ್ಲೋಜಾಬ್", + "ಬೊಲ್ಲೊಕ್", + "ಕುರುಚಲು ಗಿಡ", + "ಬೂಬಿಗಳು", + "ಸ್ತನಗಳನ್ನು", + 
"ಬುಕೆಟಾ", + "ತಿಕ", + "ಬಟ್", + "ಕಾರ್ಪೆಟ್ ಮಂಚರ್", + "ಚಿಂಕ್", + "ಸಿಪಾ", + "ಚಂದ್ರನಾಡಿ", + "ಕೋಳಿ", + "ಕೋಳಿ ಸಕ್ಕರ್", + "ಕಾಕ್ಸ್", + "ಕೂನ್", + "ಅಮೇಧ್ಯ", + "ಕಮ್", + "ಕಮ್ಶಾಟ್", + "ಕುನಿಲ್ಲಸ್", + "ಕಂಟ್", + "ಡ್ಯಾಮ್", + "ಡಿಕ್", + "ದ್ವಿಧ್ರುವಿ", + "dildos", + "ಡಿಂಕ್", + "ನಾಯಿ-ಫಕರ್", + "ಡಚೆ", + "ಡೈಕ್", + "ಹೊರಹೊಮ್ಮಿಸು", + "ಸ್ಫೂರ್ತಿ", + "ಎಜಾಕ್ಯುಲೇಟ್ಸ್", + "ಇಜಲಲೇಟಿಂಗ್", + "ಉದ್ಗಾರ", + "ತಮಾಷೆ", + "ಮಂದಗತಿ", + "ಮಬ್ಬು", + "fagots", + "ಫ್ಯಾನಿ", + "ಹೊಡೆತ", + "ಪತನ", + "ಚಾಚುಪಟ್ಟಿ", + "ಫಕ್", + "ನಾಶವಾಗಿದ್ದನು", + "ಫಕರ್", + "fuckers", + "ಫಕಿಂಗ್", + "ಫಕಿಂಗ್ಸ್", + "ಇಷ್ಟಪಡುತ್ತಾನೆ", + "ಮಿಠಾಯಿ ಪ್ಯಾಕರ್", + "ದೇವರನ್ನು ಹಾನಿಗೊಳಗಾಯಿತು", + "ಗಾಡ್ಡಮ್", + "ನರಕ", + "ಹೋರ್", + "ಮೊನಚಾದ", + "ಜರ್ಕ್-ಆಫ್", + "ಕೋಕ್", + "ಯೋನಿಯ", + "ಕಾಮ", + "ಕಾಮುಕ", + "ಮಾಸೋಚಿಸ್ಟ್", + "ಹಸ್ತಮೈಥುನ ಮಾಡು", + "ತಾಯಿ ಫಕರ್", + "ನಾಜಿ", + "ನಿಗರ್", + "ನಿಗ್ಗರ್ಗಳು", + "ಒರಾಸಿಮ್", + "ಪರಾಕಾಷ್ಠೆ", + "ಪರಾಕಾಷ್ಠೆಗಳನ್ನು", + "ಪೆಕರ್", + "ಶಿಶ್ನ", + "ಮೂತ್ರ ವಿಸರ್ಜಿಸು", + "ನಿರುತ್ಸಾಹಗೊಂಡಿದೆ", + "ಪಿಸರ್", + "ಮೂತ್ರಪಿಂಡಗಳು", + "pissing", + "ಪಿಸ್ಸಾಫ್", + "ಪೂಪ್", + "ಅಶ್ಲೀಲತೆ", + "ಅಶ್ಲೀಲ", + "ಚುಚ್ಚು", + "ಪ್ರಿಕ್ಸ್", + "ಪಬ್", + "ಪುಸಿಗಳು", + "ಪುಸಿ", + "ಅತ್ಯಾಚಾರ", + "ಅತ್ಯಾಚಾರಿ", + "ಗುದನಾಳದ", + "ರಿಟಾರ್ಡ್", + "ಹಚ್ಚುವುದು", + "ದುಃಖಗಾರ", + "ತಿರುಗಿಸುವುದು", + "ಸ್ಕ್ರೋಟಮ್", + "ವೀರ್ಯ", + "ಲೈಂಗಿಕತೆ", + "ಶಾಗ್", + "ಶಾಗ್ಗಿಂಗ್", + "ಶೆಮೇಲ್", + "ಶಿಟ್", + "ಷೈಟ್", + "ಶಿಟ್ಸ್", + "shitted", + "ಅಲುಗಾಡುವಿಕೆ", + "ಅಸಹ್ಯ", + "ಸ್ಕಾಂಕ್", + "ಸೂಳೆ", + "ಸ್ಲಟ್ಗಳು", + "ಸ್ಮೆಗ್ಮಾ", + "ಕೊಳೆತ", + "ಸ್ನ್ಯಾಚ್", + "ಮಗ-ಆಫ್-ಬಿಚ್", + "spac", + "ಉಬ್ಬು", + "ವೃಷಣ", + "ಟಿಟ್", + "ಚೇಕಡಿ ಹಕ್ಕಿಗಳು", + "turd", + "ಯೋನಿ", + "ವಯಾಗ್ರ", + "ವಾಂಗ್", + "ಮುಷ್ಕರ", + "x ರೇಟೆಡ್", + "xxx", + ], + "ml": english_flagged_words + + [ + "ഗർഭഛിദ്രം", + "വിശപ്പ്", + "മലദ്വാരം", + "കഴുത", + "അസി ഫക്കർ", + "കഴുതകളെ", + "ആസ്ഹോൾ", + "അശ്ളീലങ്ങൾ", + "ബോൾബാഗ്", + "പന്തുകൾ", + "തന്തയില്ലാത്തവൻ", + "ബെല്ലെൻഡ്", + "മൃഗീയമായ", + "മൃഗീയത", + "ബിച്ച്", + "ബിച്ചുകൾ", + "ബിപിഡിംഗ്", + "രക്തരൂക്ഷിതമായ", + "ആശ്വാസം", + "ബലോക്ക്", + "ബോബ്", + "പൂക്കൾ", + "സ്തനങ്ങൾ", + "ബ്യൂട്ടാ", + "ബം", + "മയക്കുമരുന്ന്", + "പരവതാനി മാൻച്ചർ", + "ചുംബ്", + "സിപാ", + "ക്ലോറിസിസ്", + 
"കോക്ക്", + "കോക്ക് സക്കർ", + "കോക്സ്", + "കോൺ", + "ക്രാപ്പ്", + "ശുക്ലം", + "പുരുഷാരം", + "സി", + "മുഷിഞ്ഞ", + "കഷ്ടം", + "ഡിക്ക്", + "ഡിൽഡോ", + "dildos", + "ഡൈൻ", + "നായ-ഫക്കർ", + "ഡച്ച്", + "ഡൈകെ", + "ശമിപ്പിക്കുക", + "മോഷ്ടിച്ചു", + "വികാരങ്ങൾ", + "വിരസത", + "മടി", + "ക്ഷീണിപ്പിക്കുക", + "fagot", + "വഞ്ചന", + "ഫാനി", + "വേദന", + "flange", + "ഊമ്പി", + "സംഭോഗം ചെയ്യുക", + "ഫക്കർ", + "നർമ്മം", + "ഫഡ്ജ് പാക്കർ", + "ദൈവം-കൊള്ളിത", + "ഗോഡ്ഡം", + "നരകം", + "വയ്ക്കുക", + "വൃത്തികെട്ട", + "ജെർക് ഓഫ്", + "കിക്ക്", + "ലാബിയ", + "മോഹം", + "മോഹഭംഗം", + "മാസോച്ചിസ്റ്റ്", + "സ്വയംഭോഗം ചെയ്യുക", + "അമ്മ ഫക്കർ", + "നാസി", + "നിഗർ", + "മയക്കുമരുന്നുകൾ", + "രതിമൂർച്ഛ", + "പെക്കർ", + "ലിംഗം", + "മൂത്രമൊഴിക്കുക", + "കുഴഞ്ഞുവീഴുന്നു", + "പിസ്സർ", + "പിസ്സകൾ", + "pissing", + "പിസ്സോഫ്", + "poop", + "അശ്ലീലം", + "അശ്ലീലത", + "പ്രാവി", + "വിസർജ്യങ്ങൾ", + "പ്യൂബ്", + "pussies", + "pussy", + "ബലാൽസംഗം", + "ബലാത്സംഗം", + "മലാശയം", + "തുടരുക", + "റിമ്മിംഗ്", + "സചിസ്റ്റ്", + "വഞ്ചി", + "പുല്ല്", + "ബീജം", + "ശവം", + "ഷാഗിംഗ്", + "അവൾ", + "ഷീറ്റ്", + "ഷെയ്റ്റ്", + "shits", + "തിന്നിട്ടില്ല", + "ഷോർട്ട്", + "ഷൈറ്റി", + "സ്കാൻ", + "മന്ദഹസരം", + "സ്നെഗമാ", + "പുഞ്ചിരി", + "പിടിക്കുക", + "വെറുക്കപ്പെട്ടയാൾ", + "സ്പെയ്ക്", + "തുളച്ച്", + "വൃഷണം", + "പേ", + "ടിത്ത്", + "കുഴപ്പമില്ല", + "യോനി", + "വരാഗ്ര", + "വാൽവ", + "വാങ്", + "വാൻ", + "വേശ്യ", + "x റേറ്റുചെയ്തു", + "xxx", + ], + "mr": english_flagged_words + + [ + "गर्भपात", + "गुदा", + "गाढव", + "गांडुळ", + "asses", + "asshole", + "assholes", + "ballbag", + "चेंडू", + "बॅस्टर्ड", + "बेलेंड", + "बेस्टियल", + "प्राण्यांबरोबर", + "कुत्री", + "बिट्स", + "खूनी", + "blowjob", + "बोलोक", + "बोब", + "स्तन", + "बसीटा", + "बम", + "बट", + "कार्पेट मुन्चर", + "चिंक", + "सिपा", + "क्लिटोरिस", + "मुर्ख", + "मांसाहारी", + "कॉक्स", + "कॉनन", + "बकवास", + "सह", + "cumshot", + "कनिलिंगस", + "कांट", + "धिक्कार", + "डिक", + "dildo", + "डिल्डो", + "डंक", + "duche", + "डाईक", + "उद्गार", + "उत्साही", + "ejaculates", + "उत्सुकता", + "स्खलन", + "फॅग", + "फॅगिंग", + 
"फॅगॉट", + "फॅगॉट्स", + "फॅनी", + "फेलिंग", + "फॅलेटीओ", + "निकला", + "fucked", + "गुप्तचर", + "fuckers", + "fucking", + "fuckings", + "fucks", + "फडगे पॅकर", + "देव-शापित", + "देव", + "नरक", + "होरे", + "शिंग", + "झटका बंद", + "कॉक", + "लॅबिया", + "वासना", + "मासोचिस्ट", + "हस्तमैथुन करा", + "आई माकड", + "नाझी", + "निगर", + "निगार", + "ऑर्गॅसिम", + "संभोग", + "orgasms", + "चापटी", + "पुरुषाचे जननेंद्रिय", + "पेशी", + "pissed", + "पिसर", + "pisses", + "पिसिंग", + "पिसोफ", + "घाट", + "अश्लील", + "पोर्नोग्राफी", + "मुरुम", + "प्रिक्स", + "प्यूब", + "pussies", + "मांजर", + "बलात्कार", + "गुदाशय", + "मंद", + "rimming", + "दुःखी", + "screwing", + "स्क्रोटम", + "वीर्य", + "लिंग", + "शेग", + "shagging", + "शेमले", + "विचित्र", + "shite", + "shits", + "shitted", + "shitting", + "shitty", + "घाणेरडा", + "फट", + "sluts", + "सुगंध", + "स्मट", + "छेडछाड", + "मुलगा-एक-कुत्री", + "spac", + "तिरस्कार", + "परीक्षक", + "शीर्षक", + "टिट", + "टर्ड", + "योनी", + "वियाग्रा", + "वल्वा", + "वांग", + "विंक", + "वेश्या", + "एक्स रेट केले", + "xxx", + ], + "pt": english_flagged_words + + [ + "balalao", + "bate uma", + "beijo grego", + "boceta", + "boquete", + "buceta", + "caralho", + "chochota", + "coito", + "cona", + "consolo", + "corno", + "cu", + "dar a bunda", + "dar o rabo", + "dildo", + "dildos", + "esporrar", + "estrovenga", + "felação", + "filho da puta", + "filhos da puta", + "gozada", + "jeba", + "perereca", + "pica", + "piru", + "porno", + "pornografia", + "pornô", + "porra", + "prostituta", + "pube", + "punheta", + "punheteiro", + "putaria", + "queca", + "sexo", + "siririca", + "tesão", + "trepada", + "verga", + "vibrador", + "xana", + "xochota", + "xoxota", + ], + "ta": english_flagged_words + + [ + "ஓதா", + "ஒத்தா", + "புண்டை", + "ஒம்மாளே", + "பக்கி", + "கூமுட்டை", + "கருமம்", + "சனியன்", + "கஸ்மாலம்", + "சூத்து", + ], + "te": english_flagged_words + + [ + "గర్భస్రావం", + "అంగ", + "పాయువు", + "గాడిద", + "గాడిద-fucker", + "asses", + "assholes", + "బాల్బ్యాగ్", + "బంతుల్లో", + 
"బాస్టర్డ్", + "బెల్లెండ్", + "మృగ", + "బెస్టియాలిటీ", + "బిచ్", + "bitches", + "బిట్చింగ్", + "బ్లడీ", + "blowjob", + "బోల్లక", + "బూబ్", + "వక్షోజాలను", + "ఛాతీ", + "buceta", + "బం", + "బట్", + "కార్పెట్ ముంచర్", + "చింక్", + "cipa", + "స్త్రీగుహ్యాంకురము", + "ఆత్మవిశ్వాసం", + "కాక్-సక్కర్", + "కాక్స్", + "కూన్", + "చెత్త", + "కం", + "cumshot", + "క్యునిల్లింగస్", + "కంట్", + "తిట్టు", + "డిక్", + "లైంగిక సంతృప్తి కోసం స్త్రీలు ఉపయోగించే పురుషాంగము వంటి పరికరము", + "డిల్డోస్", + "dink", + "కుక్క-fucker", + "డూష్", + "డైక్", + "స్ఖలించు", + "ఎజాక్యులేటెడ్", + "ఎజాక్యులేట్స్", + "ఎరాక్యులేటింగ్", + "స్ఖలనం", + "నవుకరు", + "ఫాగ్గింగ్", + "ఫాగాట్", + "ఫగాట్స్", + "fanny", + "ఫెల్చింగ్", + "కుడుచుట", + "అచ్చు", + "ఫక్", + "ఇబ్బంది పెట్టాడు", + "fucker", + "ఫకర్స్", + "ఫకింగ్", + "ఫకింగ్స్", + "ఫక్స్", + "ఫడ్జ్ ప్యాకర్", + "దేవతలా మంచిది", + "గాడ్డామ్", + "నరకం", + "హోర్", + "horny", + "జెర్క్-ఆఫ్", + "కాక్", + "పెదవి", + "కామం", + "మనసు పడ్డట్లు చిత్రించారు", + "masochist", + "హస్తప్రయోగం", + "తల్లి ఫెకర్", + "నాజీ", + "నిగ్గర్", + "నిగ్గర్స్", + "ఆర్గాసిమ్", + "స్కలనం", + "orgasms", + "pecker", + "పురుషాంగం", + "విసర్జన", + "pissed", + "పిస్సర్", + "పిస్సీస్", + "పిస్సింగ్", + "పిస్సాఫ్", + "poop", + "శృంగార", + "పోర్నో", + "అశ్లీల", + "బుడతడు", + "ప్రిక్స్", + "ప్యూబ్", + "pussies", + "పుస్సీ", + "రేప్", + "ఉన్నప్పటికీ బలాత్కారం", + "పురీషనాళం", + "రిటార్డ్", + "రిమ్మింగ్", + "పీడన కాముకత", + "screwing", + "స్క్రోటమ్", + "వీర్యం", + "సెక్స్", + "బొచ్చు", + "షగ్గింగ్", + "షీమేల్", + "ఒంటి", + "షైట్", + "షిట్స్", + "షిట్టెడ్", + "షిట్టింగ్", + "shitty", + "స్కాన్క్", + "నీతి", + "స్లట్స్", + "శిశ్న", + "స్మట్", + "స్నాచ్", + "ఒక బిచ్ కుమారుడు ఆఫ్", + "spac", + "స్పంక్", + "వృషణాలు", + "తునక", + "టిట్స్", + "టిట్", + "turd", + "యోని", + "వయాగ్రా", + "జననాంగం", + "వాంగ్", + "వ్యాంక్", + "వేశ్య", + "x రేట్", + "xxx", + ], + "vi": english_flagged_words + + [ + "cặc", + "lồn", + "địt", + "buồi", + "đụ", + "nứng", + "phò", + "bú lol", + "bú lon", + "bú cac", + "bú cak", + 
"bú cax", + "bú buoi", + "bú mút", + "húp lol", + "húp lon", + "húp sò", + "nung lol", + "nung lon", + "nug lon", + "nung cac", + "nung buoi", + "thèm dit", + "thèm lon", + "thèm lol", + "thèm buoi", + "thèm dit", + "thèm chich", + "thèm chịch", + "thèm cax", + "thèm cak", + "móc lol", + "móc lon", + "buscu", + "lol non", + "bướm non", + "vú to", + "vú bự", + "lon to", + "phim sex", + "phim xex", + "phim xxx", + ], + "zh": english_flagged_words + + [ + "三级片", + "乳交", + "卖B", + "卖比", + "卖淫", + "口交", + "姣西", + "打炮", + "打飞机", + "抽插", + "招妓", + "插你", + "插死你", + "淫水", + "肉壁", + "肉棍子", + "肉缝", + "肏", + "肛交", + ], +} diff --git a/languages_id.py b/languages_id.py new file mode 100644 index 0000000000000000000000000000000000000000..7bd1a696b8fc56a3cf58e3f0abc2396810f94898 --- /dev/null +++ b/languages_id.py @@ -0,0 +1,222 @@ +import pandas as pd + + +langs_id = [ + { + "lang": "Afrikaans", + "dataset_id": "af", + "stopwords_id": "af", + "flagged_words_id": None, + "fasttext_id": "af", + "sentencepiece_id": "af", + "kenlm_id": "af", + }, + { + "lang": "Arabic", + "dataset_id": "ar", + "stopwords_id": "ar", + "flagged_words_id": "ar", + "fasttext_id": "ar", + "sentencepiece_id": "ar", + "kenlm_id": "ar", + }, + { + "lang": "Egyptian Arabic", + "dataset_id": "arz", + "stopwords_id": None, + "flagged_words_id": None, + "fasttext_id": "arz", + "sentencepiece_id": "arz", + "kenlm_id": "arz", + }, + { + "lang": "Assamese", + "dataset_id": "as", + "stopwords_id": None, + "flagged_words_id": None, + "fasttext_id": "as", + "sentencepiece_id": "as", + "kenlm_id": "as", + }, + { + "lang": "Bengali", + "dataset_id": "bn", + "stopwords_id": "bn", + "flagged_words_id": None, + "fasttext_id": "bn", + "sentencepiece_id": "bn", + "kenlm_id": "bn", + }, + { + "lang": "Catalan", + "dataset_id": "ca", + "stopwords_id": "ca", + "flagged_words_id": "ca", + "fasttext_id": "ca", + "sentencepiece_id": "ca", + "kenlm_id": "ca", + }, + { + "lang": "English", + "dataset_id": "en", + "stopwords_id": 
"en", + "flagged_words_id": "en", + "fasttext_id": "en", + "sentencepiece_id": "en", + "kenlm_id": "en", + }, + { + "lang": "Spanish", + "dataset_id": "es", + "stopwords_id": "es", + "flagged_words_id": "es", + "fasttext_id": "es", + "sentencepiece_id": "es", + "kenlm_id": "es", + }, + { + "lang": "Basque", + "dataset_id": "eu", + "stopwords_id": "eu", + "flagged_words_id": "eu", + "fasttext_id": "eu", + "sentencepiece_id": "eu", + "kenlm_id": "eu", + }, + { + "lang": "French", + "dataset_id": "fr", + "stopwords_id": "fr", + "flagged_words_id": "fr", + "fasttext_id": "fr", + "sentencepiece_id": "fr", + "kenlm_id": "fr", + }, + { + "lang": "Gujarati", + "dataset_id": "gu", + "stopwords_id": None, + "flagged_words_id": None, + "fasttext_id": "gu", + "sentencepiece_id": "gu", + "kenlm_id": "gu", + }, + { + "lang": "Hindi", + "dataset_id": "hi", + "stopwords_id": "hi", + "flagged_words_id": "hi", + "fasttext_id": "hi", + "sentencepiece_id": "hi", + "kenlm_id": "hi", + }, + { + "lang": "Indonesian", + "dataset_id": "id", + "stopwords_id": "id", + "flagged_words_id": "id", + "fasttext_id": "id", + "sentencepiece_id": "id", + "kenlm_id": "id", + }, + { + "lang": "Kannada", + "dataset_id": "kn", + "stopwords_id": None, + "flagged_words_id": "kn", + "fasttext_id": "kn", + "sentencepiece_id": "kn", + "kenlm_id": "kn", + }, + { + "lang": "Malayalam", + "dataset_id": "ml", + "stopwords_id": None, + "flagged_words_id": "ml", + "fasttext_id": "ml", + "sentencepiece_id": "ml", + "kenlm_id": "ml", + }, + { + "lang": "Marathi", + "dataset_id": "mr", + "stopwords_id": "mr", + "flagged_words_id": "mr", + "fasttext_id": "mr", + "sentencepiece_id": "mr", + "kenlm_id": "mr", + }, + { + "lang": "Portuguese", + "dataset_id": "pt", + "stopwords_id": "pt", + "flagged_words_id": "pt", + "fasttext_id": "pt", + "sentencepiece_id": "pt", + "kenlm_id": "pt", + }, + { + "lang": "Swahili", + "dataset_id": "sw", + "stopwords_id": "sw", + "flagged_words_id": None, + "fasttext_id": "sw", + 
"sentencepiece_id": "sw", + "kenlm_id": "sw", + }, + { + "lang": "Tamil", + "dataset_id": "ta", + "stopwords_id": None, + "flagged_words_id": "ta", + "fasttext_id": "ta", + "sentencepiece_id": "ta", + "kenlm_id": "ta", + }, + { + "lang": "Telugu", + "dataset_id": "te", + "stopwords_id": None, + "flagged_words_id": "te", + "fasttext_id": "te", + "sentencepiece_id": "te", + "kenlm_id": "te", + }, + { + "lang": "Urdu", + "dataset_id": "ur", + "stopwords_id": "ur", + "flagged_words_id": None, + "fasttext_id": "ur", + "sentencepiece_id": "ur", + "kenlm_id": "ur", + }, + { + "lang": "Vietnamese", + "dataset_id": "vi", + "stopwords_id": "vi", + "flagged_words_id": "vi", + "fasttext_id": "vi", + "sentencepiece_id": "vi", + "kenlm_id": "vi", + }, + { + "lang": "Yoruba", + "dataset_id": "yo", + "stopwords_id": "yo", + "flagged_words_id": None, + "fasttext_id": "yo", + "sentencepiece_id": "yo", + "kenlm_id": "yo", + }, + { + "lang": "Chinese", + "dataset_id": "zh", + "stopwords_id": "zh", + "flagged_words_id": "zh", + "fasttext_id": "zh", + "sentencepiece_id": "zh", + "kenlm_id": "zh", + }, +] +langs_id = pd.DataFrame(langs_id) diff --git a/lid.176.bin b/lid.176.bin new file mode 100644 index 0000000000000000000000000000000000000000..f8707035ea3cc86ac248a4e31fa6368cd845476a --- /dev/null +++ b/lid.176.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e +size 131266198 diff --git a/normalization.py b/normalization.py new file mode 100644 index 0000000000000000000000000000000000000000..652e810fb5019c5177f6fd0abf9635f322f23927 --- /dev/null +++ b/normalization.py @@ -0,0 +1,52 @@ +import re +from typing import Dict + + +non_printing_characters_re = re.compile( + f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]" +) + +digits_re: re.Pattern = re.compile(r"\d") + +unicode_punctuation: Dict[str, str] = { + ",": ",", + "。": ".", + "、": ",", + "„": '"', + "”": '"', + "“": '"', + 
"«": '"', + "»": '"', + "1": '"', + "」": '"', + "「": '"', + "《": '"', + "》": '"', + "´": "'", + "∶": ":", + ":": ":", + "?": "?", + "!": "!", + "(": "(", + ")": ")", + ";": ";", + "–": "-", + "—": " - ", + ".": ". ", + "~": "~", + "’": "'", + "…": "...", + "━": "-", + "〈": "<", + "〉": ">", + "【": "[", + "】": "]", + "%": "%", + "►": "-", +} + +normalization = { + "non_printing_characters_re": non_printing_characters_re, + "digits_re": digits_re, + "unicode_punctuation": unicode_punctuation, +} diff --git a/parameters_filtering.py b/parameters_filtering.py new file mode 100644 index 0000000000000000000000000000000000000000..1a992a8d10512da8c416640d956deff47a8f2ce7 --- /dev/null +++ b/parameters_filtering.py @@ -0,0 +1,895 @@ +import string +import emoji + + +main_special_characters = string.punctuation + string.digits + string.whitespace +other_special_characters = ( + "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═" + "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖" + "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚" + "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖" + "」﴾》" +) +emoji = list(emoji.UNICODE_EMOJI["en"].keys()) + +special_characters_default = set(main_special_characters + other_special_characters) +special_characters_default.update(emoji) + + +parameters_filtering_default = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": False, + "length_word_max_cutoff": 50, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + 
"word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.4, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": False, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.70, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_af = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.6, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_ar = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", 
"www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.45, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 1000000, +} + +parameters_filtering_arz = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.5, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + 
"words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_as = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_bn = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + 
"number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.275, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.05, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 575000, +} + +parameters_filtering_ca = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.35, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": 
True, + "perplexity_max_cutoff": 1750000, +} + +parameters_filtering_en = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": True, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 20, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.4, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.3, + "cond_check_flagged_words": True, + "flagged_words_max_cutoff": 0.045, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.80, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 2500, +} + +parameters_filtering_es = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + 
"word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.2, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 2500000, +} + +parameters_filtering_eu = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 35, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_fr = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", 
"www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.35, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.15, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_gu = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + 
"words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 250000, +} + +parameters_filtering_hi = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.35, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 600000, +} + +parameters_filtering_id = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + 
"number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.25, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 2500000, +} + +parameters_filtering_kn = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 50, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": 
True, + "perplexity_max_cutoff": 400000, +} + +parameters_filtering_ml = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 50, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.2, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 1600000, +} + +parameters_filtering_mr = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + 
"word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 425000, +} + +parameters_filtering_pt = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.15, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_sw = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", 
"www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.275, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_ta = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 50, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + 
"words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_te = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 35, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_ur = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + 
"number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.4, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_vi = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.35, + "cond_words_augmentation": True, + "words_augmentation_group_sizes": [2], + "words_augmentation_join_char": " ", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, 
+ "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_yo = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_zh = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": False, + "length_word_max_cutoff": 1000, + "cond_check_number_words": True, + "tokenization": True, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + 
"word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.4, + "cond_words_augmentation": True, + "words_augmentation_group_sizes": [2], + "words_augmentation_join_char": "", + "cond_check_stopwords": False, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering = { + "default": parameters_filtering_default, + "af": parameters_filtering_af, + "ar": parameters_filtering_ar, + "arz": parameters_filtering_arz, + "as": parameters_filtering_as, + "bn": parameters_filtering_bn, + "ca": parameters_filtering_ca, + "en": parameters_filtering_en, + "es": parameters_filtering_es, + "eu": parameters_filtering_eu, + "fr": parameters_filtering_fr, + "gu": parameters_filtering_gu, + "hi": parameters_filtering_hi, + "id": parameters_filtering_id, + "kn": parameters_filtering_kn, + "ml": parameters_filtering_ml, + "mr": parameters_filtering_mr, + "pt": parameters_filtering_pt, + "sw": parameters_filtering_sw, + "ta": parameters_filtering_ta, + "te": parameters_filtering_te, + "ur": parameters_filtering_ur, + "vi": parameters_filtering_vi, + "yo": parameters_filtering_yo, + "zh": parameters_filtering_zh, +} diff --git a/pt.arpa.bin b/pt.arpa.bin new file mode 100644 index 0000000000000000000000000000000000000000..1ed3b02dab66efb87a4da18948d7ec7f4a5ffa90 --- /dev/null +++ b/pt.arpa.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad7241c4b11d902fa092506b731f61e5f67177897c2598b750d1a2e519be87ad +size 3220168756 diff --git a/pt.sp.model b/pt.sp.model new file mode 100644 index 0000000000000000000000000000000000000000..3c2ab113c5644ebf7b1d8d23790b90b16c964d75 --- /dev/null +++ b/pt.sp.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1707a7517b61ca9d4d333dabcc5ec7024e44c6466ff6faea9ccc95a0f1b2737c +size 958101 diff --git a/pt_examples_with_stats.json b/pt_examples_with_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..43237b7e36350526bad1aa383d7c875aae3f8af4 --- /dev/null +++ b/pt_examples_with_stats.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72a681cc82b2a0f9e11a8fa052143f7eaad5a67d31269bbd96653715e0ff776a +size 135498651 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e6b4f5f64a5d880bef902ec49cfb16703015608 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +fasttext +sentencepiece +https://github.com/kpu/kenlm/archive/master.zip +emoji \ No newline at end of file diff --git a/stopwords.py b/stopwords.py new file mode 100644 index 0000000000000000000000000000000000000000..9093aec7c814d1fe93d148382bd64a5bf9e9b882 --- /dev/null +++ b/stopwords.py @@ -0,0 +1,7445 @@ +# From https://github.com/6/stopwords-json +# From https://github.com/stopwords-iso/stopwords-iso for Urdu and Vietnamese + + +stopwords = { + "af": [ + "'n", + "aan", + "af", + "al", + "as", + "baie", + "by", + "daar", + "dag", + "dat", + "die", + "dit", + "een", + "ek", + "en", + "gaan", + "gesê", + "haar", + "het", + "hom", + "hulle", + "hy", + "in", + "is", + "jou", + "jy", + "kan", + "kom", + "ma", + "maar", + "met", + "my", + "na", + "nie", + "om", + "ons", + "op", + "saam", + "sal", + "se", + "sien", + "so", + "sy", + "te", + "toe", + "uit", + "van", + "vir", + "was", + "wat", + "ʼn", + ], + "ar": [ + "آخر", + "آنَا", + "أ", + "أثناء", + "أحد", + "أصبح", + "أصبحت", + "أغلب", + "أكثر", + "أكون", + "ألا", + "أم", + "أما", + "أمام", + "أن", + "أنا", + "أنت", + "أنتم", + "أنَا", + "أو", + "أولئك", + "أولٰئك", + "أي", + "أية", + "أين", + "أينما", + "أَ", + "أَثنَاءَ", + "أَلَّا", + "أَم", + "أَمَامَ", + "أَمَّا", + "أَن", + "أَنَّ", + "أَو", + "أَي", + "أَينَ", + "أَينَمَا", + "أَيّ", + 
"إبان", + "إثر", + "إحدى", + "إذ", + "إذا", + "إزا", + "إزاء", + "إل", + "إلا", + "إلى", + "إلي", + "إليها", + "إما", + "إن", + "إنما", + "إنّ", + "إيا", + "إِثرَ", + "إِذ", + "إِذًا", + "إِذَا", + "إِزَاءَ", + "إِلَى", + "إِلَّا", + "إِمَّا", + "إِن", + "إِنَّ", + "إِنَّمَا", + "إِيَّا", + "اثر", + "اثناء", + "اذ", + "اذا", + "ازا", + "ازاء", + "ال", + "الا", + "التى", + "التي", + "الذى", + "الذي", + "الذين", + "الغاية", + "الـ", + "الـــ", + "الفوق", + "اللاتى", + "اللاتي", + "اللتان", + "اللتين", + "اللذان", + "اللذين", + "اللواتي", + "اللي", + "الى", + "الي", + "ام", + "اما", + "امام", + "ان", + "انا", + "انتم", + "انما", + "او", + "اولئك", + "اى", + "اي", + "اين", + "اينما", + "اَل", + "اَلَّذِي", + "ب", + "بأنفسهم", + "بات", + "باتت", + "بس", + "بعد", + "بعدما", + "بعض", + "بعيد", + "بغزة", + "بـ", + "بل", + "بما", + "بهم", + "بيد", + "بين", + "بينما", + "بَس", + "بَعدَ", + "بَعدَمَا", + "بَل", + "بَيدَ", + "بَينَ", + "بَينَمَا", + "بُعَيدَ", + "بِ", + "تحت", + "تحـــت", + "تصبح", + "تعد", + "تكن", + "تكون", + "تكونون", + "تلك", + "تَحتَ", + "تُجَاهَ", + "ثم", + "ثُمَّ", + "جراء", + "جَرَّاء", + "حتى", + "حسب", + "حسبما", + "حوالى", + "حوالي", + "حول", + "حولي", + "حيال", + "حيث", + "حيثما", + "حين", + "حينما", + "حَتَّى", + "حَسَب", + "حَسَبَ", + "حَسَبَمَا", + "حَولَ", + "حَوَالَى", + "حَيثُ", + "حِينَ", + "حِينَمَا", + "حِيَالَ", + "خلال", + "خَلفَ", + "خِلَالَ", + "دون", + "دُونَ", + "ذا", + "ذاك", + "ذلك", + "ذو", + "ذي", + "ذَا", + "ذَاكَ", + "ذُو", + "ذٰلك", + "ذٰلِكَ", + "راح", + "ربما", + "ربمــا", + "رغم", + "ريثما", + "رَغمَ", + "رَيثَمَا", + "رُبَّمَا", + "س", + "سائر", + "سواء", + "سوف", + "سوى", + "سَ", + "سَوفَ", + "سِوَى", + "شبه", + "شو", + "صار", + "صوب", + "ضد", + "ضمن", + "ضِدَّ", + "ضِمنَ", + "طال", + "طالما", + "طالَما", + "طوال", + "طيلة", + "طَالَمَا", + "طِوَالَ", + "طِيلَةَ", + "عبر", + "عدا", + "عدة", + "عشان", + "عـــلى", + "عـــندما", + "عــلى", + "عقب", + "عل", + "علـى", + "على", + "علي", + "علّ", + "عم", + "عن", + "عنا", + 
"عند", + "عندما", + "عوض", + "عَاد", + "عَبرَ", + "عَدَا", + "عَشان", + "عَقِبَ", + "عَلَى", + "عَلَّ", + "عَم", + "عَن", + "عِندَ", + "عِندَمَا", + "عِوَضَ", + "غالبية", + "غدت", + "غير", + "غَيرَ", + "ف", + "فتئ", + "فـ", + "فـي", + "فور", + "فوق", + "فى", + "في", + "فيما", + "فَ", + "فَورَ", + "فَوقَ", + "فِي", + "فِيمَا", + "ق", + "قبالة", + "قبل", + "قبيل", + "قد", + "قرابة", + "قرب", + "قيد", + "قَبلَ", + "قَد", + "قَيدَ", + "قُبَالَةَ", + "قُبَيلَ", + "قُربَ", + "قُرَابَةَ", + "ك", + "كأن", + "كأنما", + "كامل", + "كان", + "كانت", + "كانوا", + "كذا", + "كـ", + "كل", + "كلا", + "كلتا", + "كلما", + "كلي", + "كم", + "كما", + "كن", + "كنا", + "كنت", + "كون", + "كى", + "كي", + "كيف", + "كَ", + "كَأَنَّ", + "كَأَنَّمَا", + "كَان", + "كَذَا", + "كَلَّا", + "كَم", + "كَمَا", + "كَي", + "كَيفَ", + "كُل", + "كُلَّمَا", + "كِلَا", + "ل", + "لأن", + "لا", + "لازم", + "لان", + "لدى", + "لدي", + "لذا", + "لذلك", + "لذٰلك", + "لسنا", + "لـ", + "لقد", + "لكن", + "لكى", + "لكي", + "لم", + "لما", + "لماذا", + "لن", + "لهم", + "لو", + "لولا", + "ليس", + "ليست", + "ليسوا", + "لَ", + "لَا", + "لَازِم", + "لَدَى", + "لَم", + "لَمَّا", + "لَن", + "لَو", + "لَولَا", + "لَيس", + "لُو", + "لِ", + "لِأَن", + "لِأَنَّ", + "لِئَلّا", + "لِذَا", + "لِذٰلِكَ", + "لِكَي", + "لِمَاذَا", + "لٰكن", + "لٰكِن", + "لٰكِنَّ", + "م", + "ما", + "ماذا", + "مالم", + "ماهو", + "ماهُوَ", + "متى", + "مثـــل", + "مثل", + "مثلما", + "مش", + "مع", + "معظم", + "مــن", + "مـن", + "مقابل", + "مما", + "ممكن", + "من", + "منتصف", + "منذ", + "مهما", + "مين", + "مَا", + "مَاذَا", + "مَالَم", + "مَتَى", + "مَعَ", + "مَن", + "مَهمَا", + "مُقَابِلَ", + "مُمكِن", + "مُنذُ", + "مِثلَ", + "مِثلَمَا", + "مِمَّا", + "مِن", + "نا", + "ناهيك", + "نحسب", + "نحن", + "نحو", + "نصف", + "نعم", + "نكون", + "ني", + "نَاهِيك", + "نَحوَ", + "نَعَم", + "ه", + "هؤلاء", + "ها", + "هاتان", + "هاتين", + "هاد", + "هاي", + "هذا", + "هذان", + "هذــه", + "هذه", + "هذين", + "هـــذه", + "هــــذه", + "هكذا", + "هل", + "هم", + "هما", + "هن", + 
"هو", + "هى", + "هي", + "هَا", + "هَل", + "هُ", + "هُو", + "هُوَ", + "هِ", + "هٰؤلاء", + "هٰذا", + "هٰذان", + "هٰذه", + "هٰذَا", + "هٰكذا", + "هٰكَذَا", + "و", + "وأسلم", + "وراء", + "وسامراء", + "وسط", + "وســـط", + "وغربه", + "وفق", + "وقتما", + "وقف", + "ولا", + "ولَا", + "وهي", + "وَ", + "وَرَاءَ", + "وَسطَ", + "وِفقَ", + "وِلّا", + "ي", + "يا", + "يجعل", + "يزال", + "يصبح", + "يكن", + "يكون", + "يكونا", + "يَا", + "ِي", + ], + "bn": [ + "অনেক", + "অন্য", + "অবশ্য", + "আগে", + "আছে", + "আজ", + "আবার", + "আমরা", + "আমাদের", + "আর", + "ই", + "উত্তর", + "উপর", + "উপরে", + "এ", + "এই", + "এক্", + "এখন", + "এত", + "এব", + "এমন", + "এমনি", + "এর", + "এস", + "এসে", + "ও", + "ওই", + "কমনে", + "করা", + "করে", + "কাছে", + "কাজ", + "কাজে", + "কারণ", + "কি", + "কিছু", + "কে", + "কেউ", + "কেখা", + "কেন", + "কোটি", + "কোনো", + "কয়েক", + "খুব", + "গিয়ে", + "গেল", + "চার", + "চালু", + "চেষ্টা", + "ছিল", + "জানা", + "জ্নজন", + "টি", + "তখন", + "তবে", + "তা", + "তাই", + "তো", + "থাকা", + "থেকে", + "দিন", + "দু", + "দুই", + "দেওয়া", + "ধামার", + "নতুন", + "না", + "নাগাদ", + "নিয়ে", + "নেওয়া", + "নয়", + "পর", + "পরে", + "পাচ", + "পি", + "পেয়্র্", + "প্রতি", + "প্রথম", + "প্রযন্ত", + "প্রাথমিক", + "প্রায়", + "বক্তব্য", + "বন", + "বলা", + "বলে", + "বলেন", + "বহু", + "বা", + "বি", + "বিভিন্ন", + "বেশ", + "বেশি", + "মতো", + "মধ্যে", + "মনে", + "যখন", + "যদি", + "যা", + "যাওয়া", + "যে", + "র", + "রকম", + "লক্ষ", + "শুধু", + "শুরু", + "সঙ্গে", + "সব", + "সহ", + "সাধারণ", + "সামনে", + "সি", + "সে", + "সেই", + "হতে", + "হাজার", + "হয়", + ], + "ca": [ + "-ho", + "-la", + "-lo", + "-ne", + "-se", + "a", + "abans", + "això", + "al", + "algun", + "alguna", + "algunes", + "alguns", + "algú", + "allò", + "als", + "altra", + "altre", + "altres", + "amb", + "aqueix", + "aqueixa", + "aqueixes", + "aqueixos", + "aquell", + "aquella", + "aquelles", + "aquells", + "aquest", + "aquesta", + "aquestes", + "aquestos", + "aquests", + "bastant", + "bastants", + "bé", + "cada", + "cadascun", + 
"cadascuna", + "cadascú", + "cap", + "cert", + "certa", + "certes", + "certs", + "com", + "con", + "contra", + "d", + "d'", + "da", + "damunt", + "darrere", + "davant", + "de", + "del", + "dels", + "des", + "dient", + "diferent", + "diferents", + "dins", + "dintre", + "dir", + "divers", + "diverses", + "diversos", + "durant", + "eixa", + "eixe", + "eixes", + "eixos", + "el", + "ell", + "ella", + "elles", + "ells", + "els", + "em", + "emperò", + "en", + "endavant", + "enfront", + "ens", + "entre", + "envers", + "era", + "eren", + "es", + "estan", + "estant", + "estar", + "estaran", + "estarem", + "estaria", + "estarien", + "estarà", + "estat", + "estava", + "estaven", + "este", + "estem", + "estes", + "esteu", + "estic", + "estiguem", + "estiguessin", + "estigui", + "estiguin", + "estigués", + "estos", + "està", + "et", + "ets", + "excepte", + "extra", + "fa", + "faci", + "facin", + "facis", + "faig", + "fan", + "faran", + "farem", + "fareu", + "faria", + "farien", + "faries", + "faràs", + "faràs", + "faré", + "faríem", + "faríeu", + "fas", + "feia", + "feien", + "feies", + "fem", + "fent", + "fer", + "fes", + "fessin", + "fessis", + "fet", + "feu", + "fins", + "foren", + "fos", + "fossin", + "fou", + "front", + "fèiem", + "fèieu", + "féssiu", + "gaire", + "gaires", + "gràcies", + "ha", + "hagi", + "hagin", + "haguem", + "haguessin", + "haguessis", + "hagut", + "hagués", + "haguéssim", + "haguéssin", + "haguéssiu", + "han", + "has", + "hauran", + "haurem", + "haureu", + "hauria", + "haurien", + "hauries", + "haurà", + "hauràs", + "hauré", + "hauríem", + "hauríeu", + "havent", + "haver", + "havia", + "havien", + "havies", + "havíem", + "havíeu", + "he", + "hem", + "heu", + "hi", + "ho", + "hom", + "hàgim", + "i", + "in", + "jo", + "l", + "l", + "l'", + "la", + "las", + "les", + "li", + "llur", + "llurs", + "lo", + "los", + "ls", + "m", + "m", + "m'", + "malgrat", + "mancant", + "massa", + "mateix", + "mateixa", + "mateixes", + "mateixos", + "me", + "mentre", + 
"menys", + "mes", + "meu", + "meus", + "meva", + "meves", + "mi", + "mitjançant", + "molt", + "molta", + "moltes", + "molts", + "moltíssim", + "moltíssima", + "moltíssimes", + "moltíssims", + "n", + "n'", + "ne", + "ni", + "ningun", + "ninguna", + "ningunes", + "ninguns", + "ningú", + "no", + "nombroses", + "nombrós", + "nos", + "nosaltres", + "nostra", + "nostre", + "nostres", + "ns", + "o", + "on", + "os", + "pel", + "pels", + "per", + "perqu", + "perquè", + "però", + "poc", + "poca", + "pocs", + "poques", + "prou", + "qual", + "quals", + "qualsevol", + "quan", + "quant", + "quantes", + "quants", + "que", + "quelcom", + "qui", + "quin", + "quina", + "quines", + "quins", + "què", + "rere", + "respecte", + "s", + "s", + "s'", + "sa", + "sabent", + "salvant", + "se", + "segons", + "sens", + "sense", + "sent", + "ser", + "seran", + "serem", + "seria", + "serien", + "serà", + "seré", + "seríem", + "ses", + "seu", + "seus", + "seva", + "seves", + "si", + "siguem", + "sigui", + "siguin", + "sigut", + "sinó", + "sobre", + "som", + "sota", + "su", + "suficient", + "séssim", + "sóc", + "són", + "t", + "t'", + "tal", + "tals", + "tant", + "tanta", + "tantes", + "tants", + "te", + "tenc", + "tendran", + "tendrem", + "tendreu", + "tendria", + "tendrien", + "tendries", + "tendràs", + "tendràs", + "tendré", + "tendríem", + "tendríeu", + "tenen", + "tenia", + "tenien", + "tenies teníem", + "tenim", + "tenir", + "teniu", + "tens", + "teníeu", + "teu", + "teus", + "teva", + "ti", + "tinc", + "tindran", + "tindre", + "tindrem", + "tindreu", + "tindria", + "tindrien", + "tindries", + "tindràs", + "tindràs", + "tindré", + "tindríem", + "tindríeu", + "tingut", + "tot", + "tota", + "total", + "totes", + "tothom", + "tots", + "tu", + "té", + "u", + "ultra", + "un", + "una", + "unes", + "uns", + "us", + "va", + "vagi", + "vagin", + "vaig", + "vam", + "van", + "varen", + "vau", + "vers", + "versus", + "via", + "vora", + "vos", + "vosaltres", + "vostre", + "vostè", + "vostès", + "vàrem", + 
"y", + "érem", + "és", + ], + "en": [ + "a", + "a.k.a", + "aboard", + "about", + "above", + "abt", + "accord", + "according", + "across", + "after", + "against", + "ago", + "aground", + "ahead", + "aka", + "ala", + "albeit", + "all", + "along", + "alongside", + "although", + "am", + "amid", + "amidst", + "among", + "amongst", + "amoung", + "an", + "and", + "and/or", + "another", + "any", + "any1", + "anybody", + "anyone", + "anything", + "are", + "around", + "as", + "aside", + "astride", + "at", + "atop", + "away", + "b", + "b/c", + "b/t", + "back", + "base", + "based", + "bc", + "be", + "because", + "been", + "before", + "behind", + "being", + "below", + "beneath", + "beside", + "besides", + "between", + "beyond", + "board", + "both", + "btwn", + "but", + "by", + "can", + "cause", + "circa", + "cos", + "could", + "coz", + "cus", + "depend", + "depending", + "despite", + "did", + "do", + "does", + "down", + "due", + "during", + "each", + "either", + "else", + "even", + "ever", + "every", + "everybody", + "everyone", + "everything", + "except", + "for", + "forth", + "from", + "get", + "gets", + "getting", + "give", + "given", + "got", + "had", + "half", + "has", + "hav", + "have", + "having", + "he", + "her", + "hers", + "herself", + "him", + "himself", + "his", + "how", + "however", + "i", + "i'd", + "if", + "in", + "include", + "including", + "inside", + "instead", + "into", + "is", + "it", + "it's", + "its", + "itself", + "lest", + "like", + "made", + "many", + "may", + "me", + "might", + "mine", + "minus", + "most", + "much", + "must", + "my", + "myself", + "nary", + "near", + "nearby", + "neither", + "next", + "nigh", + "no", + "nobody", + "none", + "noone", + "nor", + "not", + "nothing", + "notwithstanding", + "of", + "off", + "on", + "onboard", + "once", + "one", + "ones", + "oneself", + "only", + "onto", + "opposite", + "or", + "other", + "others", + "ought", + "our", + "ours", + "ourselves", + "out", + "outside", + "over", + "overt", + "own", + "past", + 
"per", + "plus", + "prior", + "quite", + "rather", + "re", + "regard", + "regarding", + "regardless", + "round", + "s/he", + "save", + "self", + "shall", + "she", + "should", + "side", + "since", + "so", + "some", + "somebody", + "someone", + "something", + "such", + "sure", + "teh", + "than", + "thanks", + "that", + "the", + "their", + "theirs", + "them", + "themselves", + "then", + "there", + "these", + "they", + "they're", + "thier", + "this", + "tho", + "those", + "thou", + "though", + "through", + "throughout", + "thru", + "thy", + "til", + "till", + "to", + "together", + "too", + "toward", + "towards", + "u", + "under", + "underneath", + "unless", + "unlike", + "until", + "unto", + "up", + "upon", + "ur", + "us", + "use", + "versus", + "via", + "vs", + "vs.", + "w/", + "w/o", + "w/out", + "was", + "we", + "were", + "what", + "whatever", + "whatnot", + "when", + "whenever", + "where", + "whereas", + "wherever", + "whether", + "which", + "while", + "whilst", + "whither", + "who", + "who's", + "whoever", + "whom", + "whomever", + "whose", + "why", + "will", + "with", + "within", + "without", + "wo", + "worth", + "would", + "wud", + "y'all", + "ya", + "yet", + "yo", + "you", + "you're", + "your", + "youre", + "yours", + "yourself", + "yourselves", + ], + "es": [ + "a", + "a fin de que", + "a medida que", + "a menos que", + "a modo de", + "a no ser que", + "a poco que", + "a que", + "abandono", + "acerca", + "acostumbra", + "adónde", + "ahora", + "al igual que", + "al lado de", + "algo", + "alguien", + "alguna", + "algunas", + "alguno", + "algunos", + "algún", + "alrededor", + "ambas", + "ambos", + "ante", + "aparece", + "aparecen", + "apareció", + "aparte", + "apenas", + "aquel", + "aquella", + "aquellas", + "aquello", + "aquellos", + "aquesa", + "aquesas", + "aquesos", + "aquesta", + "aquestas", + "aquesto", + "aquestos", + "aquél", + "aquélla", + "aquéllas", + "aquéllos", + "arrepentir", + "arrepentiréis", + "así", + "así como", + "así que", + "atlético", + 
"aun", + "aunque", + "aún", + "bajo", + "bastante", + "bastantes", + "bien", + "cada", + "casi", + "cerca", + "chance", + "cierta", + "ciertas", + "cierto", + "ciertos", + "comenzado", + "comenzó", + "comienzan", + "como", + "como quiera que", + "como si", + "con", + "con tal de", + "con tal que", + "conforme", + "conmigo", + "conque", + "considera", + "consideradas", + "consideran", + "consideró", + "consigo", + "contendrán", + "contigo", + "continuaba", + "continuar", + "continuaron", + "continuase", + "continuó", + "continúa", + "contra", + "corresponden", + "corresponder", + "cual", + "cual si", + "cuales", + "cualesquier", + "cualesquiera", + "cualquier", + "cualquiera", + "cuan", + "cuando", + "cuanta", + "cuantas", + "cuanto", + "cuanto quiera que", + "cuantos", + "cuya", + "cuyas", + "cuyo", + "cuyos", + "cuàles", + "cuál", + "cuáles", + "cuán", + "cuándo", + "cuánta", + "cuántas", + "cuánto", + "cuántos", + "cómo", + "da", + "dado que", + "dar", + "de", + "de manera que", + "de modo que", + "deba", + "debajo", + "deban", + "debas", + "debe", + "debemos", + "deben", + "deber", + "deberá", + "deberán", + "debería", + "deberíamos", + "deberían", + "debes", + "debido", + "debiera", + "debieron", + "debimos", + "debió", + "debo", + "debía", + "debíamos", + "debían", + "declaraba", + "declarada", + "declarado", + "declarase", + "declaro", + "declaró", + "dejaban", + "dejado", + "dejan", + "dejará", + "del", + "delante", + "demasiada", + "demasiadas", + "demasiado", + "demasiados", + "demás", + "den", + "dentro", + "dentro_de", + "des", + "desde", + "después", + "detrás", + "di", + "dicha", + "dichas", + "dicho", + "dichos", + "diferente", + "diferentes", + "distintas", + "distinto", + "distintos", + "diversas", + "diverso", + "diversos", + "don", + "donde", + "dos", + "durante", + "dónde", + "echar", + "el", + "el que", + "ella", + "ellas", + "ello", + "ellos", + "en", + "en cambio", + "en caso de", + "en la medida en que", + "en tanto que", + "encima", + 
"enfrente", + "entonces", + "entre", + "era", + "eramos", + "eran", + "eras", + "eres", + "ergo", + "es", + "esa", + "esas", + "escasa", + "escasas", + "escaso", + "escasos", + "escrito", + "ese", + "eso", + "eso que", + "esos", + "esotra", + "esotro", + "esta", + "estaba", + "estabais", + "estabamos", + "estaban", + "estabas", + "estado", + "estamos", + "estan", + "estando", + "estar", + "estaremos", + "estará", + "estarán", + "estaré", + "estaría", + "estaríamos", + "estarían", + "estarías", + "estas", + "este", + "estemos", + "esto", + "estos", + "estotra", + "estotro", + "estoy", + "estuve", + "estuviera", + "estuvieran", + "estuvieron", + "estuviese", + "estuviesen", + "estuvimos", + "estuvo", + "está", + "estábamos", + "estáis", + "están", + "estás", + "esté", + "estén", + "ex", + "excepto", + "frente", + "fue", + "fuera", + "fueran", + "fuere", + "fueron", + "fuese", + "fuesen", + "fui", + "fuimos", + "gracias", + "gracias_a", + "habeis", + "haber", + "haberle", + "haberse", + "habido", + "habiendo", + "habiéndo", + "habremos", + "habrá", + "habrán", + "habrás", + "habré", + "habría", + "habríamos", + "habrían", + "habéis", + "había", + "habíamos", + "habían", + "habías", + "hace", + "hacer", + "hacia", + "hacía", + "halla", + "han", + "has", + "hasta", + "hasta que", + "hay", + "haya", + "hayamos", + "hayan", + "hayas", + "he", + "hecho", + "hemos", + "hola", + "hubiera", + "hubieran", + "hubieron", + "hubiese", + "hubiesen", + "hubiéramos", + "hubo", + "iba", + "iban", + "ido", + "incluso", + "ir", + "irá", + "irán", + "iré", + "iría", + "junto a", + "la", + "las", + "le", + "lejos", + "les", + "lo", + "los", + "luego", + "mal que", + "mas", + "me", + "mediante", + "menos", + "mes", + "mi", + "mientras", + "mientras que", + "mis", + "misma", + "mismas", + "mismo", + "mismos", + "mismísimo", + "morir", + "moriría", + "mostrado", + "mostraron", + "mucha", + "muchas", + "muchisimas", + "muchisimio", + "muchisimo", + "mucho", + "muchos", + "muchísima", + 
"muchísimas", + "muchísimo", + "muchísimos", + "más", + "más bien", + "mí", + "mía", + "mías", + "mío", + "míos", + "nada", + "nadie", + "negar", + "ni", + "ni que", + "ningun", + "ninguna", + "ningunas", + "ninguno", + "ningunos", + "ningún", + "no", + "no obstante", + "noche", + "nombrado", + "nombró", + "nos", + "nosotros", + "nuestra", + "nuestras", + "nuestro", + "nuestros", + "o", + "os", + "otra", + "otras", + "otro", + "otros", + "pa", + "para", + "para que", + "parezca", + "partir", + "pasar", + "pero", + "po", + "poca", + "pocas", + "poco", + "pocos", + "podamos", + "podeis", + "podemos", + "poder", + "podes", + "podido", + "podras", + "podre", + "podremos", + "podriaís", + "podrá", + "podrán", + "podrás", + "podré", + "podréis", + "podría", + "podríamos", + "podrían", + "podéis", + "podía", + "podíamos", + "podían", + "poner", + "poquito", + "por", + "por el contrario", + "por ende", + "por eso", + "por lo que", + "por mucho que", + "por más que", + "por no hablar de", + "por si", + "porque", + "pos", + "post", + "pre", + "pro", + "propia", + "propias", + "propio", + "propios", + "pude", + "pudiendo", + "pudiera", + "pudieran", + "pudieras", + "pudieron", + "pudiese", + "pudiesen", + "pudimos", + "pudo", + "pueda", + "puedan", + "puedas", + "puede", + "pueden", + "puedes", + "puedo", + "pues", + "puesto", + "puesto que", + "que", + "queda", + "quedaba", + "quedan", + "quedó", + "queremos", + "querer", + "queriendo", + "quien", + "quienes", + "quienesquiera", + "quienquier", + "quienquiera", + "quiera", + "quiere", + "quisiera", + "quién", + "quiénes", + "qué", + "re", + "resulta", + "resultado", + "resultaría", + "resulte", + "sabe", + "saber", + "sabiendo", + "salen", + "salir", + "salió", + "salvo", + "se", + "sea", + "seamos", + "sean", + "seas", + "seguir", + "seguirá", + "seguía", + "según", + "semejante", + "semejantes", + "semi", + "sendas", + "sendo", + "sendos", + "ser", + "será", + "serán", + "serás", + "seré", + "seréis", + "sería", + 
"serían", + "serías", + "si", + "si bien", + "si y solo si", + "sido", + "siempre que", + "siendo", + "siente", + "siento", + "siga", + "sigamos", + "sigue", + "sin", + "sino", + "siquiera", + "sobre", + "sobrer", + "sobrir", + "soler", + "solían", + "somos", + "son", + "soy", + "sub", + "suele", + "suelen", + "suelo", + "super", + "supo", + "sur", + "sus", + "suya", + "suyas", + "suyo", + "suyos", + "sé", + "sí", + "tal", + "tales", + "tanta", + "tantas", + "tanto", + "tantos", + "tantísima", + "tantísimas", + "tantísimos", + "te", + "tendremos", + "tendrian", + "tendrá", + "tendrán", + "tendría", + "tendrían", + "tenemos", + "tener", + "tenga", + "tengan", + "tengo", + "tenia", + "tenido", + "teniendo", + "tenéis", + "tenía", + "teníamos", + "tenían", + "terminas", + "ti", + "tiene", + "tienen", + "tienes", + "toda", + "todas", + "todavía", + "todes", + "todo", + "todos", + "trabajado", + "trans", + "tras", + "tu", + "tus", + "tuve", + "tuviera", + "tuvieron", + "tuviese", + "tuvo", + "tuya", + "tuyas", + "tuyo", + "tuyos", + "tú", + "u", + "un", + "una", + "unas", + "une", + "unir", + "uno", + "unos", + "usted", + "ustedes", + "va", + "vamos", + "van", + "varias", + "varios", + "varía", + "vas", + "vaya", + "vayan", + "venir", + "venía", + "ver", + "vice", + "vieron", + "vino", + "vis a vis", + "visto que", + "volver", + "volverá", + "volveríamos", + "volvió", + "vos", + "vosotras", + "vosotros", + "voy", + "vuelva", + "vuelvan", + "vuelve", + "vuelven", + "vuestra", + "vuestras", + "vuestro", + "vuestros", + "vía", + "y", + "ya", + "ya que", + "yo", + "ámbos", + "él", + "éramos", + "ésa", + "ésas", + "ése", + "ésos", + "ésta", + "éstas", + "éste", + "ésto", + "éstos", + "íbamos", + "ó", + "ú", + "última", + "últimas", + "último", + "últimos", + "\ufeffdesde", + "\ufeffel", + "\ufeffen", + "\ufeffla", + "\ufefflas", + ], + "eu": [ + "*edin", + "*edun", + "*ezan", + "aitzitik", + "ala", + "alabaina", + "aldiz", + "alegia", + "alta", + "anitz", + "anitzek", + 
"anitzeko", + "anitzez", + "antzera", + "arabera", + "ari", + "ari_izan", + "ariko", + "arren", + "asko", + "askoan", + "askok", + "askoko", + "askorekin", + "askoren", + "askorengan", + "askorentzat", + "askori", + "askorik", + "askotako", + "askotan", + "askotariko", + "askotatik", + "askotaz", + "askotxo", + "askoz", + "at", + "aunitz", + "aurka", + "aurkako", + "aurretik", + "azpian", + "azpitik", + "ba", + "bada", + "badago", + "badezake", + "badidazu", + "badiezu", + "badio", + "badiogu", + "badiote", + "badiougu", + "badiozu", + "badira", + "badirela", + "baditu", + "baditugu", + "badituzte", + "badituzu", + "badu", + "badugu", + "badugun", + "badut", + "badute", + "baduzu", + "bagara", + "bagatzaizkio", + "bagenu", + "baginen", + "bai", + "baietz", + "baikaituzte", + "bailegoen", + "bailituen", + "bailitzake", + "bailitzateke", + "baina", + "bainan", + "bainintzen", + "bainizkion", + "baino", + "baita", + "baitabil", + "baitaiteke", + "baitan", + "baitaude", + "baitiete", + "baitigu", + "baitio", + "baitiote", + "baitira", + "baititu", + "baititugu", + "baitituzte", + "baitituzu", + "baititzaket", + "baitizkio", + "baitu", + "baitugu", + "baitute", + "baituzu", + "baitzaio", + "baitzaizkio", + "baitzara", + "baitzegoen", + "baitzen", + "baitzeuden", + "baitzien", + "baitzion", + "baitzioten", + "baitziren", + "baitzitekeen", + "baitzituen", + "baitzitzaion", + "baitzuen", + "baitzuten", + "baizik", + "baizituen", + "baldin", + "balego", + "balira", + "baliteke", + "balitu", + "balituzkete", + "balitz", + "balitzait", + "balu", + "balute", + "banintz", + "banitu", + "banu", + "barik", + "barru", + "bat", + "batera", + "batera\x97", + "batere", + "batzu", + "batzuei", + "batzuek", + "batzuekin", + "batzuen", + "batzuengatik", + "batzuentzat", + "batzuetako", + "batzuetakoak", + "batzuetan", + "batzuetara", + "batzuetatik", + "batzuez", + "batzuk", + "batzutako", + "batzutan", + "bazaigu", + "bazaizu", + "bazara", + "bazen", + "bazina", + "baziren", + 
"bazituen", + "bazituzten", + "bazuen", + "bazuten", + "bederen", + "behintzat", + "bera", + "beragatik", + "beraiei", + "beraiek", + "beraiekin", + "beraien", + "beraietaz", + "berak", + "berarekin", + "beraren", + "berarengan", + "berarengana", + "berarengandik", + "berarengatik", + "berarentzat", + "berari", + "berauek", + "berauen", + "berauetan", + "beraz", + "berbera", + "berberagatik", + "berberak", + "berberarekin", + "berberaren", + "berberera", + "bere", + "berea", + "bereak", + "berean", + "berek", + "bereko", + "berekoa", + "berekoak", + "beren", + "beretan", + "beretik", + "beretzat", + "berriz", + "bertze", + "bertzeekin", + "bertzela", + "bestalde", + "bestaldean", + "beste", + "bestea", + "besteak", + "bestean", + "bestearekiko", + "bestearekin", + "bestearen", + "bestearengandik", + "besteari", + "besteaz", + "besteei", + "besteen", + "besteengandik", + "besteetan", + "besteko", + "bestekoa", + "bestela", + "bestera", + "besterantz", + "besterik", + "bestetan", + "bestetik", + "bezala", + "bezalako", + "bezalakoa", + "bezalakoen", + "bidez", + "bitartean", + "bitarteko", + "bitarterako", + "bitartez", + "da", + "dabil", + "dabiltza", + "dadila", + "dadin", + "dago", + "dagoela", + "dagoelako", + "dagoen", + "dagoena", + "dagoenaren", + "dagoenean", + "dagoenez", + "daiteekenaren", + "daiteke", + "daitekeela", + "daitekeen", + "daitekeena", + "daitekeenaren", + "daitekeenez", + "daiteken", + "daitezela", + "daitezen", + "daitezke", + "daitezkeelako", + "daitezkeelarik", + "daitezkeen", + "daitezkeenak", + "daitezkela", + "dakizuke", + "danok", + "daude", + "daudela", + "daudelako", + "dauden", + "daudenak", + "daudenek", + "daudenen", + "daudenik", + "dautzuet", + "dela", + "delako", + "delarik", + "den", + "dena", + "denak", + "denaren", + "denarentzat", + "denari", + "denean", + "denek", + "denen", + "denera", + "denerako", + "denetan", + "denetarik", + "denetik", + "denez", + "denik", + "denok", + "denon", + "denona", + "denontzat", + "deus", + 
"dexente", + "dezadan", + "dezagun", + "dezake", + "dezakedala", + "dezakedan", + "dezakedanean", + "dezakeela", + "dezakeen", + "dezakeena", + "dezakegu", + "dezakegula", + "dezakegun", + "dezakela", + "dezakelako", + "dezaket", + "dezakete", + "dezaketela", + "dezaketen", + "dezakezu", + "dezakezuen", + "dezakezuenez", + "dezakezunez", + "dezala", + "dezan", + "dezaten", + "dezente", + "dezenterekin", + "dezentetan", + "diat", + "didala", + "didana", + "didate", + "didazue", + "die", + "diegu", + "diegun", + "diela", + "dien", + "dienak", + "diet", + "diete", + "dietela", + "dietelako", + "dietenean", + "diezaiekete", + "diezaiokeena", + "diezaiokete", + "diezaiola", + "diezaioten", + "diezaizkioke", + "diezazkioke", + "diezazkiokeen", + "digu", + "digun", + "digute", + "digutela", + "diguten", + "digutenean", + "diguzu", + "dik", + "din", + "dinat", + "dio", + "diogu", + "diogulako", + "diogun", + "diola", + "dion", + "diona", + "dionean", + "dionez", + "diot", + "diote", + "diotela", + "dioten", + "diotena", + "diotenak", + "diotenek", + "diozu", + "dira", + "direla", + "direlako", + "direlakoan", + "direlakotz", + "diren", + "direnak", + "direnean", + "direnek", + "direnen", + "direnetan", + "direnez", + "direnik", + "dit", + "ditake", + "ditazke", + "ditin", + "ditu", + "ditudala", + "ditudalako", + "ditudan", + "ditudanean", + "dituela", + "dituelako", + "dituelarik", + "dituen", + "dituena", + "dituenak", + "dituenean", + "ditugu", + "ditugula", + "ditugun", + "ditugunez", + "ditun", + "ditut", + "dituzte", + "dituztela", + "dituztelako", + "dituzten", + "dituztenak", + "dituztenean", + "dituztenek", + "dituztenekin", + "dituztenen", + "dituzu", + "dituzue", + "dituzuen", + "dituzula", + "dituzun", + "dituzunik", + "ditzagun", + "ditzake", + "ditzakeen", + "ditzakegu", + "ditzakegula", + "ditzakete", + "ditzaketela", + "ditzaketelako", + "ditzaketen", + "ditzakezu", + "ditzan", + "dizkidazu", + "dizkie", + "dizkien", + "dizkiet", + "dizkiete", + "dizkigu", 
+ "dizkigula", + "dizkigunak", + "dizkigute", + "dizkio", + "dizkiola", + "dizkion", + "dizkiot", + "dizkiotela", + "dizkit", + "dizkizuet", + "dizkizugu", + "dizu", + "dizuet", + "dizugu", + "dizut", + "dizute", + "du", + "duan", + "dudala", + "dudalarik", + "dudan", + "dudanak", + "dudanarekin", + "dudanean", + "dudanik", + "duela", + "duelako", + "duelakoan", + "duen", + "duena", + "duenak", + "duenaren", + "duenarentzat", + "duenari", + "duenean", + "duenentz", + "duenez", + "duenik", + "dugu", + "dugula", + "dugulako", + "dugun", + "duguna", + "dugunari", + "dugunean", + "dugunez", + "dugunik", + "duk", + "dun", + "dunala", + "dut", + "dute", + "dutela", + "dutelako", + "dutelakoan", + "duten", + "dutena", + "dutenagatik", + "dutenak", + "dutenaren", + "dutenean", + "dutenek", + "duteneko", + "dutenen", + "dutenena", + "dutenenetatik", + "dutenentz", + "dutenetakoa", + "dutenetik", + "dutenez", + "duzu", + "duzue", + "duzuela", + "duzuen", + "duzuenean", + "duzuenez", + "duzula", + "duzun", + "duzunarekin", + "ea", + "edo", + "edonor", + "edota", + "edozein", + "edozeinek", + "edozer", + "edozertarako", + "elgarrekin", + "elgarri", + "elkar", + "elkarrekiko", + "elkarrekin", + "elkarren", + "elkarri", + "ene", + "era", + "ere", + "esker", + "eta", + "eurak", + "eurei", + "eurek", + "eurekin", + "euren", + "eurentzat", + "ez", + "ezan", + "ezazu", + "ezazue", + "ezean", + "ezein", + "ezen", + "ezer", + "ezerekin", + "ezerk", + "ezertarako", + "ezertaz", + "ezertxo", + "ezetz", + "ezik", + "ezta", + "gabe", + "gabeko", + "gainera", + "gainerakoan", + "gainerat", + "gainera\x97", + "gainetik", + "gaitezen", + "gaitezke", + "gaitezkeela", + "gaitu", + "gaituela", + "gaituzte", + "gaituztenak", + "gara", + "garela", + "garelako", + "garen", + "garenez", + "garenok", + "gaude", + "gaudenak", + "gehiago", + "gehiagoan", + "gehiagok", + "gehiagoko", + "gehiagorekin", + "gehiegi", + "gehiegirik", + "gehiegitxo", + "gehien", + "gehiena", + "gehienak", + "gehienek", + 
"gehienekin", + "gehienentzako", + "gehienentzat", + "gehienetako", + "gehienetan", + "gehienok", + "gehientsu", + "gehientsuen", + "gehitxo", + "gehixeago", + "genbiltzan", + "genezake", + "genien", + "genion", + "genituela", + "genituelako", + "genituen", + "genituzke", + "genituzkeelako", + "genizkion", + "genizuen", + "genizun", + "genuela", + "genuelako", + "genuen", + "genuenean", + "genuenetik", + "genuenez", + "genuke", + "genukeen", + "geratu", + "geratzen", + "geroztik", + "geu", + "geure", + "geuregan", + "geuri", + "ginela", + "ginen", + "ginenean", + "ginenekoa", + "gintezkeela", + "gintuen", + "gintuenagatik", + "gintunan", + "gintuzten", + "gintzaizkion", + "gu", + "guk", + "gure", + "gurean", + "gurekin", + "guretzat", + "guri", + "gutako", + "gutaz", + "guti", + "gutiz", + "gutiz-gehien", + "gutiz-gehienek", + "gutxi", + "gutxiago", + "gutxiagorako", + "gutxiagorekin", + "gutxian", + "gutxien", + "gutxienez", + "gutxik", + "gutxiko", + "gutxira", + "gutxiren", + "gutxitan", + "guzi", + "guziak", + "guziarekin", + "guziekin", + "guzientzat", + "guzti", + "guztia", + "guztiagatik", + "guztiak", + "guztian", + "guztiarekin", + "guztiaren", + "guztiari", + "guztiaz", + "guztiei", + "guztiek", + "guztien", + "guztiengan", + "guztientzako", + "guztientzat", + "guztietako", + "guztietan", + "guztietara", + "guztietatik", + "guztiez", + "guztioi", + "guztiok", + "guztion", + "guztionak", + "guztionen", + "guztiontzat", + "guztira", + "guztitako", + "haatik", + "haiek", + "haiekin", + "haien", + "haiengan", + "haiengandik", + "haietako", + "haietan", + "haietatik", + "hainbat", + "hainbatek", + "hainbaten", + "hainbatez", + "hainbertze", + "hainbeste", + "hainbesterako", + "haiteke", + "haiz", + "halaber", + "halere", + "harekin", + "haren", + "harena", + "harentzat", + "hargatik", + "hari", + "hark", + "hartako", + "hartan", + "hartara", + "hartarako", + "hartatik", + "hau", + "haudala", + "hauei", + "hauek", + "hauekin", + "hauen", + "hauetako", + 
"hauetan", + "hauetara", + "hauetarako", + "hauetarik", + "hauetatik", + "hauexek", + "hauez", + "hauxe", + "heu", + "heure", + "hhriek", + "hi", + "hik", + "hinduan", + "hintzen", + "hire", + "hiri", + "honegatik", + "honek", + "honekin", + "honen", + "honengatik", + "honentzat", + "honetako", + "honetan", + "honetara", + "honetarako", + "honetatik", + "honetaz", + "honez", + "honi", + "hori", + "horiei", + "horiek", + "horiekin", + "horien", + "horientzat", + "horietako", + "horietakoren", + "horietan", + "horietarako", + "horietariko", + "horietatik", + "horiez", + "horixe", + "horregatik", + "horrek", + "horrekin", + "horren", + "horrenbeste", + "horrenbestez", + "horrengatik", + "horretako", + "horretan", + "horretantxe", + "horretara", + "horretarako", + "horretatik", + "horretaz", + "horrexegatik", + "horrexekin", + "horrexetan", + "horrez", + "horrezaz", + "horri", + "hortaz", + "huan", + "huntan", + "hura", + "huraxe", + "iezaidazu", + "iezaiezu", + "iezaion", + "iezaiozu", + "inor", + "inoren", + "inorentzako", + "inori", + "inork", + "inortaz", + "irian", + "itzazu", + "izaki", + "kontra", + "lezake", + "lezakeen", + "lezakete", + "lezan", + "liekeela", + "liezaiokeen", + "lioke", + "liokeela", + "liokeen", + "lirateke", + "liratekeela", + "liteke", + "litekeela", + "litekeen", + "litekeena", + "litezke", + "lituzkeela", + "lituzkeen", + "lituzkete", + "litzaidake", + "litzaiguke", + "litzateke", + "litzatekeela", + "litzatekeelako", + "litzatekela", + "lizateke", + "luke", + "lukeela", + "lukeelako", + "lukeen", + "lukeena", + "lukete", + "luketen", + "nabil", + "nago", + "nahiko", + "nahikoa", + "nahikorik", + "nahiz", + "naiteke", + "naiz", + "naizela", + "naizen", + "naizenean", + "naizenetan", + "naizenetik", + "naizenez", + "naizenik", + "nau", + "nauen", + "nauenarentzat", + "nauenean", + "nauk", + "naun", + "naute", + "nautela", + "nauzu", + "nauzun", + "nazan", + "nazaten", + "nazazu", + "nazazun", + "nenbilen", + "nengoela", + "nengoen", + 
"nere", + "neu", + "neuk", + "neure", + "nezake", + "ni", + "nian", + "nien", + "nigan", + "nik", + "ninduen", + "ninduten", + "nintekeela", + "nintzaion", + "nintzateke", + "nintzatekeela", + "nintzela", + "nintzelako", + "nintzen", + "nintzenean", + "nion", + "nire", + "nirea", + "niregan", + "niregana", + "niregatik", + "nirekin", + "niretzako", + "niretzat", + "niri", + "nitaz", + "nituela", + "nituen", + "nituzke", + "nizuke", + "nor", + "norbait", + "norbaitek", + "norbaitekin", + "norbaiten", + "norbaitengana", + "norbaitentzat", + "norbaiti", + "norbera", + "norberak", + "norberaren", + "norbere", + "noren", + "nori", + "nork", + "nornahi", + "nornahik", + "nortzuk", + "nortzuren", + "nuela", + "nuen", + "nuena", + "nuenean", + "nuenetik", + "nuke", + "nukeela", + "omen", + "ondoan", + "ondoko", + "ondora", + "ondoren", + "ondorengo", + "ondotik", + "ordea", + "ordez", + "orduan", + "oro_har", + "orobat", + "orohar", + "orok", + "ororen", + "orori", + "ostean", + "ostera", + "osterantzean", + "pean", + "piskat", + "pixka_bat", + "pixkat", + "pranko", + "ugari", + "ugarik", + "ugarirekin", + "ugariren", + "ugaritan", + "zagok", + "zaidan", + "zaidanaren", + "zaie", + "zaiela", + "zaien", + "zaienez", + "zaigu", + "zaigun", + "zaiguna", + "zaigunean", + "zaik", + "zaio", + "zaiola", + "zaiolako", + "zaion", + "zaiona", + "zait", + "zaitez", + "zaitezen", + "zaitu", + "zaitut", + "zaituzte", + "zaitzakegu", + "zaizkidan", + "zaizkie", + "zaizkiela", + "zaizkien", + "zaizkigu", + "zaizkio", + "zaizkiola", + "zaizkion", + "zaizkit", + "zaizkizu", + "zaizkizue", + "zaizkizun", + "zaizu", + "zaizue", + "zara", + "zarela", + "zarete", + "zatekeela", + "zatekeen", + "zatzait", + "zaude", + "ze", + "zebilen", + "zedin", + "zegoan", + "zegoela", + "zegoelako", + "zegoen", + "zegoenez", + "zegok", + "zehar", + "zein", + "zeina", + "zeinek", + "zeinen", + "zeintzu", + "zeintzuetan", + "zeintzuk", + "zela", + "zelako", + "zelarik", + "zen", + "zena", + "zenak", + 
"zenarekin", + "zenari", + "zenbait", + "zenbaitek", + "zenbaiten", + "zenbaitetan", + "zenbaiti", + "zenbaitzuk", + "zenbat", + "zenbateraino", + "zenean", + "zenekoa", + "zenetik", + "zenez", + "zeniguten", + "zenigutenez", + "zenik", + "zenituen", + "zenitzakeen", + "zenuela", + "zenuen", + "zenuke", + "zenukete", + "zenutela", + "zenuten", + "zeozer", + "zer", + "zer_edo_zer", + "zerbait", + "zerbaitek", + "zerbaitengatik", + "zerbaitetarako", + "zeren", + "zerendako", + "zeri", + "zerk", + "zertan", + "zertara", + "zertarako", + "zertaz", + "zertxobait", + "zeu", + "zeudela", + "zeudelako", + "zeuden", + "zeudenak", + "zeuk", + "zeure", + "zezakeen", + "zezaken", + "zezaketen", + "zezala", + "zezan", + "zezaten", + "zidan", + "zidatelako", + "zidaten", + "zidatena", + "zidatenak", + "zidatenean", + "ziela", + "zien", + "zienez", + "zietela", + "zietelako", + "zieten", + "ziezaion", + "zigun", + "zigunez", + "ziguten", + "zinan", + "zinen", + "zintudan", + "zintuztela", + "zintuztenean", + "ziola", + "ziolako", + "ziolarik", + "zion", + "ziona", + "zionean", + "zionez", + "zioten", + "ziotenak", + "zirela", + "zirelako", + "zirelakoan", + "zirelarik", + "ziren", + "zirenak", + "zirenean", + "zirenetik", + "zirenez", + "zirenik", + "ziren\x97", + "zirezte", + "zitekeela", + "zitekeen", + "zitekeena", + "zitekeenik", + "zitezen", + "zitezkeela", + "zitezkeelakoan", + "zitezkeen", + "zituela", + "zituelako", + "zituelarik", + "zituen", + "zituenean", + "zituenei", + "zituztela", + "zituztelarik", + "zituzten", + "zituztenak", + "zituztenetik", + "zitzaidakeen", + "zitzaidala", + "zitzaidan", + "zitzaien", + "zitzaigun", + "zitzaiola", + "zitzaion", + "zitzaionagatik", + "zitzaionean", + "zitzaizkidan", + "zitzaizkien", + "zitzaizkienean", + "zitzaizkigun", + "zitzaizkion", + "zitzaizkon", + "zitzaizun", + "zitzakeen", + "zitzaketenak", + "zizioten", + "zizkidaten", + "zizkien", + "zizkienik", + "zizkieten", + "zizkigun", + "zizkiola", + "zizkion", + "zizkiona", + 
"zizkioten", + "zizkiotenekin", + "zizuen", + "zizun", + "zoin", + "zonbat", + "zu", + "zuei", + "zuek", + "zuela", + "zuelako", + "zuelarik", + "zuen", + "zuena", + "zuenak", + "zuenarentzat", + "zuenean", + "zuenetik", + "zuenez", + "zuenik", + "zuentzako", + "zuetako", + "zuetaz", + "zugandik", + "zuk", + "zukeen", + "zuketen", + "zure", + "zureak", + "zurekin", + "zuretzat", + "zutela", + "zutelako", + "zutelarik", + "zuten", + "zutena", + "zutenean", + "zuteneko", + "zutenetik", + "zutenez", + ], + "fr": [ + "a", + "afin", + "ai", + "aie", + "aient", + "ainsi", + "ait", + "alias", + "aller", + "allons", + "apres", + "après", + "as", + "au", + "au-delà", + "aucun", + "aucune", + "aucunes", + "aucuns", + "aujourd'", + "auprès", + "auquel", + "aura", + "aurai", + "auraient", + "aurais", + "aurait", + "aurions", + "aurons", + "auront", + "autant", + "autour", + "autre", + "autres", + "autrui", + "auxquelles", + "auxquels", + "avaient", + "avais", + "avait", + "avant", + "avec", + "avez", + "aviez", + "avions", + "avoir", + "avons", + "ayant", + "ayez", + "ayons", + "beaucoup", + "c'est-à-dire", + "c-à-d.", + "ca", + "car", + "ce", + "ceci", + "cela", + "celle", + "celle-ci", + "celles", + "celles-ci", + "celui", + "celui-ci", + "celui-là", + "cent", + "certain", + "certaine", + "certaines", + "certains", + "ces", + "cet", + "cette", + "ceux", + "ceux-ci", + "ceux-là", + "cf.", + "chacun", + "chacune", + "chaque", + "chez", + "ci", + "cinq", + "combien", + "comme", + "comment", + "concernant", + "contre", + "cà", + "d'après", + "d'autres", + "dans", + "de", + "dehors", + "depuis", + "derrière", + "des", + "deux", + "devait", + "devant", + "devez", + "devions", + "devoir", + "devons", + "devra", + "devraient", + "devrait", + "devrions", + "devrons", + "devront", + "doit", + "doivent", + "donc", + "dont", + "du", + "durant", + "dès", + "début", + "dû", + "elle", + "elle-même", + "elles", + "elles-mêmes", + "en", + "entre", + "entres", + "envers", + "environ", + "es", 
+ "est", + "et", + "etaient", + "etant", + "etre", + "eut", + "eux", + "eux-mêmes", + "excepté", + "eût", + "faire", + "fais", + "faisaient", + "faisait", + "faisant", + "fait", + "faite", + "faites", + "fasse", + "fassent", + "fera", + "ferait", + "feront", + "firent", + "fit", + "font", + "furent", + "fussent", + "fut", + "fût", + "für", + "grâce", + "hormis", + "hors", + "i", + "il", + "ils", + "iront", + "je", + "jusque", + "l'on", + "la", + "ladite", + "laquelle", + "le", + "le/lui", + "ledit", + "lequel", + "les", + "lesdites", + "lesquelles", + "lesquels", + "leur", + "leurs", + "lors", + "lorsque", + "lui", + "lui-aussi", + "lui-même", + "là", + "ma", + "maint", + "maintes", + "mais", + "malgré", + "me", + "mes", + "mien", + "moi", + "moi-même", + "moins", + "mon", + "ne", + "ni", + "nonobstant", + "nos", + "notre", + "nous", + "nous-mêmes", + "nul", + "nôtre", + "nôtres", + "on", + "ont", + "onze", + "ou", + "outre", + "où", + "par", + "parce", + "parmi", + "pas", + "pendant", + "personne", + "peu", + "peut", + "peuvent", + "peux", + "plupart", + "plus", + "plusieurs", + "pour", + "pourquoi", + "pourra", + "pourraient", + "pourrait", + "pourrez", + "pourrons", + "pourront", + "pouvait", + "pouvez", + "pouvoir", + "pouvons", + "presque", + "près", + "pu", + "puis", + "puisque", + "puisse", + "puissent", + "puissions", + "qu", + "quand", + "quant", + "quarante", + "quatre", + "que", + "quel", + "quelconque", + "quelle", + "quelles", + "quelqu'un", + "quelque", + "quelques", + "quelques-unes", + "quelques-uns", + "quelqu’un", + "quels", + "qui", + "quiconque", + "quid", + "quoi", + "quoique", + "rien", + "sa", + "sans", + "sauf", + "se", + "selon", + "sera", + "serai", + "seraient", + "serais", + "serait", + "seras", + "serez", + "seriez", + "serions", + "serons", + "seront", + "ses", + "si", + "sien", + "sienne", + "siennes", + "siens", + "sinon", + "six", + "soi", + "soi-même", + "soient", + "sois", + "soit", + "sommes", + "son", + "sont", + "sous", + 
"soyez", + "soyons", + "suis", + "sur", + "t-il", + "ta", + "tandis", + "tant", + "tantôt", + "te", + "tel", + "telle", + "telles", + "tes", + "tien", + "toi", + "ton", + "tous", + "tout", + "toute", + "toutes", + "trois", + "tte", + "tu", + "un", + "une", + "unes", + "uns", + "unt", + "va", + "vais", + "van", + "vers", + "versus", + "via", + "voici", + "voilà", + "voir", + "voire", + "vont", + "vos", + "votre", + "vous", + "vous-même", + "vs", + "vu", + "y", + "à", + "á", + "ça", + "étaient", + "étais", + "était", + "étant", + "étiez", + "étions", + "été", + "êtes", + "être", + ], + "hi": [ + "अंदर", + "अत", + "अदि", + "अप", + "अपना", + "अपनि", + "अपनी", + "अपने", + "अभि", + "अभी", + "आदि", + "आप", + "इंहिं", + "इंहें", + "इंहों", + "इतयादि", + "इत्यादि", + "इन", + "इनका", + "इन्हीं", + "इन्हें", + "इन्हों", + "इस", + "इसका", + "इसकि", + "इसकी", + "इसके", + "इसमें", + "इसि", + "इसी", + "इसे", + "उंहिं", + "उंहें", + "उंहों", + "उन", + "उनका", + "उनकि", + "उनकी", + "उनके", + "उनको", + "उन्हीं", + "उन्हें", + "उन्हों", + "उस", + "उसके", + "उसि", + "उसी", + "उसे", + "एक", + "एवं", + "एस", + "एसे", + "ऐसे", + "ओर", + "और", + "कइ", + "कई", + "कर", + "करता", + "करते", + "करना", + "करने", + "करें", + "कहते", + "कहा", + "का", + "काफि", + "काफ़ी", + "कि", + "किंहें", + "किंहों", + "कितना", + "किन्हें", + "किन्हों", + "किया", + "किर", + "किस", + "किसि", + "किसी", + "किसे", + "की", + "कुछ", + "कुल", + "के", + "को", + "कोइ", + "कोई", + "कोन", + "कोनसा", + "कौन", + "कौनसा", + "गया", + "घर", + "जब", + "जहाँ", + "जहां", + "जा", + "जिंहें", + "जिंहों", + "जितना", + "जिधर", + "जिन", + "जिन्हें", + "जिन्हों", + "जिस", + "जिसे", + "जीधर", + "जेसा", + "जेसे", + "जैसा", + "जैसे", + "जो", + "तक", + "तब", + "तरह", + "तिंहें", + "तिंहों", + "तिन", + "तिन्हें", + "तिन्हों", + "तिस", + "तिसे", + "तो", + "था", + "थि", + "थी", + "थे", + "दबारा", + "दवारा", + "दिया", + "दुसरा", + "दुसरे", + "दूसरे", + "दो", + "द्वारा", + "न", + "नहिं", + "नहीं", + "ना", + "निचे", + "निहायत", + "नीचे", + "ने", 
+ "पर", + "पहले", + "पुरा", + "पूरा", + "पे", + "फिर", + "बनि", + "बनी", + "बहि", + "बही", + "बहुत", + "बाद", + "बाला", + "बिलकुल", + "भि", + "भितर", + "भी", + "भीतर", + "मगर", + "मानो", + "मे", + "में", + "यदि", + "यह", + "यहाँ", + "यहां", + "यहि", + "यही", + "या", + "यिह", + "ये", + "रखें", + "रवासा", + "रहा", + "रहे", + "ऱ्वासा", + "लिए", + "लिये", + "लेकिन", + "व", + "वगेरह", + "वरग", + "वर्ग", + "वह", + "वहाँ", + "वहां", + "वहिं", + "वहीं", + "वाले", + "वुह", + "वे", + "वग़ैरह", + "संग", + "सकता", + "सकते", + "सबसे", + "सभि", + "सभी", + "साथ", + "साबुत", + "साभ", + "सारा", + "से", + "सो", + "हि", + "ही", + "हुअ", + "हुआ", + "हुइ", + "हुई", + "हुए", + "हे", + "हें", + "है", + "हैं", + "हो", + "होता", + "होति", + "होती", + "होते", + "होना", + "होने", + ], + "id": [ + "Anda", + "ada", + "adakah", + "adalah", + "adanya", + "adapaun", + "adapun", + "agar", + "akan", + "akau", + "akhirnya", + "akibat", + "akibatnya", + "aku", + "alias", + "anda", + "aneka", + "antar", + "antara", + "antaranya", + "apa", + "apabila", + "apakah", + "apalagi", + "apapun", + "asal", + "atas", + "atau", + "ataukah", + "ataupun", + "bagai", + "bagaimana", + "bagaimanakah", + "bagaimanapun", + "bagi", + "bagi-nya", + "bahkan", + "bahwa", + "bahwasanya", + "baik", + "bakal", + "balik", + "banyak", + "banyaknya", + "baru", + "bawah", + "beberapa", + "begini", + "beginilah", + "begitu", + "belakang", + "beliau", + "belum", + "beragam", + "berapa", + "berapakah", + "berbagai", + "berberapa", + "berdasar", + "berdasarkan", + "berdiri", + "berdirinya", + "berikut", + "berkat", + "bersama", + "bersamanya", + "berupa", + "beserta", + "betapa", + "bila", + "bilamana", + "bisa", + "boleh", + "buah", + "buat", + "bukan", + "bukankah", + "bukanlah", + "bukannya", + "buruh", + "cara", + "dalam", + "dalamnya", + "dan", + "dapat", + "dari", + "darimana", + "daripada", + "dekat", + "demi", + "demikian", + "dengan", + "dengannya", + "depan", + "dg", + "di", + "dia", + "diantara", + "diantaranya", + 
"diatas", + "dibalik", + "dibandingkan", + "dibawah", + "dibawahnya", + "dibeberapa", + "dibelakang", + "diberbagai", + "didalam", + "didalamnya", + "diluar", + "dimana", + "diri", + "dirinya", + "disaat", + "disamping", + "disebelah", + "disekeliling", + "diseluruh", + "disini", + "ditepi", + "dng", + "dr", + "engkau", + "gambar", + "gimana", + "hadap", + "hai", + "hanya", + "harus", + "hei", + "ia", + "ialah", + "ini", + "inikah", + "inilah", + "inipun", + "isi", + "isinya", + "itu", + "itua", + "itulah", + "itupun", + "iye", + "jadi", + "jangan", + "jauh", + "jelang", + "jenis", + "jika", + "juga", + "kah", + "kalau", + "kalian", + "kalo", + "kami", + "kamilah", + "kamu", + "kan", + "kapan", + "kapankah", + "karena", + "karenanya", + "kau", + "ke", + "kebanyakan", + "kecuali", + "kedalam", + "kedepan", + "kedua", + "keduanya", + "keliling", + "keluar", + "kemudian", + "kena", + "kenapa", + "kendati", + "kepada", + "kepadaku", + "kepadamu", + "kepadanya", + "kepusatnya", + "kerana", + "keseluruhan", + "keseluruhannya", + "kesemuanya", + "ketika", + "ketimbang", + "khususnya", + "kira", + "kita", + "kok", + "koq", + "kpd", + "ku", + "la", + "lagi", + "lah", + "lain", + "lainnya", + "lalu", + "lama", + "lantaran", + "lantas", + "layak", + "layaknya", + "lengah", + "lewat", + "loh", + "luar", + "macam", + "maka", + "makanya", + "maksud", + "maksudnya", + "malahan", + "mampu", + "mana", + "manakah", + "manakala", + "manapun", + "masa", + "masing", + "masing-masing", + "maupun", + "mayoritas", + "melainkan", + "melalui", + "melawan", + "melewati", + "menajak", + "menbeli", + "mengajak", + "mengapa", + "mengenai", + "mengenainya", + "menjadi", + "menjelang", + "menuju", + "menurut", + "menurutmu", + "mereka", + "merekapun", + "merupakan", + "meski", + "meskipn", + "meskipun", + "misalkan", + "misalnya", + "msl", + "mulai", + "mungkin", + "namun", + "nya", + "oleh", + "olehnya", + "orang", + "pada", + "padahal", + "padanya", + "para", + "pasca", + "pd", + "per", + 
"perihal", + "perlu", + "pula", + "pun", + "saat", + "saatnya", + "sama", + "sambil", + "sampai", + "sampai-sampai", + "samping", + "sana", + "sang", + "satu", + "satu-satunya", + "satunya", + "saya", + "seakan", + "seandainya", + "seantero", + "sebab", + "sebagai", + "sebagaimana", + "sebagian", + "sebaliknya", + "sebangsa", + "sebanyak", + "sebelah", + "sebelum", + "sebelumnya", + "seberang", + "seberat", + "sebesar", + "sebuah", + "secara", + "sedang", + "sedangkan", + "sedangkkan", + "sedari", + "sedikit", + "sedikitnya", + "seekor", + "segala", + "segenap", + "seharusnya", + "sehingga", + "sehubungan", + "seiring", + "sejak", + "sejauh", + "sejenis", + "sejumlah", + "sekali", + "sekaligus", + "sekalipun", + "sekitar", + "sekitarnya", + "selain", + "selaku", + "selama", + "selesai", + "seluas", + "seluruh", + "semacam", + "semasa", + "semenjak", + "sementara", + "sempat", + "semua", + "semuanya", + "sendiri", + "senilai", + "seorang", + "sepanjang", + "sepasang", + "sepeninggal", + "seperti", + "sepertinya", + "sepeti", + "sepucuk", + "seputar", + "serangkaian", + "seraya", + "serta", + "sesampai", + "sesampainya", + "seseorang", + "sesuai", + "sesuatu", + "sesudah", + "setebal", + "setelah", + "setelahnya", + "setengah", + "setiap", + "setinggi", + "seusai", + "sewaktu", + "si", + "siapa", + "siapakah", + "siapapun", + "silakan", + "sini", + "sinilah", + "situ", + "soal", + "suatu", + "sudah", + "supaya", + "tak", + "tan", + "tangguh", + "tanpa", + "tapi", + "tatkala", + "telah", + "tempat", + "tengah", + "tengahnya", + "tentang", + "tepat", + "tepatnya", + "teratas", + "terhadap", + "terhadapnya", + "termasuk", + "ternyata", + "tersebut", + "tertentu", + "terutama", + "tesebut", + "tetap", + "tetapi", + "tiada", + "tiap", + "tidak", + "tidakkah", + "tidaklah", + "tidaknya", + "tsb", + "tt", + "ttg", + "tuh", + "tujuh", + "untuk", + "untukmu", + "untuknya", + "untung", + "usah", + "usai", + "via", + "waktu", + "walau", + "walaupun", + "ya", + "yaitu", + 
"yakni", + "yang", + "yg", + ], + "mr": [ + "अधिक", + "अनेक", + "अशी", + "असलयाचे", + "असलेल्या", + "असा", + "असून", + "असे", + "आज", + "आणि", + "आता", + "आपल्या", + "आला", + "आली", + "आले", + "आहे", + "आहेत", + "एक", + "एका", + "कमी", + "करणयात", + "करून", + "का", + "काम", + "काय", + "काही", + "किवा", + "की", + "केला", + "केली", + "केले", + "कोटी", + "गेल्या", + "घेऊन", + "जात", + "झाला", + "झाली", + "झाले", + "झालेल्या", + "टा", + "डॉ", + "तर", + "तरी", + "तसेच", + "ता", + "ती", + "तीन", + "ते", + "तो", + "त्या", + "त्याचा", + "त्याची", + "त्याच्या", + "त्याना", + "त्यानी", + "त्यामुळे", + "त्री", + "दिली", + "दोन", + "न", + "नाही", + "निर्ण्य", + "पण", + "पम", + "परयतन", + "पाटील", + "म", + "मात्र", + "माहिती", + "मी", + "मुबी", + "म्हणजे", + "म्हणाले", + "म्हणून", + "या", + "याचा", + "याची", + "याच्या", + "याना", + "यानी", + "येणार", + "येत", + "येथील", + "येथे", + "लाख", + "व", + "व्यकत", + "सर्व", + "सागित्ले", + "सुरू", + "हजार", + "हा", + "ही", + "हे", + "होणार", + "होत", + "होता", + "होती", + "होते", + ], + "pt": [ + "a", + "a cabo de", + "a caminho de", + "a despeito de", + "a favor de", + "a fim de", + "a menos que", + "a não ser", + "a não ser que", + "a partir de", + "a propósito", + "a respeito de", + "a título de", + "abaixo de", + "acima", + "acima de", + "afinal", + "afora", + "agora", + "agora que", + "ai", + "ainda", + "ainda mais", + "algo", + "algum", + "alguma", + "algumas", + "alguns", + "alguém", + "além", + "além de", + "ambas", + "ambos", + "andar", + "andou", + "ante", + "antes", + "anti", + "antre", + "ao", + "ao cabo de", + "ao invés de", + "ao lado", + "ao longo de", + "ao passo que", + "ao redor de", + "aos cuidados de", + "apenas", + "apesar de", + "apesar de que", + "após", + "aquela", + "aquelas", + "aquele", + "aqueles", + "aquilo", + "as", + "assim", + "assim como", + "assim que", + "atras", + "através", + "através de", + "atráis", + "atrás", + "atrás de", + "até", + "até que", + "auto", + "avante", + "aí", + "bastante", + "bem", 
+ "bem como", + "cada", + "cara a cara", + "caso", + "cerca", + "cima", + "com", + "comigo", + "como", + "como se", + "conforme", + "connosco", + "conosco", + "conquanto", + "consigo", + "consoante", + "contanto", + "contanto que", + "contigo", + "contra", + "contudo", + "convosco", + "cuja", + "cujas", + "cujo", + "cujos", + "d'", + "d.", + "da", + "dada", + "dado", + "dado que", + "dali", + "daquela", + "daquelas", + "daquele", + "daqui", + "daqui a", + "daí", + "de", + "de modo que", + "dela", + "delas", + "dele", + "deles", + "demais", + "dentre", + "dentro", + "dentro de", + "depois", + "depois de", + "desde", + "desde que", + "dessa", + "dessas", + "desse", + "desses", + "desta", + "destas", + "deste", + "destes", + "detrás de", + "deva", + "devam", + "deve", + "devem", + "devemos", + "devendo", + "dever", + "deveria", + "deveriam", + "deverá", + "deverão", + "deviam", + "devido", + "devido a", + "devo", + "diante de", + "disso", + "diversas", + "diversos", + "do que", + "donde", + "doutros", + "dum", + "duma", + "durante", + "e", + "e/ou", + "eba", + "eis", + "ela", + "elas", + "ele", + "eles", + "eles/elas", + "em", + "em cima de", + "em frente a", + "em meio a", + "em nome de", + "em prol de", + "em relação a", + "em torno de", + "em vez de", + "em virtude de", + "em vista de", + "em volta de", + "embaixo de", + "embora", + "enquanto", + "entre", + "entretanto", + "então", + "era", + "eram", + "ergo", + "essa", + "essas", + "esse", + "esses", + "esta", + "estado", + "estamos", + "estando", + "estar", + "estarem", + "estaria", + "estariam", + "estarmos", + "estará", + "estarão", + "estas", + "estava", + "estavam", + "este", + "esteja", + "estejam", + "estes", + "esteve", + "estivemos", + "estiver", + "estiveram", + "estiverem", + "estivesse", + "estivessem", + "estou", + "está", + "estávamos", + "estão", + "eu", + "excepto", + "exceto", + "fica", + "ficado", + "ficamos", + "ficando", + "ficar", + "ficaram", + "ficaria", + "ficou", + "fiquei", + "foi", + 
"fomos", + "for", + "fora", + "fora de", + "foram", + "forem", + "fosse", + "fossem", + "frente a", + "fui", + "fôr", + "gente", + "graças", + "graças a", + "havendo", + "haver", + "haverem", + "havia", + "haviam", + "houver", + "houvesse", + "há", + "i.e.", + "ia", + "iam", + "ido", + "igual a", + "inté", + "invés de", + "ir", + "ireii", + "irem", + "iremos", + "iria", + "iriam", + "irá", + "irão", + "isso", + "isto", + "junto a", + "junto com", + "já", + "já que", + "la", + "las", + "lhe", + "lhes", + "lo", + "logo", + "logo que", + "los", + "lá", + "mais", + "mais de", + "mais do que", + "mais que", + "mal", + "malgrado", + "mas", + "me", + "mediante", + "menos", + "mesma", + "mesmas", + "mesmo", + "mesmo que", + "mesmo se", + "mesmos", + "meu", + "meus", + "mim", + "minha", + "minhas", + "muita", + "muitas", + "muito", + "muito menos", + "muitos", + "muitíssimo", + "n'", + "na", + "na frente de", + "na sequência de", + "nada", + "naquela", + "naquele", + "naqueles", + "naquilo", + "nas", + "nele", + "neles", + "nem", + "nenhum", + "nenhuma", + "nenhumas", + "nenhuns", + "nessa", + "nessas", + "nesse", + "nesses", + "nesta", + "nestas", + "neste", + "nestes", + "ninguém", + "no", + "no que", + "nos", + "nosco", + "nossa", + "nossas", + "nosso", + "nossos", + "num", + "numa", + "nós", + "o", + "o(s)", + "onde", + "onde quer que", + "ora", + "os", + "ou", + "outra", + "outras", + "outrem", + "outro", + "outros", + "outrém", + "oxalá", + "p'ra", + "p/", + "pa", + "para", + "para com", + "para que", + "parece", + "parecer", + "pelo", + "per", + "perante", + "perantes", + "permanece", + "permanecer", + "perto de", + "pode", + "podem", + "podemos", + "podendo", + "poder", + "poderei", + "poderem", + "poderemos", + "poderia", + "poderiam", + "poderá", + "poderão", + "poderíamos", + "podia", + "podiam", + "podíamos", + "pois", + "por", + "por causa de", + "por causa que", + "por conta de", + "por entre", + "por isso", + "por isto", + "por meio de", + "por trás", + "por 
trás de", + "por volta de", + "porquanto", + "porque", + "portanto", + "porém", + "possa", + "possam", + "possamos", + "posso", + "pouca", + "poucas", + "pouco", + "poucos", + "pouquíssimos", + "pra", + "precisam", + "precisar", + "precisaram", + "precisarão", + "precisou", + "prestes a", + "pretender", + "pretendiam", + "pro", + "pré", + "pré-", + "pró", + "pude", + "pudemos", + "puderam", + "puderem", + "pudesse", + "pudessem", + "pós", + "pôde", + "pôr", + "público", + "q.b.", + "quais", + "quaisquer", + "qual", + "qualquer", + "quando", + "quanta", + "quantas", + "quanto", + "quanto a", + "quanto baste", + "quanto mais", + "quantos", + "que", + "quem", + "quer", + "quão", + "quê", + "rente a", + "rente de", + "rumo a", + "se", + "se bem que", + "se e somente se", + "se-", + "segundo", + "seja", + "sejam", + "sem", + "sem falar de", + "sempre que", + "sendo", + "sendo que", + "senão", + "ser", + "serei", + "serem", + "seremos", + "seria", + "seriam", + "sermos", + "será", + "serão", + "seu", + "seus", + "si", + "sido", + "sob", + "sobre", + "somos", + "sou", + "sse", + "sua", + "suas", + "sub", + "são", + "sê", + "só que", + "sôbre", + "ta", + "tais", + "tal", + "tampouco", + "tanta", + "tantas", + "tanto", + "tantos", + "te", + "tem", + "temos", + "tende", + "tendo", + "tenha", + "tenham", + "tenhamos", + "tenho", + "tentado", + "tentar", + "tentaram", + "ter", + "terei", + "terem", + "teremos", + "teria", + "teriam", + "termos", + "terá", + "terão", + "teríamos", + "teu", + "teus", + "teve", + "ti", + "tido", + "tinha", + "tinham", + "tive", + "tivemos", + "tiver", + "tiveram", + "tiverem", + "tivesse", + "tivessem", + "to", + "toda", + "todas", + "todavia", + "todo", + "todos", + "trás", + "tu", + "tua", + "tuas", + "tudo", + "tá", + "tão", + "tão logo", + "té", + "têm", + "tínhamos", + "ultra", + "um", + "uma", + "uma vez que", + "umas", + "uns", + "vai", + "vais", + "vamos", + "varias", + "varios", + "versus", + "via", + "visto", + "visto que", + "voce", + 
"você", + "vocês", + "vos", + "vossa", + "vossas", + "vosso", + "vossos", + "vou", + "vs", + "vá", + "várias", + "vários", + "vão", + "vérsus", + "vós", + "à", + "à beira de", + "à custa de", + "à expensa de", + "à luz de", + "à medida que", + "àquela", + "àqueles", + "às", + "às custas de", + "às expensas de", + "é", + "íamos", + "\u200b\u200bem", + ], + "sw": [ + "akasema", + "alikuwa", + "alisema", + "baada", + "basi", + "bila", + "cha", + "chini", + "hadi", + "hapo", + "hata", + "hivyo", + "hiyo", + "huku", + "huo", + "ili", + "ilikuwa", + "juu", + "kama", + "karibu", + "katika", + "kila", + "kima", + "kisha", + "kubwa", + "kutoka", + "kuwa", + "kwa", + "kwamba", + "kwenda", + "kwenye", + "la", + "lakini", + "mara", + "mdogo", + "mimi", + "mkubwa", + "mmoja", + "moja", + "muda", + "mwenye", + "na", + "naye", + "ndani", + "ng", + "ni", + "nini", + "nonkungu", + "pamoja", + "pia", + "sana", + "sasa", + "sauti", + "tafadhali", + "tena", + "tu", + "vile", + "wa", + "wakati", + "wake", + "walikuwa", + "wao", + "watu", + "wengine", + "wote", + "ya", + "yake", + "yangu", + "yao", + "yeye", + "yule", + "za", + "zaidi", + "zake", + ], + "ur": [ + "آئی", + "آئے", + "آج", + "آخر", + "آخرکبر", + "آدهی", + "آًب", + "آٹھ", + "آیب", + "اة", + "اخبزت", + "اختتبم", + "ادھر", + "ارد", + "اردگرد", + "ارکبى", + "اش", + "اضتعوبل", + "اضتعوبلات", + "اضطرذ", + "اضکب", + "اضکی", + "اضکے", + "اطراف", + "اغیب", + "افراد", + "الگ", + "اور", + "اوًچب", + "اوًچبئی", + "اوًچی", + "اوًچے", + "اى", + "اً", + "اًذر", + "اًہیں", + "اٹھبًب", + "اپٌب", + "اپٌے", + "اچھب", + "اچھی", + "اچھے", + "اکثر", + "اکٹھب", + "اکٹھی", + "اکٹھے", + "اکیلا", + "اکیلی", + "اکیلے", + "اگرچہ", + "اہن", + "ایطے", + "ایک", + "ب", + "ت", + "تبزٍ", + "تت", + "تر", + "ترتیت", + "تریي", + "تعذاد", + "تن", + "تو", + "توبم", + "توہی", + "توہیں", + "تٌہب", + "تک", + "تھب", + "تھوڑا", + "تھوڑی", + "تھوڑے", + "تھی", + "تھے", + "تیي", + "ثب", + "ثبئیں", + "ثبترتیت", + "ثبری", + "ثبرے", + "ثبعث", + "ثبلا", + "ثبلترتیت", + 
"ثبہر", + "ثدبئے", + "ثرآں", + "ثراں", + "ثرش", + "ثعذ", + "ثغیر", + "ثلٌذ", + "ثلٌذوثبلا", + "ثلکہ", + "ثي", + "ثٌب", + "ثٌبرہب", + "ثٌبرہی", + "ثٌبرہے", + "ثٌبًب", + "ثٌذ", + "ثٌذکرو", + "ثٌذکرًب", + "ثٌذی", + "ثڑا", + "ثڑوں", + "ثڑی", + "ثڑے", + "ثھر", + "ثھرا", + "ثھراہوا", + "ثھرپور", + "ثھی", + "ثہت", + "ثہتر", + "ثہتری", + "ثہتریي", + "ثیچ", + "ج", + "خب", + "خبرہب", + "خبرہی", + "خبرہے", + "خبهوظ", + "خبًب", + "خبًتب", + "خبًتی", + "خبًتے", + "خبًٌب", + "خت", + "ختن", + "خجکہ", + "خص", + "خططرذ", + "خلذی", + "خو", + "خواى", + "خوًہی", + "خوکہ", + "خٌبة", + "خگہ", + "خگہوں", + "خگہیں", + "خیطب", + "خیطبکہ", + "در", + "درخبت", + "درخہ", + "درخے", + "درزقیقت", + "درضت", + "دش", + "دفعہ", + "دلچطپ", + "دلچطپی", + "دلچطپیبں", + "دو", + "دور", + "دوراى", + "دوضرا", + "دوضروں", + "دوضری", + "دوضرے", + "دوًوں", + "دکھبئیں", + "دکھبتب", + "دکھبتی", + "دکھبتے", + "دکھبو", + "دکھبًب", + "دکھبیب", + "دی", + "دیب", + "دیتب", + "دیتی", + "دیتے", + "دیر", + "دیٌب", + "دیکھو", + "دیکھٌب", + "دیکھی", + "دیکھیں", + "دے", + "ر", + "راضتوں", + "راضتہ", + "راضتے", + "رریعہ", + "رریعے", + "رکي", + "رکھ", + "رکھب", + "رکھتب", + "رکھتبہوں", + "رکھتی", + "رکھتے", + "رکھی", + "رکھے", + "رہب", + "رہی", + "رہے", + "ز", + "زبصل", + "زبضر", + "زبل", + "زبلات", + "زبلیہ", + "زصوں", + "زصہ", + "زصے", + "زقبئق", + "زقیتیں", + "زقیقت", + "زکن", + "زکویہ", + "زیبدٍ", + "صبف", + "صسیر", + "صفر", + "صورت", + "صورتسبل", + "صورتوں", + "صورتیں", + "ض", + "ضبت", + "ضبتھ", + "ضبدٍ", + "ضبرا", + "ضبرے", + "ضبل", + "ضبلوں", + "ضت", + "ضرور", + "ضرورت", + "ضروری", + "ضلطلہ", + "ضوچ", + "ضوچب", + "ضوچتب", + "ضوچتی", + "ضوچتے", + "ضوچو", + "ضوچٌب", + "ضوچی", + "ضوچیں", + "ضکب", + "ضکتب", + "ضکتی", + "ضکتے", + "ضکٌب", + "ضکی", + "ضکے", + "ضیذھب", + "ضیذھی", + "ضیذھے", + "ضیکٌڈ", + "ضے", + "طرف", + "طریق", + "طریقوں", + "طریقہ", + "طریقے", + "طور", + "طورپر", + "ظبہر", + "ع", + "عذد", + "عظین", + "علاقوں", + "علاقہ", + "علاقے", + "علاوٍ", + "عووهی", + "غبیذ", + "غخص", + "غذ", + "غروع", + "غروعبت", + "غے", 
+ "فرد", + "فی", + "ق", + "قجل", + "قجیلہ", + "قطن", + "لئے", + "لا", + "لازهی", + "لو", + "لوجب", + "لوجی", + "لوجے", + "لوسبت", + "لوسہ", + "لوگ", + "لوگوں", + "لڑکپي", + "لگتب", + "لگتی", + "لگتے", + "لگٌب", + "لگی", + "لگیں", + "لگے", + "لی", + "لیب", + "لیٌب", + "لیں", + "لے", + "ه", + "هتعلق", + "هختلف", + "هسترم", + "هسترهہ", + "هسطوش", + "هسیذ", + "هطئلہ", + "هطئلے", + "هطبئل", + "هطتعول", + "هطلق", + "هعلوم", + "هػتول", + "هلا", + "هوکي", + "هوکٌبت", + "هوکٌہ", + "هٌبضت", + "هڑا", + "هڑًب", + "هڑے", + "هکول", + "هگر", + "هہرثبى", + "هیرا", + "هیری", + "هیرے", + "هیں", + "و", + "وار", + "والے", + "وٍ", + "ًئی", + "ًئے", + "ًب", + "ًبپطٌذ", + "ًبگسیر", + "ًطجت", + "ًقطہ", + "ًو", + "ًوخواى", + "ًکبلٌب", + "ًکتہ", + "ًہ", + "ًہیں", + "ًیب", + "ًے", + "ٓ آش", + "ٹھیک", + "پبئے", + "پبش", + "پبًب", + "پبًچ", + "پر", + "پراًب", + "پطٌذ", + "پل", + "پورا", + "پوچھب", + "پوچھتب", + "پوچھتی", + "پوچھتے", + "پوچھو", + "پوچھوں", + "پوچھٌب", + "پوچھیں", + "پچھلا", + "پھر", + "پہلا", + "پہلی", + "پہلےضی", + "پہلےضے", + "پہلےضےہی", + "پیع", + "چبر", + "چبہب", + "چبہٌب", + "چبہے", + "چلا", + "چلو", + "چلیں", + "چلے", + "چکب", + "چکی", + "چکیں", + "چکے", + "چھوٹب", + "چھوٹوں", + "چھوٹی", + "چھوٹے", + "چھہ", + "چیسیں", + "ڈھوًڈا", + "ڈھوًڈلیب", + "ڈھوًڈو", + "ڈھوًڈًب", + "ڈھوًڈی", + "ڈھوًڈیں", + "ک", + "کئی", + "کئے", + "کب", + "کبفی", + "کبم", + "کت", + "کجھی", + "کرا", + "کرتب", + "کرتبہوں", + "کرتی", + "کرتے", + "کرتےہو", + "کررہب", + "کررہی", + "کررہے", + "کرو", + "کرًب", + "کریں", + "کرے", + "کطی", + "کل", + "کن", + "کوئی", + "کوتر", + "کورا", + "کوروں", + "کورٍ", + "کورے", + "کوطي", + "کوى", + "کوًطب", + "کوًطی", + "کوًطے", + "کھولا", + "کھولو", + "کھولٌب", + "کھولی", + "کھولیں", + "کھولے", + "کہ", + "کہب", + "کہتب", + "کہتی", + "کہتے", + "کہو", + "کہوں", + "کہٌب", + "کہی", + "کہیں", + "کہے", + "کی", + "کیب", + "کیطب", + "کیطرف", + "کیطے", + "کیلئے", + "کیوًکہ", + "کیوں", + "کیے", + "کے", + "کےثعذ", + "کےرریعے", + "گئی", + "گئے", + "گب", + "گرد", + "گروٍ", + "گروپ", 
+ "گروہوں", + "گٌتی", + "گی", + "گیب", + "گے", + "ہر", + "ہن", + "ہو", + "ہوئی", + "ہوئے", + "ہوا", + "ہوبرا", + "ہوبری", + "ہوبرے", + "ہوتب", + "ہوتی", + "ہوتے", + "ہورہب", + "ہورہی", + "ہورہے", + "ہوضکتب", + "ہوضکتی", + "ہوضکتے", + "ہوًب", + "ہوًی", + "ہوًے", + "ہوچکب", + "ہوچکی", + "ہوچکے", + "ہوگئی", + "ہوگئے", + "ہوگیب", + "ہوں", + "ہی", + "ہیں", + "ہے", + "ی", + "یقیٌی", + "یہ", + "یہبں", + ], + "vi": [ + "bên", + "bấy nhiêu", + "bằng", + "bởi", + "cc", + "chao", + "cho", + "cho dù", + "chán", + "chính", + "chút", + "chứ", + "các", + "cái", + "còn", + "có", + "có vẻ", + "cùng", + "cơ mà", + "cả", + "của", + "do", + "do vậy", + "do đó", + "duy", + "dù", + "dù sao", + "dù vậy", + "dưới", + "dường như", + "dạ", + "dẫu", + "dẫu vậy", + "giữa", + "gì", + "hay", + "hay là", + "hoặc", + "hơn nữa", + "hả", + "hầu hết", + "hết", + "hề", + "hễ", + "không những", + "l", + "là", + "lên", + "lại nữa", + "lẫn", + "lắm", + "mà", + "mà còn", + "mấy", + "mặc dù", + "mặt khác", + "mọi", + "mỗi", + "một chút", + "một nửa", + "một số", + "một vài", + "một ít", + "ngay", + "ngoài", + "ngoài ra", + "ngược lại", + "nhá", + "nhân", + "nhé", + "như", + "như vậy", + "nhưng", + "nhất là", + "nhằm", + "nhỉ", + "nhờ", + "những", + "nào", + "này", + "nè", + "nên", + "nếu", + "nếu như", + "nửa", + "nữa", + "phía", + "phần lớn", + "qua", + "quả", + "ra", + "riêng", + "rùi", + "rằng", + "rồi", + "sang", + "sao", + "sau", + "song", + "thay", + "theo", + "thiệt", + "thì", + "thí dụ", + "thôi", + "thật", + "thế", + "thế là", + "thế mà", + "thế nhưng", + "toàn", + "toàn bộ", + "toàn thể", + "trong", + "trên", + "trước", + "trời", + "tuy", + "tuy nhiên", + "tuy vậy", + "tóm lại", + "tại", + "tất cả", + "tận", + "tổ", + "tới", + "tức", + "tức là", + "từ", + "ui", + "và", + "vài", + "vài ba", + "vào", + "vì", + "vì thế", + "vì vậy", + "ví dụ", + "vô", + "vô số", + "vô vàn", + "vậy", + "vậy là", + "vậy mà", + "về", + "với", + "xuống", + "à", + "đa số", + "đi", + "đâu", + "đây", + "đó", + "đôi", + 
"được", + "đấy", + "đến", + "để", + "đối với", + "ạ", + "ấy", + "ở", + ], + "yo": [ + "a", + "an", + "bá", + "bí", + "bẹ̀rẹ̀", + "fún", + "fẹ́", + "gbogbo", + "inú", + "jù", + "jẹ", + "jẹ́", + "kan", + "kì", + "kí", + "kò", + "láti", + "lè", + "lọ", + "mi", + "mo", + "máa", + "mọ̀", + "ni", + "náà", + "ní", + "nígbà", + "nítorí", + "nǹkan", + "o", + "padà", + "pé", + "púpọ̀", + "pẹ̀lú", + "rẹ̀", + "sì", + "sí", + "sínú", + "ṣ", + "ti", + "tí", + "wà", + "wá", + "wọn", + "wọ́n", + "yìí", + "àti", + "àwọn", + "é", + "í", + "òun", + "ó", + "ń", + "ńlá", + "ṣe", + "ṣé", + "ṣùgbọ́n", + "ẹmọ́", + "ọjọ́", + "ọ̀pọ̀lọpọ̀", + ], + "zh": [ + "", + "一", + "一争", + "一些", + "一切", + "一旦", + "一点", + "一爭", + "上", + "上前", + "上表", + "下", + "不", + "不仅", + "不会", + "不但", + "不僅", + "不光", + "不关", + "不准", + "不单", + "不可", + "不單", + "不够", + "不夠", + "不应", + "不得", + "不想", + "不愿", + "不應", + "不是", + "不會", + "不準", + "不用", + "不管", + "不經", + "不肯", + "不能", + "不要", + "不該", + "不論", + "不论", + "不该", + "不過", + "不需", + "不願", + "与", + "与其", + "且", + "且是", + "並", + "並且", + "並非", + "个", + "个人", + "中", + "临", + "为", + "为了", + "为人", + "为什么", + "主", + "乃至", + "之", + "之上", + "之下", + "之中", + "之內", + "之内", + "之初", + "之前", + "之后", + "之外", + "之後", + "之所以", + "之时", + "之時", + "之間", + "之间", + "也", + "也是", + "书", + "了", + "争辩", + "事", + "于", + "井", + "亚", + "亞", + "亦为", + "亦是", + "亦為", + "亭", + "亲", + "人", + "人人", + "人家", + "什么", + "什麼", + "今", + "仍是", + "仍算", + "从", + "他", + "他们", + "他俩", + "他倆", + "他們", + "代", + "令", + "以", + "以上", + "以下", + "以为", + "以來", + "以前", + "以北", + "以及", + "以后", + "以外", + "以往", + "以後", + "以来", + "以為", + "以爲", + "以至", + "们", + "价", + "任", + "任何", + "众", + "会", + "传", + "伪", + "似乎", + "似的", + "但", + "但是", + "位", + "低", + "住", + "体", + "何", + "何方", + "佛", + "作", + "作为", + "作為", + "你", + "你们", + "你們", + "你自己", + "你门", + "佬", + "併", + "使", + "來", + "供", + "依", + "依据", + "依據", + "依照", + "依靠", + "侠", + "侧", + "侨", + "侯", + "便是", + "係", + "保存", + "保級", + "保级", + "俠", + "信", + "修复", + "修復", + "個", + 
"個人", + "們", + "倘若", + "借助", + "借由", + "借着", + "值", + "假使", + "假如", + "偏", + "做", + "側", + "偽", + "傳", + "傻", + "像", + "像是", + "僑", + "價", + "儘管", + "元", + "先", + "光", + "光棍", + "党", + "內", + "內外", + "全", + "全体", + "全副", + "全套", + "全部", + "全體", + "公", + "关", + "关于", + "关心", + "兵", + "其", + "其中", + "其他", + "其余", + "其它", + "其餘", + "典", + "兼", + "内", + "内外", + "军", + "冠", + "冢", + "冲", + "冷", + "准", + "准备", + "减慢", + "几", + "凭", + "凭借", + "出手", + "刀", + "分", + "分布", + "列", + "则为", + "则是", + "初", + "別", + "別人", + "别", + "别人", + "别的", + "到", + "到处", + "制", + "券", + "剂", + "則是", + "則為", + "前", + "前任", + "前后", + "前後", + "剑", + "剧", + "副", + "劇", + "劍", + "劑", + "力", + "办", + "办学", + "功", + "加", + "劣", + "努力", + "包", + "包裹", + "化", + "区", + "医", + "區", + "半", + "单", + "卡", + "卫", + "即", + "即使", + "即便", + "却是", + "卻", + "卻是", + "卿", + "厂", + "厅", + "历届", + "压", + "原", + "去", + "县", + "又", + "又或", + "又是", + "及", + "友", + "发展", + "发育", + "变", + "变得", + "口", + "古", + "另", + "另外", + "只是", + "只有", + "只能", + "只要", + "可", + "可以", + "可是", + "可能", + "台", + "史", + "叶", + "号", + "司", + "吃", + "各", + "各个", + "各位", + "各個", + "各天", + "各州", + "各式", + "各樣", + "各种", + "各种各样", + "各種", + "各種各樣", + "各类", + "各級", + "各级", + "各自", + "各項", + "各類", + "各项", + "同", + "同年", + "名", + "后", + "向", + "吗", + "君", + "否", + "吧", + "呀", + "员", + "呢", + "周", + "味", + "和", + "和美", + "咱们", + "品", + "哈尔滨", + "哈爾濱", + "員", + "哪", + "哪个", + "哪些", + "哪個", + "哪儿", + "哪兒", + "哪怕", + "哪裏", + "哪裡", + "哪里", + "唯有", + "商", + "啊", + "啦", + "喇", + "喜", + "喜欢", + "喜歡", + "單", + "單憑", + "嗎", + "嗬", + "嘛", + "嘴", + "器", + "回", + "因", + "因为", + "因应", + "因應", + "因此", + "因為", + "团", + "园", + "围", + "国", + "图", + "圆", + "圈", + "國", + "圍", + "園", + "圓", + "圖", + "團", + "土", + "圣", + "在", + "在內", + "在内", + "地", + "场", + "坊", + "坟", + "坡", + "型", + "埋", + "城", + "埤", + "執政", + "基", + "基于", + "基於", + "堂", + "堡", + "堤", + "報", + "場", + "塔", + "塘", + "墓", + "墙", + "增長", + "增长", + "墟", + "墳", + "壓", + "士", + "处", + "外", + "多", + "多少", + 
"多次", + "夜", + "够", + "夠", + "夢", + "大", + "大家", + "天", + "头", + "夹", + "夾", + "奏", + "奖", + "套", + "女", + "女士们", + "女士门", + "奸", + "她", + "她们", + "她俩", + "她倆", + "她們", + "好", + "好了", + "好像", + "如", + "如何", + "如同", + "如果", + "妃", + "妇", + "妳", + "妹", + "始", + "娘", + "婆", + "婦", + "子", + "孔", + "字", + "季", + "学", + "學", + "宁愿", + "它", + "它们", + "它們", + "安全", + "宏", + "宗", + "官", + "实属", + "审", + "客", + "室", + "宫", + "宮", + "家", + "宽", + "富", + "實屬", + "審", + "寬", + "对", + "对于", + "对方", + "对此", + "寺", + "将", + "將", + "對", + "對方", + "對於", + "對此", + "小", + "尖", + "就", + "就是", + "就算", + "尸", + "尽管", + "局", + "层", + "屋", + "屍", + "展", + "属", + "層", + "屬", + "屯", + "山", + "屿", + "岗", + "岛", + "岩", + "岭", + "岸", + "峡", + "峰", + "島", + "峽", + "崖", + "崗", + "嶺", + "嶼", + "川", + "州", + "工", + "左右", + "差", + "巷", + "币", + "市", + "布", + "师", + "希望", + "帝", + "带", + "師", + "席", + "帮", + "帶", + "帽", + "幣", + "幫", + "年", + "并", + "并且", + "并非", + "幾", + "庄", + "床", + "庐", + "库", + "应", + "应当", + "应该", + "底", + "店", + "庙", + "府", + "度", + "座", + "庫", + "庭", + "廟", + "廠", + "廬", + "廳", + "廷", + "建基於", + "开口", + "开始", + "式", + "弯", + "張", + "強", + "弹", + "强", + "彈", + "彎", + "当", + "当中", + "当届", + "录", + "形", + "形容", + "形成", + "影响", + "影響", + "彼此", + "往", + "径", + "待", + "很多", + "後", + "徑", + "徒", + "得", + "得宠", + "得寵", + "從", + "御", + "微", + "徽", + "心", + "必", + "必須", + "必须", + "志", + "快", + "态", + "怎么样", + "怎樣", + "怎麼", + "怕", + "性", + "怪", + "总", + "恆", + "恋", + "恒", + "您", + "想", + "愛", + "感", + "感到", + "感覺", + "感觉", + "愿意", + "態", + "憑", + "憑藉", + "懂", + "懂得", + "應", + "應當", + "應該", + "懒得", + "戀", + "戏", + "我", + "我们", + "我們", + "我自己", + "我门", + "或", + "或是", + "或者", + "战", + "截止", + "截至", + "戰", + "戲", + "戶", + "户", + "房", + "所", + "所以", + "所有", + "手", + "才是", + "打", + "执政", + "把", + "报", + "拖", + "持續", + "按", + "按照", + "挡", + "损失", + "据", + "排行", + "接唱", + "接触", + "接觸", + "控制", + "推进", + "推進", + "描述", + "損失", + "擋", + "據", + "支", + "教", + "敢", + "数", + "整", + "整个", + "整個", + 
"整场", + "整块", + "整場", + "整塊", + "整套", + "整所", + "整架", + "整片", + "整顆", + "整颗", + "數", + "文", + "斋", + "斗", + "新", + "方", + "於", + "族", + "旗", + "无论", + "既", + "既是", + "既然", + "日", + "日趋", + "日趨", + "旧", + "时", + "星", + "是", + "是否", + "是否是", + "是次", + "显", + "显得", + "時", + "晚", + "暖", + "暗", + "暨", + "曲", + "更为", + "更是", + "更為", + "更趋", + "更趨", + "書", + "替", + "會", + "會不會", + "月", + "有", + "有些", + "有关", + "有的", + "有關", + "服", + "朝", + "期", + "期間", + "期间", + "未能", + "末", + "本", + "本人", + "本地", + "本屆", + "本届", + "本班", + "本身", + "术", + "机", + "权", + "杆", + "材", + "村", + "束", + "来", + "杯", + "板", + "林", + "枪", + "架", + "某", + "某个", + "某些", + "某個", + "某种", + "某種", + "染色", + "柜", + "树", + "校", + "株", + "核", + "根据", + "根據", + "格", + "案", + "档", + "桥", + "桨", + "桿", + "梁", + "梁耀忠", + "梦", + "棍", + "棒", + "棚", + "椭", + "業", + "楼", + "榜", + "槍", + "槳", + "樂", + "樂意", + "樓", + "樹", + "橋", + "橙", + "機", + "橢", + "檔", + "櫃", + "權", + "次", + "欲", + "款", + "歌", + "正", + "正如", + "正是", + "此", + "此套", + "此次", + "此种", + "此種", + "此等", + "此类", + "此項", + "此類", + "此项", + "歷", + "歷屆", + "死", + "段", + "殿", + "母", + "毎年", + "每", + "每个", + "每位", + "每個", + "每元", + "每升", + "每卡", + "每周", + "每天", + "每幅", + "每年", + "每座", + "每当", + "每戶", + "每户", + "每所", + "每日", + "每枚", + "每次", + "每段", + "每片", + "每秒", + "每組", + "每组", + "每边", + "每週", + "每邊", + "每間", + "每间", + "每队", + "每隊", + "每集", + "每首", + "毒", + "比", + "比如說", + "比起", + "氏", + "气", + "氣", + "水", + "永保", + "江", + "池", + "沒", + "沒有", + "沒能", + "沟", + "没", + "没有", + "没能", + "河", + "治军", + "治軍", + "沼", + "沿", + "沿着", + "沿著", + "況且", + "泉", + "法", + "波", + "洋", + "洞", + "洲", + "派", + "流沙", + "浅", + "浊", + "浓", + "浦", + "海", + "涉世", + "涌", + "液", + "淡", + "深", + "深感", + "混", + "淺", + "清", + "減慢", + "渡", + "港", + "湖", + "湾", + "準", + "準備", + "溝", + "溥仪", + "溥儀", + "溪", + "满", + "满洲", + "滩", + "滿", + "滿洲", + "潮", + "澡", + "澳", + "濁", + "濃", + "灘", + "灣", + "火", + "炉", + "炎", + "炮", + "点", + "為", + "為了", + "為人", + "烃", + "烟", + "热", + "烴", + "無", + "無論", + 
"煙", + "熟", + "熱", + "營", + "爐", + "爭取", + "爭辯", + "爱", + "爲", + "父", + "爷", + "爺", + "牆", + "片", + "版", + "牌", + "牠", + "牠們", + "物", + "犯", + "状", + "狀", + "狂", + "狗", + "狮", + "猫", + "獅", + "獎", + "獲利", + "率", + "王", + "班", + "球", + "琴", + "甚么", + "甚至", + "甚至是", + "甚麼", + "甚麽", + "生", + "用", + "由", + "由于", + "由於", + "电", + "男", + "町", + "画", + "界", + "畔", + "畫", + "當", + "當中", + "當屆", + "病", + "症", + "癌", + "癖", + "發展", + "發育", + "的", + "的話", + "的话", + "皮", + "盃", + "监管", + "盖因", + "監管", + "目", + "直到", + "直至", + "相对", + "相對", + "相比", + "省", + "看", + "看似", + "看得", + "眼", + "眾", + "眾多", + "着", + "督", + "瞭", + "短", + "石", + "矿", + "码", + "砲", + "硅", + "碑", + "碱", + "碼", + "礁", + "礦", + "礼", + "社", + "祂", + "神", + "祠", + "禮", + "离", + "离开", + "秀", + "私交", + "秋", + "种", + "科", + "秤", + "稅", + "税", + "種", + "突感", + "窑", + "窟", + "窯", + "站", + "端", + "競選", + "符", + "笨", + "等", + "管", + "管理", + "箱", + "節", + "篇", + "籍", + "米", + "类", + "粉", + "精", + "糖", + "系", + "紀", + "紅", + "紋", + "純", + "紙", + "級", + "素", + "組", + "結", + "給", + "綉", + "經", + "經由", + "經過", + "綜", + "綫", + "綱", + "網", + "線", + "緣", + "縣", + "縱使", + "總", + "繞", + "繼", + "红", + "级", + "纪", + "纯", + "纲", + "纵使", + "纸", + "纹", + "线", + "组", + "经", + "经由", + "经过", + "结", + "绕", + "给", + "绣", + "继", + "综", + "网", + "罩", + "罪", + "署", + "羊", + "美", + "群", + "翁", + "老", + "者", + "而", + "而且", + "而已", + "而是", + "而非", + "聖", + "肉", + "肯", + "肺", + "胎", + "胚", + "胶", + "能", + "能否", + "能够", + "能夠", + "脚", + "脸", + "腔", + "腳", + "腿", + "膜", + "膠", + "臉", + "臨", + "自", + "自从", + "自家", + "自己", + "自從", + "自我", + "自身", + "至", + "至于", + "至於", + "臺", + "與", + "與其", + "舊", + "舞", + "舟", + "舰", + "舱", + "船", + "艇", + "艙", + "艦", + "色", + "节", + "花", + "若", + "若是", + "茶", + "药", + "莊", + "获利", + "菌", + "菜", + "营", + "葉", + "著", + "蓋因", + "蓝", + "藉", + "藉助", + "藉由", + "藉著", + "藍", + "藤", + "藥", + "藩", + "處", + "號", + "虽", + "虽则", + "虽然", + "蛙", + "行", + "術", + "街", + "衛", + "衣", + "表", + "表现", + "表現", + "表示", + "被", + "装", + 
"裏", + "裔", + "裙", + "裝", + "裡", + "裡面", + "裤", + "製", + "褲", + "要", + "要不要", + "要么", + "要是", + "要求", + "親", + "覺得", + "觀", + "观", + "觉得", + "角", + "計劃", + "記", + "詞", + "試圖", + "詩", + "話", + "該", + "該屆", + "該批", + "該族", + "該條", + "該段", + "該組", + "該集", + "該項", + "誌", + "認為", + "認識", + "語", + "誤信", + "說", + "誰", + "課", + "請", + "論", + "諸", + "諸如", + "謂", + "證", + "譜", + "變", + "變得", + "认为", + "认识", + "记", + "许多", + "许许多多", + "论", + "证", + "词", + "诗", + "话", + "该", + "该届", + "该批", + "该族", + "该条", + "该段", + "该组", + "该集", + "语", + "误信", + "说", + "请", + "诸", + "诸如", + "课", + "谁", + "谓", + "谱", + "谷", + "豆", + "象", + "貓", + "負債", + "費", + "資", + "賣", + "質", + "賽", + "负债", + "质", + "费", + "资", + "赛", + "起", + "起伏", + "起来", + "趁", + "超", + "趋", + "趋于", + "趨", + "趨於", + "距", + "距离", + "距離", + "跟", + "路", + "躁", + "身", + "車", + "軍", + "軒", + "軟", + "軸", + "較", + "輕", + "车", + "轩", + "软", + "轴", + "轻", + "较", + "辦", + "辦學", + "边", + "达到", + "过", + "过后", + "运作", + "近", + "还", + "还是", + "还有", + "这", + "这些", + "这儿", + "这养", + "这样", + "这次", + "这种", + "这里", + "远", + "连", + "连任", + "连同", + "迷", + "追溯", + "透过", + "透過", + "這", + "這些", + "這個", + "這兒", + "這樣", + "這樣子", + "這次", + "這種", + "這裏", + "這裡", + "這邊", + "這麼", + "通", + "通过", + "通過", + "逢", + "連", + "連任", + "連同", + "週", + "運作", + "過", + "過後", + "道", + "達到", + "遠", + "選舉", + "還是", + "邊", + "那", + "那个", + "那些", + "那儿", + "那兒", + "那样", + "那樣", + "那裏", + "那裡", + "那邊", + "那里", + "邦", + "邨", + "郎", + "郡", + "部", + "都", + "都是", + "鄉", + "配", + "酒", + "酸", + "醣", + "醫", + "里", + "里面", + "重", + "量", + "金", + "針", + "針對", + "銘", + "鋼", + "錄", + "錦", + "鍋", + "鍵", + "鎊", + "鎮", + "鏈", + "鏡", + "鐵", + "鑒於", + "针", + "针对", + "钢", + "铁", + "铭", + "链", + "锅", + "锦", + "键", + "镇", + "镜", + "長", + "长", + "門", + "開口", + "開始", + "間", + "閣", + "閣下", + "關", + "關心", + "關於", + "门", + "间", + "阁", + "队", + "阶", + "际", + "陆", + "降解", + "院", + "除", + "除了", + "除外", + "除非", + "陵", + "陸", + "隊", + "階", + "随", + "随同", + "隔", + "際", + "隨", + "隨同", + "难过", + "集", 
+ "雖", + "雖則", + "雖然", + "離", + "離開", + "難過", + "電", + "需", + "需要", + "非", + "靠", + "面", + "音", + "頂", + "須", + "頭", + "頭個", + "題", + "額", + "願意", + "類", + "顯", + "顯得", + "顶", + "须", + "题", + "额", + "風", + "风", + "飯", + "餅", + "餐", + "館", + "饃", + "首先", + "點", + ], +}