diff --git a/.gitattributes b/.gitattributes index 957b2579c6ef20995a09efd9a17f8fd90606f5ed..ed96f8a80fb2d088cbd2247ab146383664f57c28 100644 --- a/.gitattributes +++ b/.gitattributes @@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zstandard filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +*.json filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e2915adf89870c95283705ac56f9e3a3fe96578d --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*cpython-39.pyc +.DS_Store diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..9843adf90a7825684ca9ab05b4baea8823573310 --- /dev/null +++ b/LICENSE @@ -0,0 +1,204 @@ +------------- LICENSE FOR Bigscience code -------------- + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [2021] [Bigscience] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md index 5862195ec9bbb4043fc6ec1626402ef2c696efa7..afc159b8e250a60e5faa9000a337b110235aa65d 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ --- -title: Text Data Filtering 2 -emoji: 📈 -colorFrom: green -colorTo: yellow +title: Text Data Filtering +emoji: 👁 +colorFrom: blue +colorTo: pink sdk: streamlit app_file: app.py pinned: false @@ -10,36 +10,28 @@ pinned: false # Configuration -`title`: _string_ +`title`: _string_ Display title for the Space -`emoji`: _string_ +`emoji`: _string_ Space emoji (emoji-only character allowed) -`colorFrom`: _string_ +`colorFrom`: _string_ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray) -`colorTo`: _string_ +`colorTo`: _string_ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray) -`sdk`: _string_ -Can be either `gradio`, `streamlit`, or `static` +`sdk`: _string_ +Can be either `gradio` or `streamlit` -`sdk_version` : _string_ +`sdk_version` : _string_ Only applicable for `streamlit` SDK. See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions. -`app_file`: _string_ -Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code). +`app_file`: _string_ +Path to your main application file (which contains either `gradio` or `streamlit` Python code). 
Path is relative to the root of the repository. -`models`: _List[string]_ -HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space. -Will be parsed automatically from your code if not specified here. - -`datasets`: _List[string]_ -HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space. -Will be parsed automatically from your code if not specified here. - -`pinned`: _boolean_ +`pinned`: _boolean_ Whether the Space stays on top of your list. diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..98597a78499ff628a96217d0cbea07aefde4df0b --- /dev/null +++ b/app.py @@ -0,0 +1,916 @@ +# Run with: streamlit run visualization.py + +import streamlit as st + +import os + +from io import StringIO +import base64 +import json +import pandas as pd + +pd.options.mode.chained_assignment = None + +import numpy as np + +import matplotlib.pyplot as plt + +from filtering import LoadParameters, ModifyingDocuments, Filtering +from languages_id import langs_id + + +class Visualization_for_lang: + def __init__( + self, + path_data, + lang, + num_docs, + num_docs_for_words, + max_len_text_display, + lang_dataset_id, + path_fasttext_model, + path_sentencepiece_model, + path_kenlm_model, + ): + self.path_data = path_data + self.lang = lang + self.num_docs = num_docs + self.num_docs_for_words = num_docs_for_words + self.max_len_text_display = max_len_text_display + + self.lang_dataset_id = lang_dataset_id + self.param = LoadParameters.load_parameters(lang_dataset_id) + self.stopwords = LoadParameters.load_stopwords(lang_dataset_id) + self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id) + self.model_lang_id = LoadParameters.load_model_lang_id( + lang_dataset_id, path_fasttext_model + ) + self.sentencepiece_model = LoadParameters.load_sentencepiece_model( + lang_dataset_id, path_sentencepiece_model + ) + self.sentencepiece_model_tok = ( + self.sentencepiece_model if 
self.param["tokenization"] else None + ) + self.kenlm_model = LoadParameters.load_kenlm_model( + lang_dataset_id, path_kenlm_model + ) + + def set_title(self): + st.title(f"Filtering visualization for {self.lang}") + + def open_data(self): + with open(self.path_data) as json_file: + data = json.load(json_file) + + self.num_docs = min(self.num_docs, len(data)) + self.num_docs_for_words = min(self.num_docs_for_words, len(data)) + + if "words" in data[0]: + words = [doc["words"] for doc in data[: self.num_docs_for_words]] + words = [word for doc in words for word in doc] + self.words = pd.DataFrame(words) + else: + self.words = None + + docs = data[: self.num_docs] + for doc in docs: + if not (self.words is None): + del doc["words"] + if len(doc["text"]) > self.max_len_text_display: + doc["text"] = ( + doc["text"][: self.max_len_text_display] + + " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]" + ) + self.docs_checkpoint = pd.DataFrame(docs) + self.docs = self.docs_checkpoint + + @staticmethod + def print_discarded_by_cond(cond): + st.caption( + f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter." 
+ ) + + @staticmethod + def plot_hist(dataframe, key, num_bins=50): + checkbox = st.checkbox( + "Diplay distribution", value=True, key=f"display_distribution_{key[0]}" + ) + if checkbox: + fig, ax = plt.subplots() + val = dataframe[key[0]].values + if np.median(val) != 0: + val = val[ + abs(val - np.median(val)) + < 9 * np.median(np.absolute(val - np.median(val))) + ] + ax.hist(val, bins=num_bins, density=True) + ax.set_title(" ".join(key[0].split("_"))) + ax.axvline(x=key[1], color="r", linestyle="dashed") + st.pyplot(fig) + + @staticmethod + def display_dataset(dataframe, cond, description, type_of_examples): + displayed_examples = dataframe.loc[cond] + st.subheader( + f"{description}: {len(displayed_examples)} {type_of_examples} ({len(displayed_examples) / len(dataframe.index) * 100:.2f}%)" + ) + st.markdown( + "Click on a column to sort by it, place the cursor on the text to display it." + ) + st.dataframe(displayed_examples) + + def filtering_of_docs(self): + def set_sliders(): + columns = list(self.docs) + keys = [] + conds = {} + + def get_cond(key, cutoff, max_cutoff): + if max_cutoff: + return self.docs[key] <= cutoff + return self.docs[key] >= cutoff + + if "number_words" in columns: + with st.sidebar.expander("Number of words"): + cutoff_def = "If the number of words of a document is lower than this number, the document is removed." + max_nb_words = int(np.max(self.docs["number_words"])) + 1 + cutoff_min_number_words = st.slider( + cutoff_def, 0, min(max_nb_words, 500), 0 + ) + new_key = ("number_words", cutoff_min_number_words, False) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond_1 = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond_1) + + cutoff_def = "If the number of words of a document is higher than this number, the document is removed." 
+ cutoff_max_number_words = st.slider( + cutoff_def, 0, max_nb_words, max_nb_words + ) + new_key = ("number_words", cutoff_max_number_words, True) + keys.append(new_key) + cond_2 = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond_2) + + conds["number_words"] = [cond_1, cond_2] + + if "character_repetition_ratio" in columns: + with st.sidebar.expander("Character repetition ratio"): + val_repetitions_lengths = list( + self.docs["character_repetition_ratio"].iloc[0].keys() + ) + default_index = ( + val_repetitions_lengths.index("10") + if "10" in val_repetitions_lengths + else 0 + ) + label_selectbox = "Length of repetitions in characters (that will influence the character repetition ratio)." + repetitions_length = st.selectbox( + label=label_selectbox, + options=val_repetitions_lengths, + index=default_index, + ) + st.caption( + "Choosing a higher or lower number does not mean that the filtering " + "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) " + "tends to associate a high character repetition ratio to very long documents (like book chapters), but with " + "few or no repetitions, simply because their length gives them more diversity, and we do " + "not want to discard such documents. It is generally better to increase this number, so that false " + "positives are very short documents (which we want to delete anyway) rather than long ones. However, " + "a low number can be useful for Chinese, where a character can designate a whole word." + ) + self.docs["character_repetition_ratio"] = self.docs_checkpoint[ + "character_repetition_ratio" + ] + for i in range(len(self.docs["character_repetition_ratio"])): + self.docs["character_repetition_ratio"].iloc[i] = self.docs[ + "character_repetition_ratio" + ].iloc[i][repetitions_length] + + cutoff_def = "If the character repetition ratio of a document is higher than this number, the document is removed." 
+ cutoff_character_repetition_ratio = st.slider( + cutoff_def, 0.0, 1.0, 1.0, step=0.01 + ) + new_key = ( + "character_repetition_ratio", + cutoff_character_repetition_ratio, + True, + repetitions_length, + ) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["character_repetition_ratio"] = [cond] + + if "word_repetition_ratio" in columns: + with st.sidebar.expander("Word repetition ratio"): + val_repetitions_lengths = list( + self.docs["word_repetition_ratio"].iloc[0].keys() + ) + default_index = ( + val_repetitions_lengths.index("5") + if "5" in val_repetitions_lengths + else 0 + ) + label_selectbox = "Length of repetitions in words (that will influence the word repetition ratio)." + repetitions_length = st.selectbox( + label=label_selectbox, + options=val_repetitions_lengths, + index=default_index, + ) + st.caption( + "Choosing a higher or lower number does not mean that the filtering " + "is stronger or weaker. Be careful, choosing a low number (like 3) could " + "tend to associate a high word repetition ratio to very long documents (like book chapters), but with " + "few or no repetitions, simply because their length gives them more diversity, and we do " + "not want to discard such documents. It is generally better to increase a bit this number, so that false " + "positives are very short documents (which we want to delete anyway) rather than long ones." + ) + self.docs["word_repetition_ratio"] = self.docs_checkpoint[ + "word_repetition_ratio" + ] + for i in range(len(self.docs["word_repetition_ratio"])): + self.docs["word_repetition_ratio"].iloc[i] = self.docs[ + "word_repetition_ratio" + ].iloc[i][repetitions_length] + + cutoff_def = "If the word repetition ratio of a document is higher than this number, the document is removed." 
+ cutoff_word_repetition_ratio = st.slider( + cutoff_def, 0.0, 1.0, 1.0, step=0.01 + ) + new_key = ( + "word_repetition_ratio", + cutoff_word_repetition_ratio, + True, + repetitions_length, + ) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["word_repetition_ratio"] = [cond] + + if "special_characters_ratio" in columns: + with st.sidebar.expander("Special characters ratio"): + cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed." + cutoff_special_characters_ratio = st.slider( + cutoff_def, 0.0, 1.0, 1.0, step=0.01 + ) + new_key = ( + "special_characters_ratio", + cutoff_special_characters_ratio, + True, + ) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["special_characters_ratio"] = [cond] + + if "stopwords_ratio" in columns: + with st.sidebar.expander("Stop words ratio"): + stopwords_file = st.file_uploader( + "Upload your own list of stop words (one per line). If there is none, the default one is used." 
+ ) + if stopwords_file: + new_stopwords = StringIO( + stopwords_file.getvalue().decode("utf-8") + ).read() + new_stopwords = set(new_stopwords.split("\n")) + self.docs["stopwords_ratio"] = self.docs_checkpoint[ + "stopwords_ratio" + ] + for i in range(len(self.docs["stopwords_ratio"])): + self.docs["stopwords_ratio"].iloc[ + i + ] = Filtering.compute_stopwords_ratio( + self.docs["text"].iloc[i], + self.sentencepiece_model_tok, + self.param["strip_characters"], + self.param["cond_words_augmentation"], + self.param["words_augmentation_group_sizes"], + self.param["words_augmentation_join_char"], + new_stopwords, + ) + cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed." + cutoff_stopwords_ratio = st.slider( + cutoff_def, 0.0, 1.0, 0.0, step=0.01 + ) + new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["stopwords_ratio"] = [cond] + + if "flagged_words_ratio" in columns: + with st.sidebar.expander("Flagged words ratio"): + flagged_words_file = st.file_uploader( + "Upload your own list of flagged words (one per line). If there is none, the default one is used." 
+ ) + if flagged_words_file: + new_flagged_words = StringIO( + flagged_words_file.getvalue().decode("utf-8") + ).read() + new_flagged_words = set(new_flagged_words.split("\n")) + self.docs["flagged_words_ratio"] = self.docs_checkpoint[ + "flagged_words_ratio" + ] + for i in range(len(self.docs["flagged_words_ratio"])): + self.docs["flagged_words_ratio"].iloc[ + i + ] = Filtering.compute_flagged_words_ratio( + self.docs["text"].iloc[i], + self.sentencepiece_model_tok, + self.param["strip_characters"], + self.param["cond_words_augmentation"], + self.param["words_augmentation_group_sizes"], + self.param["words_augmentation_join_char"], + new_flagged_words, + ) + cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed." + max_fwr = np.max(self.docs["flagged_words_ratio"]) + max_fwr = np.ceil(max_fwr * 1000) / 1000 + max_fwr = float(max_fwr) + cutoff_flagged_words_ratio = st.slider( + cutoff_def, + 0.000, + max_fwr, + max_fwr, + step=0.001, + format="%f", + ) + new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["flagged_words_ratio"] = [cond] + + if "lang_id_score" in columns: + with st.sidebar.expander("Language ID confidence score"): + cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed." 
+ cutoff_lang_id_score = st.slider( + cutoff_def, 0.0, 1.0, 0.0, step=0.01 + ) + new_key = ("lang_id_score", cutoff_lang_id_score, False) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["lang_id_score"] = [cond] + + if "perplexity_score" in columns: + with st.sidebar.expander("Perplexity score"): + cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed." + max_pp = int(np.max(self.docs["perplexity_score"])) + 1 + cutoff_perplexity_score = st.slider(cutoff_def, 0, max_pp, max_pp) + new_key = ("perplexity_score", cutoff_perplexity_score, True) + keys.append(new_key) + Visualization_for_lang.plot_hist(self.docs, new_key) + cond = get_cond(new_key[0], new_key[1], new_key[2]) + Visualization_for_lang.print_discarded_by_cond(cond) + conds["perplexity_score"] = [cond] + + return keys, conds + + with st.expander( + f"Filtering on documents, for {self.num_docs} {self.lang} documents" + ): + st.header( + f"Filtering on documents, for {self.num_docs} {self.lang} documents" + ) + + if "labels" in list(self.docs): + chosen_label = st.selectbox( + label="Consider only documents that include the following label", + options=[ + "All", + "NA: Narrative", + "IN: Informational Description", + "OP: Opinion", + "ID: Interactive Discussion", + "HI: How-to/Instruction", + "IP: Informational Persuasion", + "LY: Lyrical", + "SP: Spoken", + ], + ) + chosen_label = chosen_label.split(":")[0] + if chosen_label != "All": + cond_label = list( + self.docs["labels"].apply( + lambda x: True if chosen_label in x else False + ) + ) + self.docs = self.docs[cond_label] + + if self.docs.empty: + st.markdown( + "No document to display, please try to select a different label." 
+ ) + self.keys = [] + self.parameters = [] + + else: + st.sidebar.subheader("Parameters of the filtering on documents") + self.keys, conds = set_sliders() + self.parameters = self.keys * 1 + + all_conds = [ + subcond for cond in list(conds.values()) for subcond in cond + ] + all_conds = np.all(all_conds, axis=0) + + Visualization_for_lang.display_dataset( + self.docs, np.invert(all_conds), "Discarded documents", "docs" + ) + + # st.subheader("Display discarded documents by filter") + display_discarded_documents_by_filter = st.checkbox( + "Display discarded documents by filter" + ) + + if display_discarded_documents_by_filter: + columns = list(self.docs) + + if "number_words" in columns: + cond_filter = np.invert(np.all(conds["number_words"], axis=0)) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the number of words", + "docs", + ) + + if "character_repetition_ratio" in columns: + cond_filter = np.invert( + np.all(conds["character_repetition_ratio"], axis=0) + ) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the character repetition ratio", + "docs", + ) + + if "word_repetition_ratio" in columns: + cond_filter = np.invert( + np.all(conds["word_repetition_ratio"], axis=0) + ) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the word repetition ratio", + "docs", + ) + + if "special_characters_ratio" in columns: + cond_filter = np.invert( + np.all(conds["special_characters_ratio"], axis=0) + ) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the special characters ratio", + "docs", + ) + + if "stopwords_ratio" in columns: + cond_filter = np.invert( + np.all(conds["stopwords_ratio"], axis=0) + ) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the stop words 
ratio", + "docs", + ) + + if "flagged_words_ratio" in columns: + cond_filter = np.invert( + np.all(conds["flagged_words_ratio"], axis=0) + ) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the flagged words ratio", + "docs", + ) + + if "lang_id_score" in columns: + cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0)) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the language identification confidence score", + "docs", + ) + + if "perplexity_score" in columns: + cond_filter = np.invert( + np.all(conds["perplexity_score"], axis=0) + ) + Visualization_for_lang.display_dataset( + self.docs, + cond_filter, + "Discarded documents for the filter on the perplexity score", + "docs", + ) + + Visualization_for_lang.display_dataset( + self.docs, all_conds, "Retained documents", "docs" + ) + + st.header("Download data") + + with open(self.path_data) as json_file: + btn = st.download_button( + label="Download data as json", + data=json_file, + file_name="data.json", + ) + + def filtering_of_words(self): + if not (self.words is None): + columns = list(self.words) + + st.sidebar.subheader("Parameter of the filtering on words") + + conds_words = {} + + if "len_word" in columns: + with st.sidebar.expander("Length of words"): + cutoff_def = "If the length of a word is higher than this number, the word is removed." 
+ max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200) + cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word) + new_key = ("len_word", cutoff_word, True) + self.parameters.append(new_key) + Visualization_for_lang.plot_hist(self.words, new_key) + cond_len_words = self.words["len_word"] <= cutoff_word + Visualization_for_lang.print_discarded_by_cond(cond_len_words) + conds_words["len_word"] = cond_len_words + + if "incorrect_substrings" in columns: + with st.sidebar.expander("Words with incorrect substrings"): + incorrect_substrings = st.checkbox( + "Remove words with incorrect substrings." + ) + self.parameters.append( + ("incorrect_substrings", incorrect_substrings) + ) + + checkbox = st.checkbox( + "Diplay distribution", + value=True, + key="display_distribution_incorrect_substrings", + ) + if checkbox: + incor_sub = np.array(self.words["incorrect_substrings"]) * 1 + with_incor_sub = np.sum(incor_sub) + without_incor_sub = len(incor_sub) - with_incor_sub + st.markdown( + f"Number of words with incorrect substrings: {with_incor_sub}" + ) + st.markdown( + f"Number of words without incorrect substrings: {without_incor_sub}" + ) + + if incorrect_substrings: + cond_incorrect_substrings = np.invert( + self.words["incorrect_substrings"] + ) + else: + cond_incorrect_substrings = np.array( + [ + True + for i in range(len(self.words["incorrect_substrings"])) + ] + ) + Visualization_for_lang.print_discarded_by_cond( + cond_incorrect_substrings + ) + conds_words["incorrect_substrings"] = cond_incorrect_substrings + + all_conds_words = np.all(list(conds_words.values()), axis=0) + + with st.expander( + f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents" + ): + st.header( + f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents" + ) + + st.markdown( + f"Since the number of words is way larger than the number of documents, " + f"we consider in this section words for only {self.num_docs_for_words} documents." 
+ ) + + Visualization_for_lang.display_dataset( + self.words, np.invert(all_conds_words), "Discarded words", "words" + ) + + # st.subheader("Display discarded words by filter") + display_discarded_words_by_filter = st.checkbox( + "Display discarded words by filter" + ) + + if display_discarded_words_by_filter: + + if "len_word" in columns: + cond_filter = np.invert(conds_words["len_word"]) + Visualization_for_lang.display_dataset( + self.words, + cond_filter, + "Discarded words for the filter on length", + "words", + ) + + if "incorrect_substrings" in columns: + cond_filter = np.invert(conds_words["incorrect_substrings"]) + Visualization_for_lang.display_dataset( + self.words, + cond_filter, + "Discarded words for the filter on incorrect substrings", + "words", + ) + + Visualization_for_lang.display_dataset( + self.words, all_conds_words, "Retained words", "words" + ) + + def download_parameters(self): + st.sidebar.subheader("Download parameters") + btn = st.sidebar.download_button( + label="Download current parameters as json", + data=json.dumps(self.parameters), + file_name=f"parameters_{self.lang_dataset_id}.json", + ) + + """ + def plot_zipf_law(self): + if not (self.words is None): + st.header("Zipf's Law") + + display_zipf_law = st.checkbox("Display Zipf's Law") + + if display_zipf_law: + + freq_words = {} + for _, row in self.words.iterrows(): + freq_words[row["word"]] = freq_words.get(row["word"], 0) + 1 + freq_words = np.array(list(freq_words.values())) + freq_words = -np.sort(-freq_words) + + fig, ax = plt.subplots() + ax.loglog(freq_words) + ax.set_title("Zipf's Law") + ax.set_xlabel("$i$-th most frequent word") + ax.set_ylabel("frequency in the documents") + st.pyplot(fig) + """ + + def analyse_personal_doc(self): + with st.expander("Analyse your own document"): + st.header("Analyse your own document") + + personal_doc = st.text_area( + label="Paste here the document you want to analyse", + value="", + max_chars=10000, + ) + + is_discarded = False + + 
def is_doc_discarded(key, score): + if key[2]: # max cutoff + return score > key[1] + else: + return score < key[1] + + if personal_doc: + + st.markdown("Statistics of the document:") + + for key in self.keys: + if key[0] == "number_words": + words = ModifyingDocuments.get_words_from_document( + personal_doc, + self.sentencepiece_model_tok, + lower_case=False, + strip_characters=self.param["strip_characters"], + ) + if key[2]: + st.markdown(f"Number of words: {len(words)}") + if is_doc_discarded(key, len(words)): + is_discarded = True + + elif key[0] == "character_repetition_ratio": + character_repetition_ratio = ( + Filtering.compute_character_repetition_ratio( + personal_doc, int(key[3]) + ) + ) + character_repetition_ratio = round( + character_repetition_ratio, 3 + ) + st.markdown( + f"Character repetition ratio: {character_repetition_ratio}" + ) + if is_doc_discarded(key, character_repetition_ratio): + is_discarded = True + + elif key[0] == "word_repetition_ratio": + word_repetition_ratio = Filtering.compute_word_repetition_ratio( + personal_doc, + self.sentencepiece_model_tok, + self.param["strip_characters"], + int(key[3]), + ) + word_repetition_ratio = round(word_repetition_ratio, 3) + st.markdown(f"Word repetition ratio: {word_repetition_ratio}") + if is_doc_discarded(key, word_repetition_ratio): + is_discarded = True + + elif key[0] == "special_characters_ratio": + special_characters_ratio = ( + Filtering.compute_special_characters_ratio( + personal_doc, self.param["special_characters"] + ) + ) + special_characters_ratio = round(special_characters_ratio, 3) + st.markdown( + f"Special characters ratio: {special_characters_ratio}" + ) + if is_doc_discarded(key, special_characters_ratio): + is_discarded = True + + elif key[0] == "stopwords_ratio": + stopwords_ratio = Filtering.compute_stopwords_ratio( + personal_doc, + self.sentencepiece_model_tok, + self.param["strip_characters"], + self.param["cond_words_augmentation"], + 
self.param["words_augmentation_group_sizes"], + self.param["words_augmentation_join_char"], + self.stopwords, + ) + stopwords_ratio = round(stopwords_ratio, 3) + st.markdown(f"Stop words ratio: {stopwords_ratio}") + if is_doc_discarded(key, stopwords_ratio): + is_discarded = True + + elif key[0] == "flagged_words_ratio": + flagged_words_ratio = Filtering.compute_flagged_words_ratio( + personal_doc, + self.sentencepiece_model_tok, + self.param["strip_characters"], + self.param["cond_words_augmentation"], + self.param["words_augmentation_group_sizes"], + self.param["words_augmentation_join_char"], + self.flagged_words, + ) + flagged_words_ratio = round(flagged_words_ratio, 3) + st.markdown(f"Flagged words ratio: {flagged_words_ratio}") + if is_doc_discarded(key, flagged_words_ratio): + is_discarded = True + + elif key[0] == "lang_id_score": + ( + lang_pred_dataset_id, + lang_id_score, + ) = Filtering.compute_lang_id_pred_score( + personal_doc, self.model_lang_id + ) + lang_id_score = round(lang_id_score, 3) + st.markdown( + f"Language identification confidence score: {lang_id_score}" + ) + if is_doc_discarded(key, lang_id_score) or ( + self.lang_dataset_id != lang_pred_dataset_id + ): + is_discarded = True + + elif key[0] == "perplexity_score": + perplexity_score = Filtering.compute_perplexity_score( + personal_doc, + self.sentencepiece_model, + self.kenlm_model, + ) + perplexity_score = round(perplexity_score, 3) + st.markdown(f"Perplexity score: {perplexity_score}") + if is_doc_discarded(key, perplexity_score): + is_discarded = True + + is_discarded = "" if is_discarded else "not " + st.markdown( + f"With the current filtering parameters, this document **is {is_discarded}discarded**."
+ ) + + def visualization_for_lang(self): + self.set_title() + self.open_data() + self.filtering_of_docs() + self.filtering_of_words() + self.download_parameters() + self.analyse_personal_doc() + + +class Visualization: + def __init__(self, path_instructions, param_visu_langs): + self.path_instructions = path_instructions + self.param_visu_langs = param_visu_langs + + def preamble(self): + def get_binary_file_downloader_html(bin_file, file_label="File"): + with open(bin_file, "rb") as f: + data = f.read() + bin_str = base64.b64encode(data).decode() + href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{bin_file}">{file_label}</a>' + return href + + st.markdown( + "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail in this " + + get_binary_file_downloader_html( + self.path_instructions, + "pdf", + ) + + ".", + unsafe_allow_html=True, + ) + + def warning_preamble(self): + st.markdown( + "This demo can be a little slow, and only allows you to process up to 5000 documents " + "for a decent speed. If you want to display up to three times more documents and have " + "a faster visualization, we invite you to run this " + "[code](https://github.com/bigscience-workshop/data_tooling/tree/master/ac_dc/visualization) " + "on your computer."
+ ) + + def choose_lang(self): + options = [ + self.param_visu_langs[lang_dataset_id]["lang"] + for lang_dataset_id in self.param_visu_langs + ] + index = options.index("English") if ("English" in options) else 0 + lang_chosen = st.selectbox( + label="Select the language for visualization", + options=options, + index=index, + ) + if lang_chosen != "None": + lang_chosen_dataset_id = langs_id.loc[ + langs_id["lang"] == lang_chosen, "dataset_id" + ].iloc[0] + visualization_for_lang = Visualization_for_lang( + path_data=self.param_visu_langs[lang_chosen_dataset_id]["path_data"], + lang=self.param_visu_langs[lang_chosen_dataset_id]["lang"], + num_docs=self.param_visu_langs[lang_chosen_dataset_id]["num_docs"], + num_docs_for_words=self.param_visu_langs[lang_chosen_dataset_id][ + "num_docs_for_words" + ], + max_len_text_display=self.param_visu_langs[lang_chosen_dataset_id][ + "max_len_text_display" + ], + lang_dataset_id=self.param_visu_langs[lang_chosen_dataset_id][ + "lang_dataset_id" + ], + path_fasttext_model=self.param_visu_langs[lang_chosen_dataset_id][ + "path_fasttext_model" + ], + path_sentencepiece_model=self.param_visu_langs[lang_chosen_dataset_id][ + "path_sentencepiece_model" + ], + path_kenlm_model=self.param_visu_langs[lang_chosen_dataset_id][ + "path_kenlm_model" + ], + ) + visualization_for_lang.visualization_for_lang() + + def visualization(self): + self.preamble() + self.warning_preamble() + self.choose_lang() + + +path_instructions = "./explanation_filtering_pipeline.pdf" + +param_visu_langs = { + lang_dataset_id: { + "path_data": f"./{lang_dataset_id}_examples_with_stats.json", + "lang": langs_id.loc[langs_id["dataset_id"] == lang_dataset_id, "lang"].iloc[0], + "num_docs": 5000, + "num_docs_for_words": 500, + "max_len_text_display": 10000, + "lang_dataset_id": lang_dataset_id, + "path_fasttext_model": "./lid.176.bin", + "path_sentencepiece_model": f"./{lang_dataset_id}.sp.model", + "path_kenlm_model": f"./{lang_dataset_id}.arpa.bin", + } + for 
lang_dataset_id in ["en", "pt"] +} + +visualization = Visualization(path_instructions, param_visu_langs) +visualization.visualization() diff --git a/en.arpa.bin b/en.arpa.bin new file mode 100644 index 0000000000000000000000000000000000000000..b74834cdda59ef28c35b172721256d427086ddff --- /dev/null +++ b/en.arpa.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04923fccbb4e63005c40f01d66112659416de01accd80d16e366a592289ee07a +size 4444690658 diff --git a/en.sp.model b/en.sp.model new file mode 100644 index 0000000000000000000000000000000000000000..d5cd3c4f88420f22d0a8a7123311ce894baec8ac --- /dev/null +++ b/en.sp.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf8147a573770b4e6c0d4df1dcb75453baa88190706dab406be7711b84f059de +size 931348 diff --git a/en_examples_with_stats.json b/en_examples_with_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..ec7242ca7b49a8c9faf23267418c60bfaeaad5a9 --- /dev/null +++ b/en_examples_with_stats.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dccf03710e9dc7ec68c676175e711be815bc29a50260f5d334156b03fe2e6d1 +size 241408394 diff --git a/explanation_filtering_pipeline.pdf b/explanation_filtering_pipeline.pdf new file mode 100644 index 0000000000000000000000000000000000000000..d8319eaec32419a168e1ede890fa1d5fc9547076 Binary files /dev/null and b/explanation_filtering_pipeline.pdf differ diff --git a/filtering.py b/filtering.py new file mode 100644 index 0000000000000000000000000000000000000000..eb2f4358b0a520098dd41a678a51250b4be88176 --- /dev/null +++ b/filtering.py @@ -0,0 +1,957 @@ +import re + +import numpy as np + +import fasttext + +import sentencepiece +import kenlm + +import pathlib + +from languages_id import langs_id +from parameters_filtering import parameters_filtering +from normalization import normalization +from stopwords import stopwords +from flagged_words import flagged_words + + +class LoadParameters: + 
@staticmethod + def load_parameters(lang_dataset_id): + if lang_dataset_id in parameters_filtering: + param = parameters_filtering[lang_dataset_id] + else: + param = parameters_filtering["default"] + return param + + @staticmethod + def load_stopwords(lang_dataset_id): + stopwords_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "stopwords_id" + ].iloc[0] + if stopwords_lang_id: + stopwords_lang = set(stopwords[stopwords_lang_id]) + else: + stopwords_lang = None + return stopwords_lang + + @staticmethod + def load_flagged_words(lang_dataset_id): + flagged_words_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "flagged_words_id" + ].iloc[0] + if flagged_words_lang_id: + flagged_words_lang = set(flagged_words[flagged_words_lang_id]) + else: + flagged_words_lang = None + return flagged_words_lang + + @staticmethod + def load_model_lang_id(lang_dataset_id, path_fasttext_model): + fasttext_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "fasttext_id" + ].iloc[0] + if fasttext_lang_id: + model_lang_id = fasttext.load_model(path_fasttext_model) + else: + model_lang_id = None + return model_lang_id + + @staticmethod + def load_sentencepiece_model(lang_dataset_id, path_sentencepiece_model): + sentencepiece_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "sentencepiece_id" + ].iloc[0] + if sentencepiece_lang_id: + sentencepiece_model = sentencepiece.SentencePieceProcessor() + sentencepiece_model.load(path_sentencepiece_model) + else: + sentencepiece_model = None + return sentencepiece_model + + @staticmethod + def load_kenlm_model(lang_dataset_id, path_kenlm_model): + kenlm_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "kenlm_id" + ].iloc[0] + if kenlm_lang_id: + kenlm_model = kenlm.Model(path_kenlm_model) + else: + kenlm_model = None + return kenlm_model + + +class ModifyingDocuments: + @staticmethod + def remove_empty_el_from_list(list_): + return [el for el in list_ if 
el] + + @staticmethod + def remove_non_printing_characters(document, non_printing_characters_re): + return non_printing_characters_re.sub("", document) + + @staticmethod + def uniform_whitespace( + document, + whitespace=[ + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + "", + "„", + ], + ): + """There are different whitespace characters.""" + whitespace = set(whitespace) + document = "".join( + [char if char not in whitespace else " " for char in document] + ) + return document + + @staticmethod + def replace_digits_with_zeros(document, digits_re): + return digits_re.sub("0", document) + + @staticmethod + def replace_unicode_punctuation(document, unicode_punctuation): + return "".join(unicode_punctuation.get(c, c) for c in document) + + @staticmethod + def normalization( + document, + remove_non_printing_characters, + strip, + lower_case, + uniform_whitespace, + replace_digits_with_zeros, + replace_unicode_punctuation, + non_printing_characters_re=normalization["non_printing_characters_re"], + digits_re=normalization["digits_re"], + unicode_punctuation=normalization["unicode_punctuation"], + ): + if remove_non_printing_characters: + document = ModifyingDocuments.remove_non_printing_characters( + document, non_printing_characters_re + ) + if strip: + document = document.strip() + if not document: + return document + if lower_case: + document = document.lower() + if uniform_whitespace: + document = ModifyingDocuments.uniform_whitespace(document) + if replace_digits_with_zeros: + document = ModifyingDocuments.replace_digits_with_zeros(document, digits_re) + if replace_unicode_punctuation: + document = ModifyingDocuments.replace_unicode_punctuation( + document, unicode_punctuation + ) + return document + + @staticmethod + def tokenization(document, sentencepiece_model, join_on_whitespace): + document_tokenized = sentencepiece_model.encode_as_pieces(document) + if join_on_whitespace: + document_tokenized = " ".join(document_tokenized) + return 
document_tokenized + + @staticmethod + def split_on_whitespace( + document, + new_line=False, + tab=False, + ): + """This method also removes concatenated spaces.""" + sep = [" "] + new_line * ["\n"] + tab * ["\t"] + sep = "|".join(sep) + split_document = re.split(sep, document) + split_document = ModifyingDocuments.remove_empty_el_from_list(split_document) + return split_document + + @staticmethod + def strip(document, strip_characters): + """Way faster than document.strip(strip_characters) + since strip_characters is now a set instead of a str, + and it contains a lot of elements (all the emojis).""" + if not document: + return document + beg_ind = 0 + end_ind = len(document) + for i in range(len(document)): + if document[i] in strip_characters: + beg_ind += 1 + else: + break + for i in range(1, len(document) + 1): + if document[-i] in strip_characters: + end_ind -= 1 + else: + break + document_stripped = document[beg_ind:end_ind] + return document_stripped + + @staticmethod + def get_words_from_document( + document, sentencepiece_model_tok, lower_case, strip_characters + ): + """Get words from a document. Non reversible since the document + is split on multiple characters, words are stripped of + special characters and characters are converted to lower case. 
+ Useful to compute ratios, like the stopwords ratio.""" + if sentencepiece_model_tok: + document_normalized = ModifyingDocuments.normalization( + document=document, + remove_non_printing_characters=True, + strip=True, + lower_case=True, + uniform_whitespace=True, + replace_digits_with_zeros=True, + replace_unicode_punctuation=True, + ) + words = ModifyingDocuments.tokenization( + document_normalized, sentencepiece_model_tok, join_on_whitespace=False + ) + else: + words = ModifyingDocuments.split_on_whitespace( + document, new_line=True, tab=True + ) + if lower_case: + words = [word.lower() for word in words] + if strip_characters: + words = [ModifyingDocuments.strip(word, strip_characters) for word in words] + words = ModifyingDocuments.remove_empty_el_from_list(words) + return words + + @staticmethod + def words_augmentation(words, group_size, join_char): + """Augment words, especially for Chinese (without a space between words) + and Vietnamese (with a space between syllables).""" + augmentation = [ + join_char.join(words[i : i + group_size]) + for i in range(len(words) - group_size + 1) + ] + return augmentation + + @staticmethod + def split_on_newline_tab_whitespace(document): + """First split on "\n", then on "\t", then on " ".""" + sentences = document.split("\n") + sentences = [sentence.split("\t") for sentence in sentences] + sentences = [ + [ + ModifyingDocuments.split_on_whitespace(subsentence) + for subsentence in sentence + ] + for sentence in sentences + ] + return sentences + + @staticmethod + def merge_on_whitespace_tab_newline(sentences): + """Invert the method split_on_newline_tab_whitespace. 
+ Removes concatenated separators.""" + sentences = [ + [" ".join(subsentence) for subsentence in sentence if subsentence] + for sentence in sentences + ] + sentences = ["\t".join(sentence) for sentence in sentences if sentence] + if not sentences: + return "" + document = "\n".join(sentences) + return document + + @staticmethod + def should_keep_word_with_incorrect_substrings( + word, strip_characters, incorrect_word_substrings + ): + word = ModifyingDocuments.strip(word, strip_characters) + should_keep = all( + [(i_substr not in word) for i_substr in incorrect_word_substrings] + ) + return should_keep + + @staticmethod + def remove_words_with_incorrect_substrings( + document, + strip_characters, + incorrect_word_substrings, + ): + sentences = ModifyingDocuments.split_on_newline_tab_whitespace(document) + sentences = [ + [ + [ + word + for word in subsentence + if ModifyingDocuments.should_keep_word_with_incorrect_substrings( + word, strip_characters, incorrect_word_substrings + ) + ] + for subsentence in sentence + ] + for sentence in sentences + ] + document = ModifyingDocuments.merge_on_whitespace_tab_newline(sentences) + return document + + @staticmethod + def should_keep_long_word(word, strip_characters, length_word_max_cutoff): + """If the word is too long but it contains only one + special character, it might be a concatenation of one word, + a punctuation, and another word, with no space between them. 
+ In this case, we give the word a pass.""" + if len(word) <= length_word_max_cutoff: + return True + word = ModifyingDocuments.strip(word, strip_characters) + if not word: # The word consisted only of strip characters + return False + if len(word) <= length_word_max_cutoff: + return True + return False + + def remove_long_words( + document, + strip_characters, + length_word_max_cutoff, + ): + sentences = ModifyingDocuments.split_on_newline_tab_whitespace(document) + sentences = [ + [ + [ + word + for word in subsentence + if ModifyingDocuments.should_keep_long_word( + word, + strip_characters, + length_word_max_cutoff, + ) + ] + for subsentence in sentence + ] + for sentence in sentences + ] + document = ModifyingDocuments.merge_on_whitespace_tab_newline(sentences) + return document + + @staticmethod + def modifying_documents( + document, + cond_uniform_whitespace, + cond_replace_unicode_punctuation, + cond_remove_words_with_incorrect_substrings, + strip_characters, + incorrect_word_substrings, + cond_remove_long_words, + length_word_max_cutoff, + ): + document = ModifyingDocuments.normalization( + document=document, + remove_non_printing_characters=False, + strip=True, + lower_case=False, + uniform_whitespace=cond_uniform_whitespace, + replace_digits_with_zeros=False, + replace_unicode_punctuation=cond_replace_unicode_punctuation, + ) + if cond_remove_words_with_incorrect_substrings: + document = ModifyingDocuments.remove_words_with_incorrect_substrings( + document, + strip_characters, + incorrect_word_substrings, + ) + if cond_remove_long_words: + document = ModifyingDocuments.remove_long_words( + document, + strip_characters, + length_word_max_cutoff, + ) + return document + + +class FunctionDatasetModifyingDocuments: + def __init__(self, lang_dataset_id): + self.lang_dataset_id = lang_dataset_id + self.param = LoadParameters.load_parameters(lang_dataset_id) + + def __call__(self, example): + example["text"] = ModifyingDocuments.modifying_documents( + 
document=example["text"], + cond_uniform_whitespace=self.param["cond_uniform_whitespace"], + cond_replace_unicode_punctuation=self.param[ + "cond_replace_unicode_punctuation" + ], + cond_remove_words_with_incorrect_substrings=self.param[ + "cond_remove_words_with_incorrect_substrings" + ], + strip_characters=self.param["strip_characters"], + incorrect_word_substrings=self.param["incorrect_word_substrings"], + cond_remove_long_words=self.param["cond_remove_long_words"], + length_word_max_cutoff=self.param["length_word_max_cutoff"], + ) + return example + + def __reduce__(self): + return (self.__class__, (self.lang_dataset_id,)) + + +class Filtering: + @staticmethod + def check_number_words( + document, + sentencepiece_model_tok, + strip_characters, + number_words_min_cutoff, + number_words_max_cutoff, + ): + words = ModifyingDocuments.get_words_from_document( + document, + sentencepiece_model_tok, + lower_case=False, + strip_characters=strip_characters, + ) + cond = (len(words) >= number_words_min_cutoff) and ( + len(words) <= number_words_max_cutoff + ) + return cond + + @staticmethod + def compute_character_repetition_ratio(document, character_repetition_length): + def get_freq_character_ngrams(document, n): + character_ngrams = [ + document[i : i + n] for i in range(len(document) - n + 1) + ] + freq_character_ngrams = {} + for character_ngram in character_ngrams: + freq_character_ngrams[character_ngram] = ( + freq_character_ngrams.get(character_ngram, 0) + 1 + ) + return freq_character_ngrams + + freq_character_ngrams = get_freq_character_ngrams( + document, character_repetition_length + ) + if len(freq_character_ngrams) == 0: + return 0 + freq_character_ngrams = list(freq_character_ngrams.values()) + freq_character_ngrams = sorted(freq_character_ngrams, reverse=True) + val_less_than_one = len([el for el in freq_character_ngrams if el > 1]) + num_rep_character_ngrams = min( + int(np.sqrt(len(freq_character_ngrams))), + len(freq_character_ngrams) - 
val_less_than_one, + ) + character_repetition_ratio = sum( + freq_character_ngrams[:num_rep_character_ngrams] + ) / sum(freq_character_ngrams) + return character_repetition_ratio + + @staticmethod + def check_character_repetition_removal( + document, + character_repetition_length, + character_repetition_max_cutoff, + ): + character_repetition_ratio = Filtering.compute_character_repetition_ratio( + document, character_repetition_length + ) + cond = character_repetition_ratio <= character_repetition_max_cutoff + return cond + + @staticmethod + def compute_word_repetition_ratio( + document, sentencepiece_model_tok, strip_characters, word_repetition_length + ): + def get_freq_word_ngrams( + document, sentencepiece_model_tok, strip_characters, n + ): + words = ModifyingDocuments.get_words_from_document( + document, + sentencepiece_model_tok, + lower_case=True, + strip_characters=strip_characters, + ) + word_ngrams = [ + " ".join(words[i : i + n]) for i in range(len(words) - n + 1) + ] + freq_word_ngrams = {} + for word_ngram in word_ngrams: + freq_word_ngrams[word_ngram] = freq_word_ngrams.get(word_ngram, 0) + 1 + return freq_word_ngrams + + freq_word_ngrams = get_freq_word_ngrams( + document, sentencepiece_model_tok, strip_characters, word_repetition_length + ) + if len(freq_word_ngrams) == 0: + return 0 + freq_word_ngrams = list(freq_word_ngrams.values()) + word_repetition_ratio = sum( + freq for freq in freq_word_ngrams if freq > 1 + ) / sum(freq_word_ngrams) + return word_repetition_ratio + + @staticmethod + def check_word_repetition_removal( + document, + sentencepiece_model_tok, + strip_characters, + word_repetition_length, + word_repetition_max_cutoff, + ): + word_repetition_ratio = Filtering.compute_word_repetition_ratio( + document, sentencepiece_model_tok, strip_characters, word_repetition_length + ) + cond = word_repetition_ratio <= word_repetition_max_cutoff + return cond + + @staticmethod + def compute_special_characters_ratio(document, special_characters): 
+ if len(document) == 0: + return 0 + special_characters_ratio = len( + [char for char in document if char in special_characters] + ) / len(document) + return special_characters_ratio + + @staticmethod + def check_special_characters( + document, + special_characters, + special_characters_max_cutoff, + ): + special_characters_ratio = Filtering.compute_special_characters_ratio( + document, special_characters + ) + cond = special_characters_ratio <= special_characters_max_cutoff + return cond + + @staticmethod + def compute_stopwords_ratio( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + stopwords, + ): + words = ModifyingDocuments.get_words_from_document( + document, + sentencepiece_model_tok, + lower_case=True, + strip_characters=strip_characters, + ) + if not words: + return 0 + augmentation = [] + if cond_words_augmentation: + augmentation = [ + ModifyingDocuments.words_augmentation( + words, group_size, words_augmentation_join_char + ) + for group_size in words_augmentation_group_sizes + ] + augmentation = [word for augm in augmentation for word in augm] + stopwords_ratio = len( + [word for word in words + augmentation if word in stopwords] + ) / len(words) + if stopwords_ratio > 1.0: + stopwords_ratio = 1.0 + return stopwords_ratio + + @staticmethod + def check_stopwords( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + stopwords, + stopwords_min_cutoff, + ): + cond = True + if stopwords: + stopwords_ratio = Filtering.compute_stopwords_ratio( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + stopwords, + ) + cond = stopwords_ratio >= stopwords_min_cutoff + return cond + + @staticmethod + def compute_flagged_words_ratio( + document, + 
sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + flagged_words, + ): + words = ModifyingDocuments.get_words_from_document( + document, + sentencepiece_model_tok, + lower_case=True, + strip_characters=strip_characters, + ) + if not words: + return 0 + augmentation = [] + if cond_words_augmentation: + augmentation = [ + ModifyingDocuments.words_augmentation( + words, group_size, words_augmentation_join_char + ) + for group_size in words_augmentation_group_sizes + ] + augmentation = [word for augm in augmentation for word in augm] + flagged_words_ratio = len( + [word for word in words + augmentation if word in flagged_words] + ) / len(words) + if flagged_words_ratio > 1.0: + flagged_words_ratio = 1.0 + return flagged_words_ratio + + @staticmethod + def check_flagged_words( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + flagged_words, + flagged_words_max_cutoff, + ): + cond = True + if flagged_words: + flagged_words_ratio = Filtering.compute_flagged_words_ratio( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + flagged_words, + ) + cond = flagged_words_ratio <= flagged_words_max_cutoff + return cond + + @staticmethod + def compute_lang_id_pred_score(document, model_lang_id): + document = document.lower().replace("\n", " ") + pred = model_lang_id.predict(document) + lang_pred_fasttext_id = pred[0][0].replace("__label__", "") + score_pred = pred[1][0] + lang_pred_dataset_id = langs_id.loc[ + langs_id["fasttext_id"] == lang_pred_fasttext_id, "dataset_id" + ] + if len(lang_pred_dataset_id) > 0: + lang_pred_dataset_id = lang_pred_dataset_id.iloc[0] + else: + lang_pred_dataset_id = "unknown" + return lang_pred_dataset_id, score_pred + + @staticmethod + def check_lang_id( + 
document, + lang_dataset_id, + model_lang_id, + lang_id_min_cutoff, + ): + cond = True + if model_lang_id: + lang_pred_dataset_id, score_pred = Filtering.compute_lang_id_pred_score( + document, model_lang_id + ) + cond = (lang_pred_dataset_id == lang_dataset_id) and ( + score_pred >= lang_id_min_cutoff + ) + return cond + + @staticmethod + def compute_perplexity_score(document, sentencepiece_model, kenlm_model): + document = ModifyingDocuments.normalization( + document=document, + remove_non_printing_characters=True, + strip=True, + lower_case=False, + uniform_whitespace=True, + replace_digits_with_zeros=True, + replace_unicode_punctuation=True, + ) + document = ModifyingDocuments.tokenization( + document, sentencepiece_model, join_on_whitespace=True + ) + doc_log_score, doc_length = 0, 0 + for line in document.split("\n"): + log_score = kenlm_model.score(line) + length = len(line.split()) + 1 + doc_log_score += log_score + doc_length += length + pp_score = 10.0 ** (-doc_log_score / doc_length) + pp_score = round(pp_score, 1) + return pp_score + + @staticmethod + def check_perplexity( + document, + sentencepiece_model, + kenlm_model, + perplexity_max_cutoff, + ): + cond = True + if kenlm_model: + score = Filtering.compute_perplexity_score( + document, sentencepiece_model, kenlm_model + ) + cond = score <= perplexity_max_cutoff + return cond + + @staticmethod + def filtering( + document, + cond_check_number_words, + sentencepiece_model_tok, + strip_characters, + number_words_min_cutoff, + number_words_max_cutoff, + cond_check_character_repetition_removal, + character_repetition_length, + character_repetition_max_cutoff, + cond_check_word_repetition_removal, + word_repetition_length, + word_repetition_max_cutoff, + cond_check_special_characters, + special_characters, + special_characters_max_cutoff, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + cond_check_stopwords, + stopwords, + stopwords_min_cutoff, + 
cond_check_flagged_words, + flagged_words, + flagged_words_max_cutoff, + cond_check_lang_id, + lang_dataset_id, + model_lang_id, + lang_id_min_cutoff, + cond_check_perplexity, + sentencepiece_model, + kenlm_model, + perplexity_max_cutoff, + ): + if cond_check_number_words: + if not Filtering.check_number_words( + document, + sentencepiece_model_tok, + strip_characters, + number_words_min_cutoff, + number_words_max_cutoff, + ): + return False + if cond_check_character_repetition_removal: + if not Filtering.check_character_repetition_removal( + document, + character_repetition_length, + character_repetition_max_cutoff, + ): + return False + if cond_check_word_repetition_removal: + if not Filtering.check_word_repetition_removal( + document, + sentencepiece_model_tok, + strip_characters, + word_repetition_length, + word_repetition_max_cutoff, + ): + return False + if cond_check_special_characters: + if not Filtering.check_special_characters( + document, + special_characters, + special_characters_max_cutoff, + ): + return False + if cond_check_stopwords: + if not Filtering.check_stopwords( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + stopwords, + stopwords_min_cutoff, + ): + return False + if cond_check_flagged_words: + if not Filtering.check_flagged_words( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + flagged_words, + flagged_words_max_cutoff, + ): + return False + if cond_check_lang_id: + if not Filtering.check_lang_id( + document, + lang_dataset_id, + model_lang_id, + lang_id_min_cutoff, + ): + return False + if cond_check_perplexity: + if not Filtering.check_perplexity( + document, + sentencepiece_model, + kenlm_model, + perplexity_max_cutoff, + ): + return False + return True + + +class FunctionDatasetFiltering: + def __init__( + self, + 
lang_dataset_id, + path_fasttext_model, + path_sentencepiece_model, + path_kenlm_model, + ): + self.lang_dataset_id = lang_dataset_id + self.path_fasttext_model = path_fasttext_model + self.path_sentencepiece_model = path_sentencepiece_model + self.path_kenlm_model = path_kenlm_model + + self.param = LoadParameters.load_parameters(lang_dataset_id) + self.stopwords = LoadParameters.load_stopwords(lang_dataset_id) + self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id) + self.model_lang_id = LoadParameters.load_model_lang_id( + lang_dataset_id, path_fasttext_model + ) + self.sentencepiece_model = LoadParameters.load_sentencepiece_model( + lang_dataset_id, path_sentencepiece_model + ) + self.sentencepiece_model_tok = ( + self.sentencepiece_model if self.param["tokenization"] else None + ) + self.kenlm_model = LoadParameters.load_kenlm_model( + lang_dataset_id, path_kenlm_model + ) + + def __call__(self, example): + keep_example = Filtering.filtering( + document=example["text"], + cond_check_number_words=self.param["cond_check_number_words"], + sentencepiece_model_tok=self.sentencepiece_model_tok, + strip_characters=self.param["strip_characters"], + number_words_min_cutoff=self.param["number_words_min_cutoff"], + number_words_max_cutoff=self.param["number_words_max_cutoff"], + cond_check_character_repetition_removal=self.param[ + "cond_check_character_repetition_removal" + ], + character_repetition_length=self.param["character_repetition_length"], + character_repetition_max_cutoff=self.param[ + "character_repetition_max_cutoff" + ], + cond_check_word_repetition_removal=self.param[ + "cond_check_word_repetition_removal" + ], + word_repetition_length=self.param["word_repetition_length"], + word_repetition_max_cutoff=self.param["word_repetition_max_cutoff"], + cond_check_special_characters=self.param["cond_check_special_characters"], + special_characters=self.param["special_characters"], + 
special_characters_max_cutoff=self.param["special_characters_max_cutoff"], + cond_words_augmentation=self.param["cond_words_augmentation"], + words_augmentation_group_sizes=self.param["words_augmentation_group_sizes"], + words_augmentation_join_char=self.param["words_augmentation_join_char"], + cond_check_stopwords=self.param["cond_check_stopwords"], + stopwords=self.stopwords, + stopwords_min_cutoff=self.param["stopwords_min_cutoff"], + cond_check_flagged_words=self.param["cond_check_flagged_words"], + flagged_words=self.flagged_words, + flagged_words_max_cutoff=self.param["flagged_words_max_cutoff"], + cond_check_lang_id=self.param["cond_check_lang_id"], + lang_dataset_id=self.lang_dataset_id, + model_lang_id=self.model_lang_id, + lang_id_min_cutoff=self.param["lang_id_min_cutoff"], + cond_check_perplexity=self.param["cond_check_perplexity"], + sentencepiece_model=self.sentencepiece_model, + kenlm_model=self.kenlm_model, + perplexity_max_cutoff=self.param["perplexity_max_cutoff"], + ) + return keep_example + + def __reduce__(self): + return ( + self.__class__, + ( + self.lang_dataset_id, + self.path_fasttext_model, + self.path_sentencepiece_model, + self.path_kenlm_model, + ), + ) + + +class DatasetFiltering: + def __init__( + self, + dataset, + lang_dataset_id, + path_fasttext_model, + path_sentencepiece_model, + path_kenlm_model, + num_proc, + path_dir_save_dataset, + ): + self.ds = dataset + self.lang_dataset_id = lang_dataset_id + self.path_fasttext_model = path_fasttext_model + self.path_sentencepiece_model = path_sentencepiece_model + self.path_kenlm_model = path_kenlm_model + self.num_proc = num_proc + self.path_dir_save_dataset = path_dir_save_dataset + + def modifying_documents(self): + func_dataset_modifying_documents = FunctionDatasetModifyingDocuments( + self.lang_dataset_id + ) + self.ds = self.ds.map(func_dataset_modifying_documents, num_proc=self.num_proc) + + def filtering(self): + func_dataset_filtering = FunctionDatasetFiltering( + 
self.lang_dataset_id, + self.path_fasttext_model, + self.path_sentencepiece_model, + self.path_kenlm_model, + ) + self.ds = self.ds.filter(func_dataset_filtering, num_proc=self.num_proc) + + def save_dataset(self): + pathlib.Path(self.path_dir_save_dataset).mkdir(parents=True, exist_ok=True) + path_dir_save_dataset = pathlib.PurePath( + self.path_dir_save_dataset, self.lang_dataset_id + ) + pathlib.Path(path_dir_save_dataset).mkdir(parents=True, exist_ok=True) + self.ds.save_to_disk(path_dir_save_dataset) diff --git a/flagged_words.py b/flagged_words.py new file mode 100644 index 0000000000000000000000000000000000000000..dcec0e8243ecc8c9b9e65014d05812d8ac8fdc4b --- /dev/null +++ b/flagged_words.py @@ -0,0 +1,1055 @@ +# Merge +# https://github.com/zacanger/profane-words +# and +# https://github.com/thisandagain/washyourmouthoutwithsoap/blob/develop/data/build.json +# and +# https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words + + +english_flagged_words = [ + "anal", + "bareback", + "bbw", + "bdsm", + "blowjob", + "blowjobs", + "brazzers", + "bukkake", + "camgirl", + "camwhore", + "cocksucking", + "cougar", + "creampie", + "cuckold", + "cum", + "cumming", + "cums", + "cumshot", + "cumshots", + "cumslut", + "cunnilingus", + "deepthroat", + "deepthroating", + "dildo", + "dildos", + "dogging", + "doggystyle", + "dominatrix", + "erotic", + "fellatio", + "femdom", + "fingering", + "fisting", + "footjob", + "gangbang", + "handjob", + "hentai", + "horney", + "horniest", + "horny", + "jism", + "jizz", + "masterbating", + "masturbate", + "masturbating", + "masturbation", + "milf", + "orgies", + "orgy", + "pegging", + "porn", + "pornhub", + "porno", + "pornos", + "pornstar", + "pornstars", + "redtube", + "rimming", + "slutty", + "squirting", + "strapon", + "threesome", + "vibrator", + "xhamster", + "xnxx", + "xvideos", + "xxx", + "youporn", +] + + +flagged_words = { + "ar": english_flagged_words + + [ + "إباحي", + "احتلام", + "است", + "استمناء", + 
"اغتصاب", + "أورغازم", + "إيروتيك", + "أير", + "بز", + "بزاز", + "بظر", + "بورن", + "بيضان", + "مص", + "ثدي", + "جماع", + "جنس", + "حلمة", + "خلاعة", + "خنثي", + "خول", + "دعارة", + "زب", + "سحاق", + "سحاقية", + "سكس", + "سيكس", + "شاذ", + "شبق", + "شرج", + "شرموطة", + "شهواني", + "شهوة", + "طيز", + "عادة السرية", + "عاهرة", + "عرص", + "فاسقة", + "فرج", + "قحبة", + "قضيب", + "كس", + "لحس", + "لعق", + "لواط", + "لوطي", + "مؤخرة", + "متناك", + "متناكة", + "مومس", + "مثير", + "مص", + "مضاجعة", + "مفلقسة", + "مني", + "مهتاج", + "نشوة", + "نكاح", + "نيك", + ], + "ca": english_flagged_words + + [ + "cagarro", + "cagarros", + "cipote", + "cipotes", + "collons", + "colló", + "consolador", + "consoladors", + "cony", + "conys", + "corre's", + "corre't", + "corregut", + "cunillingus", + "córrer-se", + "escorreguda", + "escorregudes", + "escorregut", + "escrot", + "escrots", + "escórre's", + "escórre't", + "escórrer-se", + "mamada", + "mamadera", + "mamaderes", + "mamades", + "masturba", + "masturbacions", + "masturbació", + "masturbant", + "masturbar", + "masturbar-se", + "masturbat", + "masturbats", + "masturbes", + "orgasme", + "orgasmes", + "ou", + "ous", + "palla", + "palles", + "pornografia", + "semen", + "semens", + "verga", + "vergues", + "xxx", + ], + "en": english_flagged_words, + "es": english_flagged_words + + [ + "chupar el coño", + "chupar la concha", + "chupar la polla", + "chupar la verga", + "comer el coño", + "comer la concha", + "comer la polla", + "comer la verga", + "coprofagía", + "correrse", + "cunillingus", + "fagging", + "felación", + "felching", + "follada", + "follador de culo", + "folladores", + "fudge packer", + "hacer una paja", + "hacerse una paja", + "hore", + "kock", + "macizorra", + "madre folladora", + "mamada", + "perro follador", + "pisser", + "pornografía", + "sado", + "sadomasoquismo", + "sadomasoquista", + "sexo anal", + "skank", + "smegma", + "x clasificado", + ], + "eu": english_flagged_words + [], + "fr": english_flagged_words + + [ + 
"baiseurs", + "baiseur", + "baiseuse", + "baiseuses", + "branlette", + "branlettes", + "branleuse", + "branleuses", + "cunillingus", + "cunilingus", + "enculée", + "enculées", + "enculation", + "enculations", + "enculement", + "enculements", + "fellation", + "fellations", + "porno", + "pornos", + "pornographie", + "pornographique", + "pornographiques", + "salope", + "salopes", + "suceuse", + "suceuses", + "xxx", + ], + "ha": english_flagged_words + + [ + "bf", + "bura", + "burar", + "cin gindi", + "duri", + "durin", + "gindi", + "gindin", + "guntsu", + "guntsun", + "karuwa", + "karuwai", + "karuwar", + "maniyyi", + "maɗigo", + "maɗugo", + "nonuwa", + "shan gindi", + "tsuliya", + "tsuliyariskanci", + "ɗuwaiwai", + "ɗuwaiwan", + "ɗuwawu", + "ɗuwawun", + ], + "hi": english_flagged_words + + [ + "अंडकोश की थैली", + "एक्स रेटेड", + "ओगाज़्म", + "कामोद्दीपक चित्र", + "कालीन का चूरा", + "कून", + "कॉक", + "गेंद का थैला", + "चाकलेट का रंग", + "चूची", + "चूतड़", + "झटका बंद", + "ठगना पैकर", + "डिल्डो", + "नितंब", + "पिछाड़ी", + "पीड़न कामुक", + "पॉर्न", + "फटना", + "फूहड़", + "बट", + "बहुत मदहोश", + "बेल अंत", + "भगवान-शापित", + "भगशेफ", + "माँ कमीने", + "मुखमैथुन", + "मुर्गा चूसने वाला", + "रक्तरंजित", + "लेबिया", + "वहशी", + "वहशीता", + "वैंग", + "शिश्नमल", + "संभोग सुख", + "सह शॉट", + "सींग का बना हुआ", + "होर", + "घपा घप", + "चुदाई", + "चुदक्कड़", + ], + "id": english_flagged_words + + [ + "bokep", + "coli", + "colmek", + "grepe", + "horni", + "janda", + "jembut", + "jilat memek", + "jilmek", + "kontol", + "masturbasi", + "memek", + "ngentot", + "ngewe", + "peju", + "pepek", + "pornografi", + "sange", + "sepong", + "tusbol", + ], + "kn": english_flagged_words + + [ + "ಗರ್ಭಪಾತ", + "ಗುದ", + "ಗುದದ್ವಾರ", + "ಕತ್ತೆ", + "ಆಶ್-ಫಕರ್", + "ಅಸ್ಹೋಲ್", + "ಅಸೋಲೆಸ್", + "ಬಾಲ್ಬಾಗ್", + "ಚೆಂಡುಗಳು", + "ಬಾಸ್ಟರ್ಡ್", + "ಬೆಲೆಂಡ್", + "ಮೃದ್ವಂಗಿ", + "ಪ್ರಾಣಿಜನ್ಯತೆ", + "ಬಿಚ್", + "ಬಿಟ್ಚಿಸ್", + "ಬೆಚಿಂಗ್", + "ರಕ್ತಸಿಕ್ತ", + "ಬ್ಲೋಜಾಬ್", + "ಬೊಲ್ಲೊಕ್", + "ಕುರುಚಲು ಗಿಡ", + "ಬೂಬಿಗಳು", + "ಸ್ತನಗಳನ್ನು", + 
"ಬುಕೆಟಾ", + "ತಿಕ", + "ಬಟ್", + "ಕಾರ್ಪೆಟ್ ಮಂಚರ್", + "ಚಿಂಕ್", + "ಸಿಪಾ", + "ಚಂದ್ರನಾಡಿ", + "ಕೋಳಿ", + "ಕೋಳಿ ಸಕ್ಕರ್", + "ಕಾಕ್ಸ್", + "ಕೂನ್", + "ಅಮೇಧ್ಯ", + "ಕಮ್", + "ಕಮ್ಶಾಟ್", + "ಕುನಿಲ್ಲಸ್", + "ಕಂಟ್", + "ಡ್ಯಾಮ್", + "ಡಿಕ್", + "ದ್ವಿಧ್ರುವಿ", + "dildos", + "ಡಿಂಕ್", + "ನಾಯಿ-ಫಕರ್", + "ಡಚೆ", + "ಡೈಕ್", + "ಹೊರಹೊಮ್ಮಿಸು", + "ಸ್ಫೂರ್ತಿ", + "ಎಜಾಕ್ಯುಲೇಟ್ಸ್", + "ಇಜಲಲೇಟಿಂಗ್", + "ಉದ್ಗಾರ", + "ತಮಾಷೆ", + "ಮಂದಗತಿ", + "ಮಬ್ಬು", + "fagots", + "ಫ್ಯಾನಿ", + "ಹೊಡೆತ", + "ಪತನ", + "ಚಾಚುಪಟ್ಟಿ", + "ಫಕ್", + "ನಾಶವಾಗಿದ್ದನು", + "ಫಕರ್", + "fuckers", + "ಫಕಿಂಗ್", + "ಫಕಿಂಗ್ಸ್", + "ಇಷ್ಟಪಡುತ್ತಾನೆ", + "ಮಿಠಾಯಿ ಪ್ಯಾಕರ್", + "ದೇವರನ್ನು ಹಾನಿಗೊಳಗಾಯಿತು", + "ಗಾಡ್ಡಮ್", + "ನರಕ", + "ಹೋರ್", + "ಮೊನಚಾದ", + "ಜರ್ಕ್-ಆಫ್", + "ಕೋಕ್", + "ಯೋನಿಯ", + "ಕಾಮ", + "ಕಾಮುಕ", + "ಮಾಸೋಚಿಸ್ಟ್", + "ಹಸ್ತಮೈಥುನ ಮಾಡು", + "ತಾಯಿ ಫಕರ್", + "ನಾಜಿ", + "ನಿಗರ್", + "ನಿಗ್ಗರ್ಗಳು", + "ಒರಾಸಿಮ್", + "ಪರಾಕಾಷ್ಠೆ", + "ಪರಾಕಾಷ್ಠೆಗಳನ್ನು", + "ಪೆಕರ್", + "ಶಿಶ್ನ", + "ಮೂತ್ರ ವಿಸರ್ಜಿಸು", + "ನಿರುತ್ಸಾಹಗೊಂಡಿದೆ", + "ಪಿಸರ್", + "ಮೂತ್ರಪಿಂಡಗಳು", + "pissing", + "ಪಿಸ್ಸಾಫ್", + "ಪೂಪ್", + "ಅಶ್ಲೀಲತೆ", + "ಅಶ್ಲೀಲ", + "ಚುಚ್ಚು", + "ಪ್ರಿಕ್ಸ್", + "ಪಬ್", + "ಪುಸಿಗಳು", + "ಪುಸಿ", + "ಅತ್ಯಾಚಾರ", + "ಅತ್ಯಾಚಾರಿ", + "ಗುದನಾಳದ", + "ರಿಟಾರ್ಡ್", + "ಹಚ್ಚುವುದು", + "ದುಃಖಗಾರ", + "ತಿರುಗಿಸುವುದು", + "ಸ್ಕ್ರೋಟಮ್", + "ವೀರ್ಯ", + "ಲೈಂಗಿಕತೆ", + "ಶಾಗ್", + "ಶಾಗ್ಗಿಂಗ್", + "ಶೆಮೇಲ್", + "ಶಿಟ್", + "ಷೈಟ್", + "ಶಿಟ್ಸ್", + "shitted", + "ಅಲುಗಾಡುವಿಕೆ", + "ಅಸಹ್ಯ", + "ಸ್ಕಾಂಕ್", + "ಸೂಳೆ", + "ಸ್ಲಟ್ಗಳು", + "ಸ್ಮೆಗ್ಮಾ", + "ಕೊಳೆತ", + "ಸ್ನ್ಯಾಚ್", + "ಮಗ-ಆಫ್-ಬಿಚ್", + "spac", + "ಉಬ್ಬು", + "ವೃಷಣ", + "ಟಿಟ್", + "ಚೇಕಡಿ ಹಕ್ಕಿಗಳು", + "turd", + "ಯೋನಿ", + "ವಯಾಗ್ರ", + "ವಾಂಗ್", + "ಮುಷ್ಕರ", + "x ರೇಟೆಡ್", + "xxx", + ], + "ml": english_flagged_words + + [ + "ഗർഭഛിദ്രം", + "വിശപ്പ്", + "മലദ്വാരം", + "കഴുത", + "അസി ഫക്കർ", + "കഴുതകളെ", + "ആസ്ഹോൾ", + "അശ്ളീലങ്ങൾ", + "ബോൾബാഗ്", + "പന്തുകൾ", + "തന്തയില്ലാത്തവൻ", + "ബെല്ലെൻഡ്", + "മൃഗീയമായ", + "മൃഗീയത", + "ബിച്ച്", + "ബിച്ചുകൾ", + "ബിപിഡിംഗ്", + "രക്തരൂക്ഷിതമായ", + "ആശ്വാസം", + "ബലോക്ക്", + "ബോബ്", + "പൂക്കൾ", + "സ്തനങ്ങൾ", + "ബ്യൂട്ടാ", + "ബം", + "മയക്കുമരുന്ന്", + "പരവതാനി മാൻച്ചർ", + "ചുംബ്", + "സിപാ", + "ക്ലോറിസിസ്", + 
"കോക്ക്", + "കോക്ക് സക്കർ", + "കോക്സ്", + "കോൺ", + "ക്രാപ്പ്", + "ശുക്ലം", + "പുരുഷാരം", + "സി", + "മുഷിഞ്ഞ", + "കഷ്ടം", + "ഡിക്ക്", + "ഡിൽഡോ", + "dildos", + "ഡൈൻ", + "നായ-ഫക്കർ", + "ഡച്ച്", + "ഡൈകെ", + "ശമിപ്പിക്കുക", + "മോഷ്ടിച്ചു", + "വികാരങ്ങൾ", + "വിരസത", + "മടി", + "ക്ഷീണിപ്പിക്കുക", + "fagot", + "വഞ്ചന", + "ഫാനി", + "വേദന", + "flange", + "ഊമ്പി", + "സംഭോഗം ചെയ്യുക", + "ഫക്കർ", + "നർമ്മം", + "ഫഡ്ജ് പാക്കർ", + "ദൈവം-കൊള്ളിത", + "ഗോഡ്ഡം", + "നരകം", + "വയ്ക്കുക", + "വൃത്തികെട്ട", + "ജെർക് ഓഫ്", + "കിക്ക്", + "ലാബിയ", + "മോഹം", + "മോഹഭംഗം", + "മാസോച്ചിസ്റ്റ്", + "സ്വയംഭോഗം ചെയ്യുക", + "അമ്മ ഫക്കർ", + "നാസി", + "നിഗർ", + "മയക്കുമരുന്നുകൾ", + "രതിമൂർച്ഛ", + "പെക്കർ", + "ലിംഗം", + "മൂത്രമൊഴിക്കുക", + "കുഴഞ്ഞുവീഴുന്നു", + "പിസ്സർ", + "പിസ്സകൾ", + "pissing", + "പിസ്സോഫ്", + "poop", + "അശ്ലീലം", + "അശ്ലീലത", + "പ്രാവി", + "വിസർജ്യങ്ങൾ", + "പ്യൂബ്", + "pussies", + "pussy", + "ബലാൽസംഗം", + "ബലാത്സംഗം", + "മലാശയം", + "തുടരുക", + "റിമ്മിംഗ്", + "സചിസ്റ്റ്", + "വഞ്ചി", + "പുല്ല്", + "ബീജം", + "ശവം", + "ഷാഗിംഗ്", + "അവൾ", + "ഷീറ്റ്", + "ഷെയ്റ്റ്", + "shits", + "തിന്നിട്ടില്ല", + "ഷോർട്ട്", + "ഷൈറ്റി", + "സ്കാൻ", + "മന്ദഹസരം", + "സ്നെഗമാ", + "പുഞ്ചിരി", + "പിടിക്കുക", + "വെറുക്കപ്പെട്ടയാൾ", + "സ്പെയ്ക്", + "തുളച്ച്", + "വൃഷണം", + "പേ", + "ടിത്ത്", + "കുഴപ്പമില്ല", + "യോനി", + "വരാഗ്ര", + "വാൽവ", + "വാങ്", + "വാൻ", + "വേശ്യ", + "x റേറ്റുചെയ്തു", + "xxx", + ], + "mr": english_flagged_words + + [ + "गर्भपात", + "गुदा", + "गाढव", + "गांडुळ", + "asses", + "asshole", + "assholes", + "ballbag", + "चेंडू", + "बॅस्टर्ड", + "बेलेंड", + "बेस्टियल", + "प्राण्यांबरोबर", + "कुत्री", + "बिट्स", + "खूनी", + "blowjob", + "बोलोक", + "बोब", + "स्तन", + "बसीटा", + "बम", + "बट", + "कार्पेट मुन्चर", + "चिंक", + "सिपा", + "क्लिटोरिस", + "मुर्ख", + "मांसाहारी", + "कॉक्स", + "कॉनन", + "बकवास", + "सह", + "cumshot", + "कनिलिंगस", + "कांट", + "धिक्कार", + "डिक", + "dildo", + "डिल्डो", + "डंक", + "duche", + "डाईक", + "उद्गार", + "उत्साही", + "ejaculates", + "उत्सुकता", + "स्खलन", + "फॅग", + "फॅगिंग", + 
"फॅगॉट", + "फॅगॉट्स", + "फॅनी", + "फेलिंग", + "फॅलेटीओ", + "निकला", + "fucked", + "गुप्तचर", + "fuckers", + "fucking", + "fuckings", + "fucks", + "फडगे पॅकर", + "देव-शापित", + "देव", + "नरक", + "होरे", + "शिंग", + "झटका बंद", + "कॉक", + "लॅबिया", + "वासना", + "मासोचिस्ट", + "हस्तमैथुन करा", + "आई माकड", + "नाझी", + "निगर", + "निगार", + "ऑर्गॅसिम", + "संभोग", + "orgasms", + "चापटी", + "पुरुषाचे जननेंद्रिय", + "पेशी", + "pissed", + "पिसर", + "pisses", + "पिसिंग", + "पिसोफ", + "घाट", + "अश्लील", + "पोर्नोग्राफी", + "मुरुम", + "प्रिक्स", + "प्यूब", + "pussies", + "मांजर", + "बलात्कार", + "गुदाशय", + "मंद", + "rimming", + "दुःखी", + "screwing", + "स्क्रोटम", + "वीर्य", + "लिंग", + "शेग", + "shagging", + "शेमले", + "विचित्र", + "shite", + "shits", + "shitted", + "shitting", + "shitty", + "घाणेरडा", + "फट", + "sluts", + "सुगंध", + "स्मट", + "छेडछाड", + "मुलगा-एक-कुत्री", + "spac", + "तिरस्कार", + "परीक्षक", + "शीर्षक", + "टिट", + "टर्ड", + "योनी", + "वियाग्रा", + "वल्वा", + "वांग", + "विंक", + "वेश्या", + "एक्स रेट केले", + "xxx", + ], + "pt": english_flagged_words + + [ + "balalao", + "bate uma", + "beijo grego", + "boceta", + "boquete", + "buceta", + "caralho", + "chochota", + "coito", + "cona", + "consolo", + "corno", + "cu", + "dar a bunda", + "dar o rabo", + "dildo", + "dildos", + "esporrar", + "estrovenga", + "felação", + "filho da puta", + "filhos da puta", + "gozada", + "jeba", + "perereca", + "pica", + "piru", + "porno", + "pornografia", + "pornô", + "porra", + "prostituta", + "pube", + "punheta", + "punheteiro", + "putaria", + "queca", + "sexo", + "siririca", + "tesão", + "trepada", + "verga", + "vibrador", + "xana", + "xochota", + "xoxota", + ], + "ta": english_flagged_words + + [ + "ஓதா", + "ஒத்தா", + "புண்டை", + "ஒம்மாளே", + "பக்கி", + "கூமுட்டை", + "கருமம்", + "சனியன்", + "கஸ்மாலம்", + "சூத்து", + ], + "te": english_flagged_words + + [ + "గర్భస్రావం", + "అంగ", + "పాయువు", + "గాడిద", + "గాడిద-fucker", + "asses", + "assholes", + "బాల్బ్యాగ్", + "బంతుల్లో", + 
"బాస్టర్డ్", + "బెల్లెండ్", + "మృగ", + "బెస్టియాలిటీ", + "బిచ్", + "bitches", + "బిట్చింగ్", + "బ్లడీ", + "blowjob", + "బోల్లక", + "బూబ్", + "వక్షోజాలను", + "ఛాతీ", + "buceta", + "బం", + "బట్", + "కార్పెట్ ముంచర్", + "చింక్", + "cipa", + "స్త్రీగుహ్యాంకురము", + "ఆత్మవిశ్వాసం", + "కాక్-సక్కర్", + "కాక్స్", + "కూన్", + "చెత్త", + "కం", + "cumshot", + "క్యునిల్లింగస్", + "కంట్", + "తిట్టు", + "డిక్", + "లైంగిక సంతృప్తి కోసం స్త్రీలు ఉపయోగించే పురుషాంగము వంటి పరికరము", + "డిల్డోస్", + "dink", + "కుక్క-fucker", + "డూష్", + "డైక్", + "స్ఖలించు", + "ఎజాక్యులేటెడ్", + "ఎజాక్యులేట్స్", + "ఎరాక్యులేటింగ్", + "స్ఖలనం", + "నవుకరు", + "ఫాగ్గింగ్", + "ఫాగాట్", + "ఫగాట్స్", + "fanny", + "ఫెల్చింగ్", + "కుడుచుట", + "అచ్చు", + "ఫక్", + "ఇబ్బంది పెట్టాడు", + "fucker", + "ఫకర్స్", + "ఫకింగ్", + "ఫకింగ్స్", + "ఫక్స్", + "ఫడ్జ్ ప్యాకర్", + "దేవతలా మంచిది", + "గాడ్డామ్", + "నరకం", + "హోర్", + "horny", + "జెర్క్-ఆఫ్", + "కాక్", + "పెదవి", + "కామం", + "మనసు పడ్డట్లు చిత్రించారు", + "masochist", + "హస్తప్రయోగం", + "తల్లి ఫెకర్", + "నాజీ", + "నిగ్గర్", + "నిగ్గర్స్", + "ఆర్గాసిమ్", + "స్కలనం", + "orgasms", + "pecker", + "పురుషాంగం", + "విసర్జన", + "pissed", + "పిస్సర్", + "పిస్సీస్", + "పిస్సింగ్", + "పిస్సాఫ్", + "poop", + "శృంగార", + "పోర్నో", + "అశ్లీల", + "బుడతడు", + "ప్రిక్స్", + "ప్యూబ్", + "pussies", + "పుస్సీ", + "రేప్", + "ఉన్నప్పటికీ బలాత్కారం", + "పురీషనాళం", + "రిటార్డ్", + "రిమ్మింగ్", + "పీడన కాముకత", + "screwing", + "స్క్రోటమ్", + "వీర్యం", + "సెక్స్", + "బొచ్చు", + "షగ్గింగ్", + "షీమేల్", + "ఒంటి", + "షైట్", + "షిట్స్", + "షిట్టెడ్", + "షిట్టింగ్", + "shitty", + "స్కాన్క్", + "నీతి", + "స్లట్స్", + "శిశ్న", + "స్మట్", + "స్నాచ్", + "ఒక బిచ్ కుమారుడు ఆఫ్", + "spac", + "స్పంక్", + "వృషణాలు", + "తునక", + "టిట్స్", + "టిట్", + "turd", + "యోని", + "వయాగ్రా", + "జననాంగం", + "వాంగ్", + "వ్యాంక్", + "వేశ్య", + "x రేట్", + "xxx", + ], + "vi": english_flagged_words + + [ + "cặc", + "lồn", + "địt", + "buồi", + "đụ", + "nứng", + "phò", + "bú lol", + "bú lon", + "bú cac", + "bú cak", + 
"bú cax", + "bú buoi", + "bú mút", + "húp lol", + "húp lon", + "húp sò", + "nung lol", + "nung lon", + "nug lon", + "nung cac", + "nung buoi", + "thèm dit", + "thèm lon", + "thèm lol", + "thèm buoi", + "thèm dit", + "thèm chich", + "thèm chịch", + "thèm cax", + "thèm cak", + "móc lol", + "móc lon", + "buscu", + "lol non", + "bướm non", + "vú to", + "vú bự", + "lon to", + "phim sex", + "phim xex", + "phim xxx", + ], + "zh": english_flagged_words + + [ + "三级片", + "乳交", + "卖B", + "卖比", + "卖淫", + "口交", + "姣西", + "打炮", + "打飞机", + "抽插", + "招妓", + "插你", + "插死你", + "淫水", + "肉壁", + "肉棍子", + "肉缝", + "肏", + "肛交", + ], +} diff --git a/languages_id.py b/languages_id.py new file mode 100644 index 0000000000000000000000000000000000000000..7bd1a696b8fc56a3cf58e3f0abc2396810f94898 --- /dev/null +++ b/languages_id.py @@ -0,0 +1,222 @@ +import pandas as pd + + +langs_id = [ + { + "lang": "Afrikaans", + "dataset_id": "af", + "stopwords_id": "af", + "flagged_words_id": None, + "fasttext_id": "af", + "sentencepiece_id": "af", + "kenlm_id": "af", + }, + { + "lang": "Arabic", + "dataset_id": "ar", + "stopwords_id": "ar", + "flagged_words_id": "ar", + "fasttext_id": "ar", + "sentencepiece_id": "ar", + "kenlm_id": "ar", + }, + { + "lang": "Egyptian Arabic", + "dataset_id": "arz", + "stopwords_id": None, + "flagged_words_id": None, + "fasttext_id": "arz", + "sentencepiece_id": "arz", + "kenlm_id": "arz", + }, + { + "lang": "Assamese", + "dataset_id": "as", + "stopwords_id": None, + "flagged_words_id": None, + "fasttext_id": "as", + "sentencepiece_id": "as", + "kenlm_id": "as", + }, + { + "lang": "Bengali", + "dataset_id": "bn", + "stopwords_id": "bn", + "flagged_words_id": None, + "fasttext_id": "bn", + "sentencepiece_id": "bn", + "kenlm_id": "bn", + }, + { + "lang": "Catalan", + "dataset_id": "ca", + "stopwords_id": "ca", + "flagged_words_id": "ca", + "fasttext_id": "ca", + "sentencepiece_id": "ca", + "kenlm_id": "ca", + }, + { + "lang": "English", + "dataset_id": "en", + "stopwords_id": 
"en", + "flagged_words_id": "en", + "fasttext_id": "en", + "sentencepiece_id": "en", + "kenlm_id": "en", + }, + { + "lang": "Spanish", + "dataset_id": "es", + "stopwords_id": "es", + "flagged_words_id": "es", + "fasttext_id": "es", + "sentencepiece_id": "es", + "kenlm_id": "es", + }, + { + "lang": "Basque", + "dataset_id": "eu", + "stopwords_id": "eu", + "flagged_words_id": "eu", + "fasttext_id": "eu", + "sentencepiece_id": "eu", + "kenlm_id": "eu", + }, + { + "lang": "French", + "dataset_id": "fr", + "stopwords_id": "fr", + "flagged_words_id": "fr", + "fasttext_id": "fr", + "sentencepiece_id": "fr", + "kenlm_id": "fr", + }, + { + "lang": "Gujarati", + "dataset_id": "gu", + "stopwords_id": None, + "flagged_words_id": None, + "fasttext_id": "gu", + "sentencepiece_id": "gu", + "kenlm_id": "gu", + }, + { + "lang": "Hindi", + "dataset_id": "hi", + "stopwords_id": "hi", + "flagged_words_id": "hi", + "fasttext_id": "hi", + "sentencepiece_id": "hi", + "kenlm_id": "hi", + }, + { + "lang": "Indonesian", + "dataset_id": "id", + "stopwords_id": "id", + "flagged_words_id": "id", + "fasttext_id": "id", + "sentencepiece_id": "id", + "kenlm_id": "id", + }, + { + "lang": "Kannada", + "dataset_id": "kn", + "stopwords_id": None, + "flagged_words_id": "kn", + "fasttext_id": "kn", + "sentencepiece_id": "kn", + "kenlm_id": "kn", + }, + { + "lang": "Malayalam", + "dataset_id": "ml", + "stopwords_id": None, + "flagged_words_id": "ml", + "fasttext_id": "ml", + "sentencepiece_id": "ml", + "kenlm_id": "ml", + }, + { + "lang": "Marathi", + "dataset_id": "mr", + "stopwords_id": "mr", + "flagged_words_id": "mr", + "fasttext_id": "mr", + "sentencepiece_id": "mr", + "kenlm_id": "mr", + }, + { + "lang": "Portuguese", + "dataset_id": "pt", + "stopwords_id": "pt", + "flagged_words_id": "pt", + "fasttext_id": "pt", + "sentencepiece_id": "pt", + "kenlm_id": "pt", + }, + { + "lang": "Swahili", + "dataset_id": "sw", + "stopwords_id": "sw", + "flagged_words_id": None, + "fasttext_id": "sw", + 
"sentencepiece_id": "sw", + "kenlm_id": "sw", + }, + { + "lang": "Tamil", + "dataset_id": "ta", + "stopwords_id": None, + "flagged_words_id": "ta", + "fasttext_id": "ta", + "sentencepiece_id": "ta", + "kenlm_id": "ta", + }, + { + "lang": "Telugu", + "dataset_id": "te", + "stopwords_id": None, + "flagged_words_id": "te", + "fasttext_id": "te", + "sentencepiece_id": "te", + "kenlm_id": "te", + }, + { + "lang": "Urdu", + "dataset_id": "ur", + "stopwords_id": "ur", + "flagged_words_id": None, + "fasttext_id": "ur", + "sentencepiece_id": "ur", + "kenlm_id": "ur", + }, + { + "lang": "Vietnamese", + "dataset_id": "vi", + "stopwords_id": "vi", + "flagged_words_id": "vi", + "fasttext_id": "vi", + "sentencepiece_id": "vi", + "kenlm_id": "vi", + }, + { + "lang": "Yoruba", + "dataset_id": "yo", + "stopwords_id": "yo", + "flagged_words_id": None, + "fasttext_id": "yo", + "sentencepiece_id": "yo", + "kenlm_id": "yo", + }, + { + "lang": "Chinese", + "dataset_id": "zh", + "stopwords_id": "zh", + "flagged_words_id": "zh", + "fasttext_id": "zh", + "sentencepiece_id": "zh", + "kenlm_id": "zh", + }, +] +langs_id = pd.DataFrame(langs_id) diff --git a/lid.176.bin b/lid.176.bin new file mode 100644 index 0000000000000000000000000000000000000000..f8707035ea3cc86ac248a4e31fa6368cd845476a --- /dev/null +++ b/lid.176.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e +size 131266198 diff --git a/normalization.py b/normalization.py new file mode 100644 index 0000000000000000000000000000000000000000..652e810fb5019c5177f6fd0abf9635f322f23927 --- /dev/null +++ b/normalization.py @@ -0,0 +1,52 @@ +import re +from typing import Dict + + +non_printing_characters_re = re.compile( + f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]" +) + +digits_re: re.Pattern = re.compile(r"\d") + +unicode_punctuation: Dict[str, str] = { + ",": ",", + "。": ".", + "、": ",", + "„": '"', + "”": '"', + "“": '"', + 
"«": '"', + "»": '"', + "1": '"', + "」": '"', + "「": '"', + "《": '"', + "》": '"', + "´": "'", + "∶": ":", + ":": ":", + "?": "?", + "!": "!", + "(": "(", + ")": ")", + ";": ";", + "–": "-", + "—": " - ", + ".": ". ", + "~": "~", + "’": "'", + "…": "...", + "━": "-", + "〈": "<", + "〉": ">", + "【": "[", + "】": "]", + "%": "%", + "►": "-", +} + +normalization = { + "non_printing_characters_re": non_printing_characters_re, + "digits_re": digits_re, + "unicode_punctuation": unicode_punctuation, +} diff --git a/parameters_filtering.py b/parameters_filtering.py new file mode 100644 index 0000000000000000000000000000000000000000..1a992a8d10512da8c416640d956deff47a8f2ce7 --- /dev/null +++ b/parameters_filtering.py @@ -0,0 +1,895 @@ +import string +import emoji + + +main_special_characters = string.punctuation + string.digits + string.whitespace +other_special_characters = ( + "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═" + "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖" + "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚" + "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖" + "」﴾》" +) +emoji = list(emoji.UNICODE_EMOJI["en"].keys()) + +special_characters_default = set(main_special_characters + other_special_characters) +special_characters_default.update(emoji) + + +parameters_filtering_default = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": False, + "length_word_max_cutoff": 50, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + 
"word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.4, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": False, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.70, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_af = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.6, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_ar = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", 
"www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.45, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 1000000, +} + +parameters_filtering_arz = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.5, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + 
"words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_as = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_bn = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + 
"number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.275, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.05, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 575000, +} + +parameters_filtering_ca = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.35, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": 
True, + "perplexity_max_cutoff": 1750000, +} + +parameters_filtering_en = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": True, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 20, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.4, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.3, + "cond_check_flagged_words": True, + "flagged_words_max_cutoff": 0.045, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.80, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 2500, +} + +parameters_filtering_es = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + 
"word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.2, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 2500000, +} + +parameters_filtering_eu = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 35, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_fr = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", 
"www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.35, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.15, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_gu = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + 
"words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 250000, +} + +parameters_filtering_hi = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.35, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 600000, +} + +parameters_filtering_id = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + 
"number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.25, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 2500000, +} + +parameters_filtering_kn = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 50, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": 
True, + "perplexity_max_cutoff": 400000, +} + +parameters_filtering_ml = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 50, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.2, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 1600000, +} + +parameters_filtering_mr = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + 
"word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 425000, +} + +parameters_filtering_pt = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.15, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_sw = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", 
"www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.275, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_ta = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 50, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + 
"words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_te = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 35, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_ur = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + 
"number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.4, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_vi = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.35, + "cond_words_augmentation": True, + "words_augmentation_group_sizes": [2], + "words_augmentation_join_char": " ", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, 
+ "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_yo = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + "word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_zh = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": False, + "length_word_max_cutoff": 1000, + "cond_check_number_words": True, + "tokenization": True, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "cond_check_character_repetition_removal": True, + "character_repetition_length": 10, + "character_repetition_max_cutoff": 0.106, + "cond_check_word_repetition_removal": True, + "word_repetition_length": 5, + 
"word_repetition_max_cutoff": 0.19, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.4, + "cond_words_augmentation": True, + "words_augmentation_group_sizes": [2], + "words_augmentation_join_char": "", + "cond_check_stopwords": False, + "stopwords_min_cutoff": 0, + "cond_check_flagged_words": False, + "flagged_words_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering = { + "default": parameters_filtering_default, + "af": parameters_filtering_af, + "ar": parameters_filtering_ar, + "arz": parameters_filtering_arz, + "as": parameters_filtering_as, + "bn": parameters_filtering_bn, + "ca": parameters_filtering_ca, + "en": parameters_filtering_en, + "es": parameters_filtering_es, + "eu": parameters_filtering_eu, + "fr": parameters_filtering_fr, + "gu": parameters_filtering_gu, + "hi": parameters_filtering_hi, + "id": parameters_filtering_id, + "kn": parameters_filtering_kn, + "ml": parameters_filtering_ml, + "mr": parameters_filtering_mr, + "pt": parameters_filtering_pt, + "sw": parameters_filtering_sw, + "ta": parameters_filtering_ta, + "te": parameters_filtering_te, + "ur": parameters_filtering_ur, + "vi": parameters_filtering_vi, + "yo": parameters_filtering_yo, + "zh": parameters_filtering_zh, +} diff --git a/pt.arpa.bin b/pt.arpa.bin new file mode 100644 index 0000000000000000000000000000000000000000..1ed3b02dab66efb87a4da18948d7ec7f4a5ffa90 --- /dev/null +++ b/pt.arpa.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad7241c4b11d902fa092506b731f61e5f67177897c2598b750d1a2e519be87ad +size 3220168756 diff --git a/pt.sp.model b/pt.sp.model new file mode 100644 index 0000000000000000000000000000000000000000..3c2ab113c5644ebf7b1d8d23790b90b16c964d75 --- /dev/null +++ b/pt.sp.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1707a7517b61ca9d4d333dabcc5ec7024e44c6466ff6faea9ccc95a0f1b2737c +size 958101 diff --git a/pt_examples_with_stats.json b/pt_examples_with_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..43237b7e36350526bad1aa383d7c875aae3f8af4 --- /dev/null +++ b/pt_examples_with_stats.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72a681cc82b2a0f9e11a8fa052143f7eaad5a67d31269bbd96653715e0ff776a +size 135498651 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e6b4f5f64a5d880bef902ec49cfb16703015608 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +fasttext +sentencepiece +https://github.com/kpu/kenlm/archive/master.zip +emoji \ No newline at end of file diff --git a/stopwords.py b/stopwords.py new file mode 100644 index 0000000000000000000000000000000000000000..9093aec7c814d1fe93d148382bd64a5bf9e9b882 --- /dev/null +++ b/stopwords.py @@ -0,0 +1,7445 @@ +# From https://github.com/6/stopwords-json +# From https://github.com/stopwords-iso/stopwords-iso for Urdu and Vietnamese + + +stopwords = { + "af": [ + "'n", + "aan", + "af", + "al", + "as", + "baie", + "by", + "daar", + "dag", + "dat", + "die", + "dit", + "een", + "ek", + "en", + "gaan", + "gesê", + "haar", + "het", + "hom", + "hulle", + "hy", + "in", + "is", + "jou", + "jy", + "kan", + "kom", + "ma", + "maar", + "met", + "my", + "na", + "nie", + "om", + "ons", + "op", + "saam", + "sal", + "se", + "sien", + "so", + "sy", + "te", + "toe", + "uit", + "van", + "vir", + "was", + "wat", + "ʼn", + ], + "ar": [ + "آخر", + "آنَا", + "أ", + "أثناء", + "أحد", + "أصبح", + "أصبحت", + "أغلب", + "أكثر", + "أكون", + "ألا", + "أم", + "أما", + "أمام", + "أن", + "أنا", + "أنت", + "أنتم", + "أنَا", + "أو", + "أولئك", + "أولٰئك", + "أي", + "أية", + "أين", + "أينما", + "أَ", + "أَثنَاءَ", + "أَلَّا", + "أَم", + "أَمَامَ", + "أَمَّا", + "أَن", + "أَنَّ", + "أَو", + "أَي", + "أَينَ", + "أَينَمَا", + "أَيّ", + 
"إبان", + "إثر", + "إحدى", + "إذ", + "إذا", + "إزا", + "إزاء", + "إل", + "إلا", + "إلى", + "إلي", + "إليها", + "إما", + "إن", + "إنما", + "إنّ", + "إيا", + "إِثرَ", + "إِذ", + "إِذًا", + "إِذَا", + "إِزَاءَ", + "إِلَى", + "إِلَّا", + "إِمَّا", + "إِن", + "إِنَّ", + "إِنَّمَا", + "إِيَّا", + "اثر", + "اثناء", + "اذ", + "اذا", + "ازا", + "ازاء", + "ال", + "الا", + "التى", + "التي", + "الذى", + "الذي", + "الذين", + "الغاية", + "الـ", + "الـــ", + "الفوق", + "اللاتى", + "اللاتي", + "اللتان", + "اللتين", + "اللذان", + "اللذين", + "اللواتي", + "اللي", + "الى", + "الي", + "ام", + "اما", + "امام", + "ان", + "انا", + "انتم", + "انما", + "او", + "اولئك", + "اى", + "اي", + "اين", + "اينما", + "اَل", + "اَلَّذِي", + "ب", + "بأنفسهم", + "بات", + "باتت", + "بس", + "بعد", + "بعدما", + "بعض", + "بعيد", + "بغزة", + "بـ", + "بل", + "بما", + "بهم", + "بيد", + "بين", + "بينما", + "بَس", + "بَعدَ", + "بَعدَمَا", + "بَل", + "بَيدَ", + "بَينَ", + "بَينَمَا", + "بُعَيدَ", + "بِ", + "تحت", + "تحـــت", + "تصبح", + "تعد", + "تكن", + "تكون", + "تكونون", + "تلك", + "تَحتَ", + "تُجَاهَ", + "ثم", + "ثُمَّ", + "جراء", + "جَرَّاء", + "حتى", + "حسب", + "حسبما", + "حوالى", + "حوالي", + "حول", + "حولي", + "حيال", + "حيث", + "حيثما", + "حين", + "حينما", + "حَتَّى", + "حَسَب", + "حَسَبَ", + "حَسَبَمَا", + "حَولَ", + "حَوَالَى", + "حَيثُ", + "حِينَ", + "حِينَمَا", + "حِيَالَ", + "خلال", + "خَلفَ", + "خِلَالَ", + "دون", + "دُونَ", + "ذا", + "ذاك", + "ذلك", + "ذو", + "ذي", + "ذَا", + "ذَاكَ", + "ذُو", + "ذٰلك", + "ذٰلِكَ", + "راح", + "ربما", + "ربمــا", + "رغم", + "ريثما", + "رَغمَ", + "رَيثَمَا", + "رُبَّمَا", + "س", + "سائر", + "سواء", + "سوف", + "سوى", + "سَ", + "سَوفَ", + "سِوَى", + "شبه", + "شو", + "صار", + "صوب", + "ضد", + "ضمن", + "ضِدَّ", + "ضِمنَ", + "طال", + "طالما", + "طالَما", + "طوال", + "طيلة", + "طَالَمَا", + "طِوَالَ", + "طِيلَةَ", + "عبر", + "عدا", + "عدة", + "عشان", + "عـــلى", + "عـــندما", + "عــلى", + "عقب", + "عل", + "علـى", + "على", + "علي", + "علّ", + "عم", + "عن", + "عنا", + 
"عند", + "عندما", + "عوض", + "عَاد", + "عَبرَ", + "عَدَا", + "عَشان", + "عَقِبَ", + "عَلَى", + "عَلَّ", + "عَم", + "عَن", + "عِندَ", + "عِندَمَا", + "عِوَضَ", + "غالبية", + "غدت", + "غير", + "غَيرَ", + "ف", + "فتئ", + "فـ", + "فـي", + "فور", + "فوق", + "فى", + "في", + "فيما", + "فَ", + "فَورَ", + "فَوقَ", + "فِي", + "فِيمَا", + "ق", + "قبالة", + "قبل", + "قبيل", + "قد", + "قرابة", + "قرب", + "قيد", + "قَبلَ", + "قَد", + "قَيدَ", + "قُبَالَةَ", + "قُبَيلَ", + "قُربَ", + "قُرَابَةَ", + "ك", + "كأن", + "كأنما", + "كامل", + "كان", + "كانت", + "كانوا", + "كذا", + "كـ", + "كل", + "كلا", + "كلتا", + "كلما", + "كلي", + "كم", + "كما", + "كن", + "كنا", + "كنت", + "كون", + "كى", + "كي", + "كيف", + "كَ", + "كَأَنَّ", + "كَأَنَّمَا", + "كَان", + "كَذَا", + "كَلَّا", + "كَم", + "كَمَا", + "كَي", + "كَيفَ", + "كُل", + "كُلَّمَا", + "كِلَا", + "ل", + "لأن", + "لا", + "لازم", + "لان", + "لدى", + "لدي", + "لذا", + "لذلك", + "لذٰلك", + "لسنا", + "لـ", + "لقد", + "لكن", + "لكى", + "لكي", + "لم", + "لما", + "لماذا", + "لن", + "لهم", + "لو", + "لولا", + "ليس", + "ليست", + "ليسوا", + "لَ", + "لَا", + "لَازِم", + "لَدَى", + "لَم", + "لَمَّا", + "لَن", + "لَو", + "لَولَا", + "لَيس", + "لُو", + "لِ", + "لِأَن", + "لِأَنَّ", + "لِئَلّا", + "لِذَا", + "لِذٰلِكَ", + "لِكَي", + "لِمَاذَا", + "لٰكن", + "لٰكِن", + "لٰكِنَّ", + "م", + "ما", + "ماذا", + "مالم", + "ماهو", + "ماهُوَ", + "متى", + "مثـــل", + "مثل", + "مثلما", + "مش", + "مع", + "معظم", + "مــن", + "مـن", + "مقابل", + "مما", + "ممكن", + "من", + "منتصف", + "منذ", + "مهما", + "مين", + "مَا", + "مَاذَا", + "مَالَم", + "مَتَى", + "مَعَ", + "مَن", + "مَهمَا", + "مُقَابِلَ", + "مُمكِن", + "مُنذُ", + "مِثلَ", + "مِثلَمَا", + "مِمَّا", + "مِن", + "نا", + "ناهيك", + "نحسب", + "نحن", + "نحو", + "نصف", + "نعم", + "نكون", + "ني", + "نَاهِيك", + "نَحوَ", + "نَعَم", + "ه", + "هؤلاء", + "ها", + "هاتان", + "هاتين", + "هاد", + "هاي", + "هذا", + "هذان", + "هذــه", + "هذه", + "هذين", + "هـــذه", + "هــــذه", + "هكذا", + "هل", + "هم", + "هما", + "هن", + 
"هو", + "هى", + "هي", + "هَا", + "هَل", + "هُ", + "هُو", + "هُوَ", + "هِ", + "هٰؤلاء", + "هٰذا", + "هٰذان", + "هٰذه", + "هٰذَا", + "هٰكذا", + "هٰكَذَا", + "و", + "وأسلم", + "وراء", + "وسامراء", + "وسط", + "وســـط", + "وغربه", + "وفق", + "وقتما", + "وقف", + "ولا", + "ولَا", + "وهي", + "وَ", + "وَرَاءَ", + "وَسطَ", + "وِفقَ", + "وِلّا", + "ي", + "يا", + "يجعل", + "يزال", + "يصبح", + "يكن", + "يكون", + "يكونا", + "يَا", + "ِي", + ], + "bn": [ + "অনেক", + "অন্য", + "অবশ্য", + "আগে", + "আছে", + "আজ", + "আবার", + "আমরা", + "আমাদের", + "আর", + "ই", + "উত্তর", + "উপর", + "উপরে", + "এ", + "এই", + "এক্", + "এখন", + "এত", + "এব", + "এমন", + "এমনি", + "এর", + "এস", + "এসে", + "ও", + "ওই", + "কমনে", + "করা", + "করে", + "কাছে", + "কাজ", + "কাজে", + "কারণ", + "কি", + "কিছু", + "কে", + "কেউ", + "কেখা", + "কেন", + "কোটি", + "কোনো", + "কয়েক", + "খুব", + "গিয়ে", + "গেল", + "চার", + "চালু", + "চেষ্টা", + "ছিল", + "জানা", + "জ্নজন", + "টি", + "তখন", + "তবে", + "তা", + "তাই", + "তো", + "থাকা", + "থেকে", + "দিন", + "দু", + "দুই", + "দেওয়া", + "ধামার", + "নতুন", + "না", + "নাগাদ", + "নিয়ে", + "নেওয়া", + "নয়", + "পর", + "পরে", + "পাচ", + "পি", + "পেয়্র্", + "প্রতি", + "প্রথম", + "প্রযন্ত", + "প্রাথমিক", + "প্রায়", + "বক্তব্য", + "বন", + "বলা", + "বলে", + "বলেন", + "বহু", + "বা", + "বি", + "বিভিন্ন", + "বেশ", + "বেশি", + "মতো", + "মধ্যে", + "মনে", + "যখন", + "যদি", + "যা", + "যাওয়া", + "যে", + "র", + "রকম", + "লক্ষ", + "শুধু", + "শুরু", + "সঙ্গে", + "সব", + "সহ", + "সাধারণ", + "সামনে", + "সি", + "সে", + "সেই", + "হতে", + "হাজার", + "হয়", + ], + "ca": [ + "-ho", + "-la", + "-lo", + "-ne", + "-se", + "a", + "abans", + "això", + "al", + "algun", + "alguna", + "algunes", + "alguns", + "algú", + "allò", + "als", + "altra", + "altre", + "altres", + "amb", + "aqueix", + "aqueixa", + "aqueixes", + "aqueixos", + "aquell", + "aquella", + "aquelles", + "aquells", + "aquest", + "aquesta", + "aquestes", + "aquestos", + "aquests", + "bastant", + "bastants", + "bé", + "cada", + "cadascun", + 
"cadascuna", + "cadascú", + "cap", + "cert", + "certa", + "certes", + "certs", + "com", + "con", + "contra", + "d", + "d'", + "da", + "damunt", + "darrere", + "davant", + "de", + "del", + "dels", + "des", + "dient", + "diferent", + "diferents", + "dins", + "dintre", + "dir", + "divers", + "diverses", + "diversos", + "durant", + "eixa", + "eixe", + "eixes", + "eixos", + "el", + "ell", + "ella", + "elles", + "ells", + "els", + "em", + "emperò", + "en", + "endavant", + "enfront", + "ens", + "entre", + "envers", + "era", + "eren", + "es", + "estan", + "estant", + "estar", + "estaran", + "estarem", + "estaria", + "estarien", + "estarà", + "estat", + "estava", + "estaven", + "este", + "estem", + "estes", + "esteu", + "estic", + "estiguem", + "estiguessin", + "estigui", + "estiguin", + "estigués", + "estos", + "està", + "et", + "ets", + "excepte", + "extra", + "fa", + "faci", + "facin", + "facis", + "faig", + "fan", + "faran", + "farem", + "fareu", + "faria", + "farien", + "faries", + "faràs", + "faràs", + "faré", + "faríem", + "faríeu", + "fas", + "feia", + "feien", + "feies", + "fem", + "fent", + "fer", + "fes", + "fessin", + "fessis", + "fet", + "feu", + "fins", + "foren", + "fos", + "fossin", + "fou", + "front", + "fèiem", + "fèieu", + "féssiu", + "gaire", + "gaires", + "gràcies", + "ha", + "hagi", + "hagin", + "haguem", + "haguessin", + "haguessis", + "hagut", + "hagués", + "haguéssim", + "haguéssin", + "haguéssiu", + "han", + "has", + "hauran", + "haurem", + "haureu", + "hauria", + "haurien", + "hauries", + "haurà", + "hauràs", + "hauré", + "hauríem", + "hauríeu", + "havent", + "haver", + "havia", + "havien", + "havies", + "havíem", + "havíeu", + "he", + "hem", + "heu", + "hi", + "ho", + "hom", + "hàgim", + "i", + "in", + "jo", + "l", + "l", + "l'", + "la", + "las", + "les", + "li", + "llur", + "llurs", + "lo", + "los", + "ls", + "m", + "m", + "m'", + "malgrat", + "mancant", + "massa", + "mateix", + "mateixa", + "mateixes", + "mateixos", + "me", + "mentre", + 
"menys", + "mes", + "meu", + "meus", + "meva", + "meves", + "mi", + "mitjançant", + "molt", + "molta", + "moltes", + "molts", + "moltíssim", + "moltíssima", + "moltíssimes", + "moltíssims", + "n", + "n'", + "ne", + "ni", + "ningun", + "ninguna", + "ningunes", + "ninguns", + "ningú", + "no", + "nombroses", + "nombrós", + "nos", + "nosaltres", + "nostra", + "nostre", + "nostres", + "ns", + "o", + "on", + "os", + "pel", + "pels", + "per", + "perqu", + "perquè", + "però", + "poc", + "poca", + "pocs", + "poques", + "prou", + "qual", + "quals", + "qualsevol", + "quan", + "quant", + "quantes", + "quants", + "que", + "quelcom", + "qui", + "quin", + "quina", + "quines", + "quins", + "què", + "rere", + "respecte", + "s", + "s", + "s'", + "sa", + "sabent", + "salvant", + "se", + "segons", + "sens", + "sense", + "sent", + "ser", + "seran", + "serem", + "seria", + "serien", + "serà", + "seré", + "seríem", + "ses", + "seu", + "seus", + "seva", + "seves", + "si", + "siguem", + "sigui", + "siguin", + "sigut", + "sinó", + "sobre", + "som", + "sota", + "su", + "suficient", + "séssim", + "sóc", + "són", + "t", + "t'", + "tal", + "tals", + "tant", + "tanta", + "tantes", + "tants", + "te", + "tenc", + "tendran", + "tendrem", + "tendreu", + "tendria", + "tendrien", + "tendries", + "tendràs", + "tendràs", + "tendré", + "tendríem", + "tendríeu", + "tenen", + "tenia", + "tenien", + "tenies teníem", + "tenim", + "tenir", + "teniu", + "tens", + "teníeu", + "teu", + "teus", + "teva", + "ti", + "tinc", + "tindran", + "tindre", + "tindrem", + "tindreu", + "tindria", + "tindrien", + "tindries", + "tindràs", + "tindràs", + "tindré", + "tindríem", + "tindríeu", + "tingut", + "tot", + "tota", + "total", + "totes", + "tothom", + "tots", + "tu", + "té", + "u", + "ultra", + "un", + "una", + "unes", + "uns", + "us", + "va", + "vagi", + "vagin", + "vaig", + "vam", + "van", + "varen", + "vau", + "vers", + "versus", + "via", + "vora", + "vos", + "vosaltres", + "vostre", + "vostè", + "vostès", + "vàrem", + 
"y", + "érem", + "és", + ], + "en": [ + "a", + "a.k.a", + "aboard", + "about", + "above", + "abt", + "accord", + "according", + "across", + "after", + "against", + "ago", + "aground", + "ahead", + "aka", + "ala", + "albeit", + "all", + "along", + "alongside", + "although", + "am", + "amid", + "amidst", + "among", + "amongst", + "amoung", + "an", + "and", + "and/or", + "another", + "any", + "any1", + "anybody", + "anyone", + "anything", + "are", + "around", + "as", + "aside", + "astride", + "at", + "atop", + "away", + "b", + "b/c", + "b/t", + "back", + "base", + "based", + "bc", + "be", + "because", + "been", + "before", + "behind", + "being", + "below", + "beneath", + "beside", + "besides", + "between", + "beyond", + "board", + "both", + "btwn", + "but", + "by", + "can", + "cause", + "circa", + "cos", + "could", + "coz", + "cus", + "depend", + "depending", + "despite", + "did", + "do", + "does", + "down", + "due", + "during", + "each", + "either", + "else", + "even", + "ever", + "every", + "everybody", + "everyone", + "everything", + "except", + "for", + "forth", + "from", + "get", + "gets", + "getting", + "give", + "given", + "got", + "had", + "half", + "has", + "hav", + "have", + "having", + "he", + "her", + "hers", + "herself", + "him", + "himself", + "his", + "how", + "however", + "i", + "i'd", + "if", + "in", + "include", + "including", + "inside", + "instead", + "into", + "is", + "it", + "it's", + "its", + "itself", + "lest", + "like", + "made", + "many", + "may", + "me", + "might", + "mine", + "minus", + "most", + "much", + "must", + "my", + "myself", + "nary", + "near", + "nearby", + "neither", + "next", + "nigh", + "no", + "nobody", + "none", + "noone", + "nor", + "not", + "nothing", + "notwithstanding", + "of", + "off", + "on", + "onboard", + "once", + "one", + "ones", + "oneself", + "only", + "onto", + "opposite", + "or", + "other", + "others", + "ought", + "our", + "ours", + "ourselves", + "out", + "outside", + "over", + "overt", + "own", + "past", + 
"per", + "plus", + "prior", + "quite", + "rather", + "re", + "regard", + "regarding", + "regardless", + "round", + "s/he", + "save", + "self", + "shall", + "she", + "should", + "side", + "since", + "so", + "some", + "somebody", + "someone", + "something", + "such", + "sure", + "teh", + "than", + "thanks", + "that", + "the", + "their", + "theirs", + "them", + "themselves", + "then", + "there", + "these", + "they", + "they're", + "thier", + "this", + "tho", + "those", + "thou", + "though", + "through", + "throughout", + "thru", + "thy", + "til", + "till", + "to", + "together", + "too", + "toward", + "towards", + "u", + "under", + "underneath", + "unless", + "unlike", + "until", + "unto", + "up", + "upon", + "ur", + "us", + "use", + "versus", + "via", + "vs", + "vs.", + "w/", + "w/o", + "w/out", + "was", + "we", + "were", + "what", + "whatever", + "whatnot", + "when", + "whenever", + "where", + "whereas", + "wherever", + "whether", + "which", + "while", + "whilst", + "whither", + "who", + "who's", + "whoever", + "whom", + "whomever", + "whose", + "why", + "will", + "with", + "within", + "without", + "wo", + "worth", + "would", + "wud", + "y'all", + "ya", + "yet", + "yo", + "you", + "you're", + "your", + "youre", + "yours", + "yourself", + "yourselves", + ], + "es": [ + "a", + "a fin de que", + "a medida que", + "a menos que", + "a modo de", + "a no ser que", + "a poco que", + "a que", + "abandono", + "acerca", + "acostumbra", + "adónde", + "ahora", + "al igual que", + "al lado de", + "algo", + "alguien", + "alguna", + "algunas", + "alguno", + "algunos", + "algún", + "alrededor", + "ambas", + "ambos", + "ante", + "aparece", + "aparecen", + "apareció", + "aparte", + "apenas", + "aquel", + "aquella", + "aquellas", + "aquello", + "aquellos", + "aquesa", + "aquesas", + "aquesos", + "aquesta", + "aquestas", + "aquesto", + "aquestos", + "aquél", + "aquélla", + "aquéllas", + "aquéllos", + "arrepentir", + "arrepentiréis", + "así", + "así como", + "así que", + "atlético", + 
"aun", + "aunque", + "aún", + "bajo", + "bastante", + "bastantes", + "bien", + "cada", + "casi", + "cerca", + "chance", + "cierta", + "ciertas", + "cierto", + "ciertos", + "comenzado", + "comenzó", + "comienzan", + "como", + "como quiera que", + "como si", + "con", + "con tal de", + "con tal que", + "conforme", + "conmigo", + "conque", + "considera", + "consideradas", + "consideran", + "consideró", + "consigo", + "contendrán", + "contigo", + "continuaba", + "continuar", + "continuaron", + "continuase", + "continuó", + "continúa", + "contra", + "corresponden", + "corresponder", + "cual", + "cual si", + "cuales", + "cualesquier", + "cualesquiera", + "cualquier", + "cualquiera", + "cuan", + "cuando", + "cuanta", + "cuantas", + "cuanto", + "cuanto quiera que", + "cuantos", + "cuya", + "cuyas", + "cuyo", + "cuyos", + "cuàles", + "cuál", + "cuáles", + "cuán", + "cuándo", + "cuánta", + "cuántas", + "cuánto", + "cuántos", + "cómo", + "da", + "dado que", + "dar", + "de", + "de manera que", + "de modo que", + "deba", + "debajo", + "deban", + "debas", + "debe", + "debemos", + "deben", + "deber", + "deberá", + "deberán", + "debería", + "deberíamos", + "deberían", + "debes", + "debido", + "debiera", + "debieron", + "debimos", + "debió", + "debo", + "debía", + "debíamos", + "debían", + "declaraba", + "declarada", + "declarado", + "declarase", + "declaro", + "declaró", + "dejaban", + "dejado", + "dejan", + "dejará", + "del", + "delante", + "demasiada", + "demasiadas", + "demasiado", + "demasiados", + "demás", + "den", + "dentro", + "dentro_de", + "des", + "desde", + "después", + "detrás", + "di", + "dicha", + "dichas", + "dicho", + "dichos", + "diferente", + "diferentes", + "distintas", + "distinto", + "distintos", + "diversas", + "diverso", + "diversos", + "don", + "donde", + "dos", + "durante", + "dónde", + "echar", + "el", + "el que", + "ella", + "ellas", + "ello", + "ellos", + "en", + "en cambio", + "en caso de", + "en la medida en que", + "en tanto que", + "encima", + 
"enfrente", + "entonces", + "entre", + "era", + "eramos", + "eran", + "eras", + "eres", + "ergo", + "es", + "esa", + "esas", + "escasa", + "escasas", + "escaso", + "escasos", + "escrito", + "ese", + "eso", + "eso que", + "esos", + "esotra", + "esotro", + "esta", + "estaba", + "estabais", + "estabamos", + "estaban", + "estabas", + "estado", + "estamos", + "estan", + "estando", + "estar", + "estaremos", + "estará", + "estarán", + "estaré", + "estaría", + "estaríamos", + "estarían", + "estarías", + "estas", + "este", + "estemos", + "esto", + "estos", + "estotra", + "estotro", + "estoy", + "estuve", + "estuviera", + "estuvieran", + "estuvieron", + "estuviese", + "estuviesen", + "estuvimos", + "estuvo", + "está", + "estábamos", + "estáis", + "están", + "estás", + "esté", + "estén", + "ex", + "excepto", + "frente", + "fue", + "fuera", + "fueran", + "fuere", + "fueron", + "fuese", + "fuesen", + "fui", + "fuimos", + "gracias", + "gracias_a", + "habeis", + "haber", + "haberle", + "haberse", + "habido", + "habiendo", + "habiéndo", + "habremos", + "habrá", + "habrán", + "habrás", + "habré", + "habría", + "habríamos", + "habrían", + "habéis", + "había", + "habíamos", + "habían", + "habías", + "hace", + "hacer", + "hacia", + "hacía", + "halla", + "han", + "has", + "hasta", + "hasta que", + "hay", + "haya", + "hayamos", + "hayan", + "hayas", + "he", + "hecho", + "hemos", + "hola", + "hubiera", + "hubieran", + "hubieron", + "hubiese", + "hubiesen", + "hubiéramos", + "hubo", + "iba", + "iban", + "ido", + "incluso", + "ir", + "irá", + "irán", + "iré", + "iría", + "junto a", + "la", + "las", + "le", + "lejos", + "les", + "lo", + "los", + "luego", + "mal que", + "mas", + "me", + "mediante", + "menos", + "mes", + "mi", + "mientras", + "mientras que", + "mis", + "misma", + "mismas", + "mismo", + "mismos", + "mismísimo", + "morir", + "moriría", + "mostrado", + "mostraron", + "mucha", + "muchas", + "muchisimas", + "muchisimio", + "muchisimo", + "mucho", + "muchos", + "muchísima", + 
"muchísimas", + "muchísimo", + "muchísimos", + "más", + "más bien", + "mí", + "mía", + "mías", + "mío", + "míos", + "nada", + "nadie", + "negar", + "ni", + "ni que", + "ningun", + "ninguna", + "ningunas", + "ninguno", + "ningunos", + "ningún", + "no", + "no obstante", + "noche", + "nombrado", + "nombró", + "nos", + "nosotros", + "nuestra", + "nuestras", + "nuestro", + "nuestros", + "o", + "os", + "otra", + "otras", + "otro", + "otros", + "pa", + "para", + "para que", + "parezca", + "partir", + "pasar", + "pero", + "po", + "poca", + "pocas", + "poco", + "pocos", + "podamos", + "podeis", + "podemos", + "poder", + "podes", + "podido", + "podras", + "podre", + "podremos", + "podriaís", + "podrá", + "podrán", + "podrás", + "podré", + "podréis", + "podría", + "podríamos", + "podrían", + "podéis", + "podía", + "podíamos", + "podían", + "poner", + "poquito", + "por", + "por el contrario", + "por ende", + "por eso", + "por lo que", + "por mucho que", + "por más que", + "por no hablar de", + "por si", + "porque", + "pos", + "post", + "pre", + "pro", + "propia", + "propias", + "propio", + "propios", + "pude", + "pudiendo", + "pudiera", + "pudieran", + "pudieras", + "pudieron", + "pudiese", + "pudiesen", + "pudimos", + "pudo", + "pueda", + "puedan", + "puedas", + "puede", + "pueden", + "puedes", + "puedo", + "pues", + "puesto", + "puesto que", + "que", + "queda", + "quedaba", + "quedan", + "quedó", + "queremos", + "querer", + "queriendo", + "quien", + "quienes", + "quienesquiera", + "quienquier", + "quienquiera", + "quiera", + "quiere", + "quisiera", + "quién", + "quiénes", + "qué", + "re", + "resulta", + "resultado", + "resultaría", + "resulte", + "sabe", + "saber", + "sabiendo", + "salen", + "salir", + "salió", + "salvo", + "se", + "sea", + "seamos", + "sean", + "seas", + "seguir", + "seguirá", + "seguía", + "según", + "semejante", + "semejantes", + "semi", + "sendas", + "sendo", + "sendos", + "ser", + "será", + "serán", + "serás", + "seré", + "seréis", + "sería", + 
"serían", + "serías", + "si", + "si bien", + "si y solo si", + "sido", + "siempre que", + "siendo", + "siente", + "siento", + "siga", + "sigamos", + "sigue", + "sin", + "sino", + "siquiera", + "sobre", + "sobrer", + "sobrir", + "soler", + "solían", + "somos", + "son", + "soy", + "sub", + "suele", + "suelen", + "suelo", + "super", + "supo", + "sur", + "sus", + "suya", + "suyas", + "suyo", + "suyos", + "sé", + "sí", + "tal", + "tales", + "tanta", + "tantas", + "tanto", + "tantos", + "tantísima", + "tantísimas", + "tantísimos", + "te", + "tendremos", + "tendrian", + "tendrá", + "tendrán", + "tendría", + "tendrían", + "tenemos", + "tener", + "tenga", + "tengan", + "tengo", + "tenia", + "tenido", + "teniendo", + "tenéis", + "tenía", + "teníamos", + "tenían", + "terminas", + "ti", + "tiene", + "tienen", + "tienes", + "toda", + "todas", + "todavía", + "todes", + "todo", + "todos", + "trabajado", + "trans", + "tras", + "tu", + "tus", + "tuve", + "tuviera", + "tuvieron", + "tuviese", + "tuvo", + "tuya", + "tuyas", + "tuyo", + "tuyos", + "tú", + "u", + "un", + "una", + "unas", + "une", + "unir", + "uno", + "unos", + "usted", + "ustedes", + "va", + "vamos", + "van", + "varias", + "varios", + "varía", + "vas", + "vaya", + "vayan", + "venir", + "venía", + "ver", + "vice", + "vieron", + "vino", + "vis a vis", + "visto que", + "volver", + "volverá", + "volveríamos", + "volvió", + "vos", + "vosotras", + "vosotros", + "voy", + "vuelva", + "vuelvan", + "vuelve", + "vuelven", + "vuestra", + "vuestras", + "vuestro", + "vuestros", + "vía", + "y", + "ya", + "ya que", + "yo", + "ámbos", + "él", + "éramos", + "ésa", + "ésas", + "ése", + "ésos", + "ésta", + "éstas", + "éste", + "ésto", + "éstos", + "íbamos", + "ó", + "ú", + "última", + "últimas", + "último", + "últimos", + "\ufeffdesde", + "\ufeffel", + "\ufeffen", + "\ufeffla", + "\ufefflas", + ], + "eu": [ + "*edin", + "*edun", + "*ezan", + "aitzitik", + "ala", + "alabaina", + "aldiz", + "alegia", + "alta", + "anitz", + "anitzek", + 
"anitzeko", + "anitzez", + "antzera", + "arabera", + "ari", + "ari_izan", + "ariko", + "arren", + "asko", + "askoan", + "askok", + "askoko", + "askorekin", + "askoren", + "askorengan", + "askorentzat", + "askori", + "askorik", + "askotako", + "askotan", + "askotariko", + "askotatik", + "askotaz", + "askotxo", + "askoz", + "at", + "aunitz", + "aurka", + "aurkako", + "aurretik", + "azpian", + "azpitik", + "ba", + "bada", + "badago", + "badezake", + "badidazu", + "badiezu", + "badio", + "badiogu", + "badiote", + "badiougu", + "badiozu", + "badira", + "badirela", + "baditu", + "baditugu", + "badituzte", + "badituzu", + "badu", + "badugu", + "badugun", + "badut", + "badute", + "baduzu", + "bagara", + "bagatzaizkio", + "bagenu", + "baginen", + "bai", + "baietz", + "baikaituzte", + "bailegoen", + "bailituen", + "bailitzake", + "bailitzateke", + "baina", + "bainan", + "bainintzen", + "bainizkion", + "baino", + "baita", + "baitabil", + "baitaiteke", + "baitan", + "baitaude", + "baitiete", + "baitigu", + "baitio", + "baitiote", + "baitira", + "baititu", + "baititugu", + "baitituzte", + "baitituzu", + "baititzaket", + "baitizkio", + "baitu", + "baitugu", + "baitute", + "baituzu", + "baitzaio", + "baitzaizkio", + "baitzara", + "baitzegoen", + "baitzen", + "baitzeuden", + "baitzien", + "baitzion", + "baitzioten", + "baitziren", + "baitzitekeen", + "baitzituen", + "baitzitzaion", + "baitzuen", + "baitzuten", + "baizik", + "baizituen", + "baldin", + "balego", + "balira", + "baliteke", + "balitu", + "balituzkete", + "balitz", + "balitzait", + "balu", + "balute", + "banintz", + "banitu", + "banu", + "barik", + "barru", + "bat", + "batera", + "batera\x97", + "batere", + "batzu", + "batzuei", + "batzuek", + "batzuekin", + "batzuen", + "batzuengatik", + "batzuentzat", + "batzuetako", + "batzuetakoak", + "batzuetan", + "batzuetara", + "batzuetatik", + "batzuez", + "batzuk", + "batzutako", + "batzutan", + "bazaigu", + "bazaizu", + "bazara", + "bazen", + "bazina", + "baziren", + 
"bazituen", + "bazituzten", + "bazuen", + "bazuten", + "bederen", + "behintzat", + "bera", + "beragatik", + "beraiei", + "beraiek", + "beraiekin", + "beraien", + "beraietaz", + "berak", + "berarekin", + "beraren", + "berarengan", + "berarengana", + "berarengandik", + "berarengatik", + "berarentzat", + "berari", + "berauek", + "berauen", + "berauetan", + "beraz", + "berbera", + "berberagatik", + "berberak", + "berberarekin", + "berberaren", + "berberera", + "bere", + "berea", + "bereak", + "berean", + "berek", + "bereko", + "berekoa", + "berekoak", + "beren", + "beretan", + "beretik", + "beretzat", + "berriz", + "bertze", + "bertzeekin", + "bertzela", + "bestalde", + "bestaldean", + "beste", + "bestea", + "besteak", + "bestean", + "bestearekiko", + "bestearekin", + "bestearen", + "bestearengandik", + "besteari", + "besteaz", + "besteei", + "besteen", + "besteengandik", + "besteetan", + "besteko", + "bestekoa", + "bestela", + "bestera", + "besterantz", + "besterik", + "bestetan", + "bestetik", + "bezala", + "bezalako", + "bezalakoa", + "bezalakoen", + "bidez", + "bitartean", + "bitarteko", + "bitarterako", + "bitartez", + "da", + "dabil", + "dabiltza", + "dadila", + "dadin", + "dago", + "dagoela", + "dagoelako", + "dagoen", + "dagoena", + "dagoenaren", + "dagoenean", + "dagoenez", + "daiteekenaren", + "daiteke", + "daitekeela", + "daitekeen", + "daitekeena", + "daitekeenaren", + "daitekeenez", + "daiteken", + "daitezela", + "daitezen", + "daitezke", + "daitezkeelako", + "daitezkeelarik", + "daitezkeen", + "daitezkeenak", + "daitezkela", + "dakizuke", + "danok", + "daude", + "daudela", + "daudelako", + "dauden", + "daudenak", + "daudenek", + "daudenen", + "daudenik", + "dautzuet", + "dela", + "delako", + "delarik", + "den", + "dena", + "denak", + "denaren", + "denarentzat", + "denari", + "denean", + "denek", + "denen", + "denera", + "denerako", + "denetan", + "denetarik", + "denetik", + "denez", + "denik", + "denok", + "denon", + "denona", + "denontzat", + "deus", + 
"dexente", + "dezadan", + "dezagun", + "dezake", + "dezakedala", + "dezakedan", + "dezakedanean", + "dezakeela", + "dezakeen", + "dezakeena", + "dezakegu", + "dezakegula", + "dezakegun", + "dezakela", + "dezakelako", + "dezaket", + "dezakete", + "dezaketela", + "dezaketen", + "dezakezu", + "dezakezuen", + "dezakezuenez", + "dezakezunez", + "dezala", + "dezan", + "dezaten", + "dezente", + "dezenterekin", + "dezentetan", + "diat", + "didala", + "didana", + "didate", + "didazue", + "die", + "diegu", + "diegun", + "diela", + "dien", + "dienak", + "diet", + "diete", + "dietela", + "dietelako", + "dietenean", + "diezaiekete", + "diezaiokeena", + "diezaiokete", + "diezaiola", + "diezaioten", + "diezaizkioke", + "diezazkioke", + "diezazkiokeen", + "digu", + "digun", + "digute", + "digutela", + "diguten", + "digutenean", + "diguzu", + "dik", + "din", + "dinat", + "dio", + "diogu", + "diogulako", + "diogun", + "diola", + "dion", + "diona", + "dionean", + "dionez", + "diot", + "diote", + "diotela", + "dioten", + "diotena", + "diotenak", + "diotenek", + "diozu", + "dira", + "direla", + "direlako", + "direlakoan", + "direlakotz", + "diren", + "direnak", + "direnean", + "direnek", + "direnen", + "direnetan", + "direnez", + "direnik", + "dit", + "ditake", + "ditazke", + "ditin", + "ditu", + "ditudala", + "ditudalako", + "ditudan", + "ditudanean", + "dituela", + "dituelako", + "dituelarik", + "dituen", + "dituena", + "dituenak", + "dituenean", + "ditugu", + "ditugula", + "ditugun", + "ditugunez", + "ditun", + "ditut", + "dituzte", + "dituztela", + "dituztelako", + "dituzten", + "dituztenak", + "dituztenean", + "dituztenek", + "dituztenekin", + "dituztenen", + "dituzu", + "dituzue", + "dituzuen", + "dituzula", + "dituzun", + "dituzunik", + "ditzagun", + "ditzake", + "ditzakeen", + "ditzakegu", + "ditzakegula", + "ditzakete", + "ditzaketela", + "ditzaketelako", + "ditzaketen", + "ditzakezu", + "ditzan", + "dizkidazu", + "dizkie", + "dizkien", + "dizkiet", + "dizkiete", + "dizkigu", 
+ "dizkigula", + "dizkigunak", + "dizkigute", + "dizkio", + "dizkiola", + "dizkion", + "dizkiot", + "dizkiotela", + "dizkit", + "dizkizuet", + "dizkizugu", + "dizu", + "dizuet", + "dizugu", + "dizut", + "dizute", + "du", + "duan", + "dudala", + "dudalarik", + "dudan", + "dudanak", + "dudanarekin", + "dudanean", + "dudanik", + "duela", + "duelako", + "duelakoan", + "duen", + "duena", + "duenak", + "duenaren", + "duenarentzat", + "duenari", + "duenean", + "duenentz", + "duenez", + "duenik", + "dugu", + "dugula", + "dugulako", + "dugun", + "duguna", + "dugunari", + "dugunean", + "dugunez", + "dugunik", + "duk", + "dun", + "dunala", + "dut", + "dute", + "dutela", + "dutelako", + "dutelakoan", + "duten", + "dutena", + "dutenagatik", + "dutenak", + "dutenaren", + "dutenean", + "dutenek", + "duteneko", + "dutenen", + "dutenena", + "dutenenetatik", + "dutenentz", + "dutenetakoa", + "dutenetik", + "dutenez", + "duzu", + "duzue", + "duzuela", + "duzuen", + "duzuenean", + "duzuenez", + "duzula", + "duzun", + "duzunarekin", + "ea", + "edo", + "edonor", + "edota", + "edozein", + "edozeinek", + "edozer", + "edozertarako", + "elgarrekin", + "elgarri", + "elkar", + "elkarrekiko", + "elkarrekin", + "elkarren", + "elkarri", + "ene", + "era", + "ere", + "esker", + "eta", + "eurak", + "eurei", + "eurek", + "eurekin", + "euren", + "eurentzat", + "ez", + "ezan", + "ezazu", + "ezazue", + "ezean", + "ezein", + "ezen", + "ezer", + "ezerekin", + "ezerk", + "ezertarako", + "ezertaz", + "ezertxo", + "ezetz", + "ezik", + "ezta", + "gabe", + "gabeko", + "gainera", + "gainerakoan", + "gainerat", + "gainera\x97", + "gainetik", + "gaitezen", + "gaitezke", + "gaitezkeela", + "gaitu", + "gaituela", + "gaituzte", + "gaituztenak", + "gara", + "garela", + "garelako", + "garen", + "garenez", + "garenok", + "gaude", + "gaudenak", + "gehiago", + "gehiagoan", + "gehiagok", + "gehiagoko", + "gehiagorekin", + "gehiegi", + "gehiegirik", + "gehiegitxo", + "gehien", + "gehiena", + "gehienak", + "gehienek", + 
"gehienekin", + "gehienentzako", + "gehienentzat", + "gehienetako", + "gehienetan", + "gehienok", + "gehientsu", + "gehientsuen", + "gehitxo", + "gehixeago", + "genbiltzan", + "genezake", + "genien", + "genion", + "genituela", + "genituelako", + "genituen", + "genituzke", + "genituzkeelako", + "genizkion", + "genizuen", + "genizun", + "genuela", + "genuelako", + "genuen", + "genuenean", + "genuenetik", + "genuenez", + "genuke", + "genukeen", + "geratu", + "geratzen", + "geroztik", + "geu", + "geure", + "geuregan", + "geuri", + "ginela", + "ginen", + "ginenean", + "ginenekoa", + "gintezkeela", + "gintuen", + "gintuenagatik", + "gintunan", + "gintuzten", + "gintzaizkion", + "gu", + "guk", + "gure", + "gurean", + "gurekin", + "guretzat", + "guri", + "gutako", + "gutaz", + "guti", + "gutiz", + "gutiz-gehien", + "gutiz-gehienek", + "gutxi", + "gutxiago", + "gutxiagorako", + "gutxiagorekin", + "gutxian", + "gutxien", + "gutxienez", + "gutxik", + "gutxiko", + "gutxira", + "gutxiren", + "gutxitan", + "guzi", + "guziak", + "guziarekin", + "guziekin", + "guzientzat", + "guzti", + "guztia", + "guztiagatik", + "guztiak", + "guztian", + "guztiarekin", + "guztiaren", + "guztiari", + "guztiaz", + "guztiei", + "guztiek", + "guztien", + "guztiengan", + "guztientzako", + "guztientzat", + "guztietako", + "guztietan", + "guztietara", + "guztietatik", + "guztiez", + "guztioi", + "guztiok", + "guztion", + "guztionak", + "guztionen", + "guztiontzat", + "guztira", + "guztitako", + "haatik", + "haiek", + "haiekin", + "haien", + "haiengan", + "haiengandik", + "haietako", + "haietan", + "haietatik", + "hainbat", + "hainbatek", + "hainbaten", + "hainbatez", + "hainbertze", + "hainbeste", + "hainbesterako", + "haiteke", + "haiz", + "halaber", + "halere", + "harekin", + "haren", + "harena", + "harentzat", + "hargatik", + "hari", + "hark", + "hartako", + "hartan", + "hartara", + "hartarako", + "hartatik", + "hau", + "haudala", + "hauei", + "hauek", + "hauekin", + "hauen", + "hauetako", + 
"hauetan", + "hauetara", + "hauetarako", + "hauetarik", + "hauetatik", + "hauexek", + "hauez", + "hauxe", + "heu", + "heure", + "hhriek", + "hi", + "hik", + "hinduan", + "hintzen", + "hire", + "hiri", + "honegatik", + "honek", + "honekin", + "honen", + "honengatik", + "honentzat", + "honetako", + "honetan", + "honetara", + "honetarako", + "honetatik", + "honetaz", + "honez", + "honi", + "hori", + "horiei", + "horiek", + "horiekin", + "horien", + "horientzat", + "horietako", + "horietakoren", + "horietan", + "horietarako", + "horietariko", + "horietatik", + "horiez", + "horixe", + "horregatik", + "horrek", + "horrekin", + "horren", + "horrenbeste", + "horrenbestez", + "horrengatik", + "horretako", + "horretan", + "horretantxe", + "horretara", + "horretarako", + "horretatik", + "horretaz", + "horrexegatik", + "horrexekin", + "horrexetan", + "horrez", + "horrezaz", + "horri", + "hortaz", + "huan", + "huntan", + "hura", + "huraxe", + "iezaidazu", + "iezaiezu", + "iezaion", + "iezaiozu", + "inor", + "inoren", + "inorentzako", + "inori", + "inork", + "inortaz", + "irian", + "itzazu", + "izaki", + "kontra", + "lezake", + "lezakeen", + "lezakete", + "lezan", + "liekeela", + "liezaiokeen", + "lioke", + "liokeela", + "liokeen", + "lirateke", + "liratekeela", + "liteke", + "litekeela", + "litekeen", + "litekeena", + "litezke", + "lituzkeela", + "lituzkeen", + "lituzkete", + "litzaidake", + "litzaiguke", + "litzateke", + "litzatekeela", + "litzatekeelako", + "litzatekela", + "lizateke", + "luke", + "lukeela", + "lukeelako", + "lukeen", + "lukeena", + "lukete", + "luketen", + "nabil", + "nago", + "nahiko", + "nahikoa", + "nahikorik", + "nahiz", + "naiteke", + "naiz", + "naizela", + "naizen", + "naizenean", + "naizenetan", + "naizenetik", + "naizenez", + "naizenik", + "nau", + "nauen", + "nauenarentzat", + "nauenean", + "nauk", + "naun", + "naute", + "nautela", + "nauzu", + "nauzun", + "nazan", + "nazaten", + "nazazu", + "nazazun", + "nenbilen", + "nengoela", + "nengoen", + 
"nere", + "neu", + "neuk", + "neure", + "nezake", + "ni", + "nian", + "nien", + "nigan", + "nik", + "ninduen", + "ninduten", + "nintekeela", + "nintzaion", + "nintzateke", + "nintzatekeela", + "nintzela", + "nintzelako", + "nintzen", + "nintzenean", + "nion", + "nire", + "nirea", + "niregan", + "niregana", + "niregatik", + "nirekin", + "niretzako", + "niretzat", + "niri", + "nitaz", + "nituela", + "nituen", + "nituzke", + "nizuke", + "nor", + "norbait", + "norbaitek", + "norbaitekin", + "norbaiten", + "norbaitengana", + "norbaitentzat", + "norbaiti", + "norbera", + "norberak", + "norberaren", + "norbere", + "noren", + "nori", + "nork", + "nornahi", + "nornahik", + "nortzuk", + "nortzuren", + "nuela", + "nuen", + "nuena", + "nuenean", + "nuenetik", + "nuke", + "nukeela", + "omen", + "ondoan", + "ondoko", + "ondora", + "ondoren", + "ondorengo", + "ondotik", + "ordea", + "ordez", + "orduan", + "oro_har", + "orobat", + "orohar", + "orok", + "ororen", + "orori", + "ostean", + "ostera", + "osterantzean", + "pean", + "piskat", + "pixka_bat", + "pixkat", + "pranko", + "ugari", + "ugarik", + "ugarirekin", + "ugariren", + "ugaritan", + "zagok", + "zaidan", + "zaidanaren", + "zaie", + "zaiela", + "zaien", + "zaienez", + "zaigu", + "zaigun", + "zaiguna", + "zaigunean", + "zaik", + "zaio", + "zaiola", + "zaiolako", + "zaion", + "zaiona", + "zait", + "zaitez", + "zaitezen", + "zaitu", + "zaitut", + "zaituzte", + "zaitzakegu", + "zaizkidan", + "zaizkie", + "zaizkiela", + "zaizkien", + "zaizkigu", + "zaizkio", + "zaizkiola", + "zaizkion", + "zaizkit", + "zaizkizu", + "zaizkizue", + "zaizkizun", + "zaizu", + "zaizue", + "zara", + "zarela", + "zarete", + "zatekeela", + "zatekeen", + "zatzait", + "zaude", + "ze", + "zebilen", + "zedin", + "zegoan", + "zegoela", + "zegoelako", + "zegoen", + "zegoenez", + "zegok", + "zehar", + "zein", + "zeina", + "zeinek", + "zeinen", + "zeintzu", + "zeintzuetan", + "zeintzuk", + "zela", + "zelako", + "zelarik", + "zen", + "zena", + "zenak", + 
"zenarekin", + "zenari", + "zenbait", + "zenbaitek", + "zenbaiten", + "zenbaitetan", + "zenbaiti", + "zenbaitzuk", + "zenbat", + "zenbateraino", + "zenean", + "zenekoa", + "zenetik", + "zenez", + "zeniguten", + "zenigutenez", + "zenik", + "zenituen", + "zenitzakeen", + "zenuela", + "zenuen", + "zenuke", + "zenukete", + "zenutela", + "zenuten", + "zeozer", + "zer", + "zer_edo_zer", + "zerbait", + "zerbaitek", + "zerbaitengatik", + "zerbaitetarako", + "zeren", + "zerendako", + "zeri", + "zerk", + "zertan", + "zertara", + "zertarako", + "zertaz", + "zertxobait", + "zeu", + "zeudela", + "zeudelako", + "zeuden", + "zeudenak", + "zeuk", + "zeure", + "zezakeen", + "zezaken", + "zezaketen", + "zezala", + "zezan", + "zezaten", + "zidan", + "zidatelako", + "zidaten", + "zidatena", + "zidatenak", + "zidatenean", + "ziela", + "zien", + "zienez", + "zietela", + "zietelako", + "zieten", + "ziezaion", + "zigun", + "zigunez", + "ziguten", + "zinan", + "zinen", + "zintudan", + "zintuztela", + "zintuztenean", + "ziola", + "ziolako", + "ziolarik", + "zion", + "ziona", + "zionean", + "zionez", + "zioten", + "ziotenak", + "zirela", + "zirelako", + "zirelakoan", + "zirelarik", + "ziren", + "zirenak", + "zirenean", + "zirenetik", + "zirenez", + "zirenik", + "ziren\x97", + "zirezte", + "zitekeela", + "zitekeen", + "zitekeena", + "zitekeenik", + "zitezen", + "zitezkeela", + "zitezkeelakoan", + "zitezkeen", + "zituela", + "zituelako", + "zituelarik", + "zituen", + "zituenean", + "zituenei", + "zituztela", + "zituztelarik", + "zituzten", + "zituztenak", + "zituztenetik", + "zitzaidakeen", + "zitzaidala", + "zitzaidan", + "zitzaien", + "zitzaigun", + "zitzaiola", + "zitzaion", + "zitzaionagatik", + "zitzaionean", + "zitzaizkidan", + "zitzaizkien", + "zitzaizkienean", + "zitzaizkigun", + "zitzaizkion", + "zitzaizkon", + "zitzaizun", + "zitzakeen", + "zitzaketenak", + "zizioten", + "zizkidaten", + "zizkien", + "zizkienik", + "zizkieten", + "zizkigun", + "zizkiola", + "zizkion", + "zizkiona", + 
"zizkioten", + "zizkiotenekin", + "zizuen", + "zizun", + "zoin", + "zonbat", + "zu", + "zuei", + "zuek", + "zuela", + "zuelako", + "zuelarik", + "zuen", + "zuena", + "zuenak", + "zuenarentzat", + "zuenean", + "zuenetik", + "zuenez", + "zuenik", + "zuentzako", + "zuetako", + "zuetaz", + "zugandik", + "zuk", + "zukeen", + "zuketen", + "zure", + "zureak", + "zurekin", + "zuretzat", + "zutela", + "zutelako", + "zutelarik", + "zuten", + "zutena", + "zutenean", + "zuteneko", + "zutenetik", + "zutenez", + ], + "fr": [ + "a", + "afin", + "ai", + "aie", + "aient", + "ainsi", + "ait", + "alias", + "aller", + "allons", + "apres", + "après", + "as", + "au", + "au-delà", + "aucun", + "aucune", + "aucunes", + "aucuns", + "aujourd'", + "auprès", + "auquel", + "aura", + "aurai", + "auraient", + "aurais", + "aurait", + "aurions", + "aurons", + "auront", + "autant", + "autour", + "autre", + "autres", + "autrui", + "auxquelles", + "auxquels", + "avaient", + "avais", + "avait", + "avant", + "avec", + "avez", + "aviez", + "avions", + "avoir", + "avons", + "ayant", + "ayez", + "ayons", + "beaucoup", + "c'est-à-dire", + "c-à-d.", + "ca", + "car", + "ce", + "ceci", + "cela", + "celle", + "celle-ci", + "celles", + "celles-ci", + "celui", + "celui-ci", + "celui-là", + "cent", + "certain", + "certaine", + "certaines", + "certains", + "ces", + "cet", + "cette", + "ceux", + "ceux-ci", + "ceux-là", + "cf.", + "chacun", + "chacune", + "chaque", + "chez", + "ci", + "cinq", + "combien", + "comme", + "comment", + "concernant", + "contre", + "cà", + "d'après", + "d'autres", + "dans", + "de", + "dehors", + "depuis", + "derrière", + "des", + "deux", + "devait", + "devant", + "devez", + "devions", + "devoir", + "devons", + "devra", + "devraient", + "devrait", + "devrions", + "devrons", + "devront", + "doit", + "doivent", + "donc", + "dont", + "du", + "durant", + "dès", + "début", + "dû", + "elle", + "elle-même", + "elles", + "elles-mêmes", + "en", + "entre", + "entres", + "envers", + "environ", + "es", 
+ "est", + "et", + "etaient", + "etant", + "etre", + "eut", + "eux", + "eux-mêmes", + "excepté", + "eût", + "faire", + "fais", + "faisaient", + "faisait", + "faisant", + "fait", + "faite", + "faites", + "fasse", + "fassent", + "fera", + "ferait", + "feront", + "firent", + "fit", + "font", + "furent", + "fussent", + "fut", + "fût", + "für", + "grâce", + "hormis", + "hors", + "i", + "il", + "ils", + "iront", + "je", + "jusque", + "l'on", + "la", + "ladite", + "laquelle", + "le", + "le/lui", + "ledit", + "lequel", + "les", + "lesdites", + "lesquelles", + "lesquels", + "leur", + "leurs", + "lors", + "lorsque", + "lui", + "lui-aussi", + "lui-même", + "là", + "ma", + "maint", + "maintes", + "mais", + "malgré", + "me", + "mes", + "mien", + "moi", + "moi-même", + "moins", + "mon", + "ne", + "ni", + "nonobstant", + "nos", + "notre", + "nous", + "nous-mêmes", + "nul", + "nôtre", + "nôtres", + "on", + "ont", + "onze", + "ou", + "outre", + "où", + "par", + "parce", + "parmi", + "pas", + "pendant", + "personne", + "peu", + "peut", + "peuvent", + "peux", + "plupart", + "plus", + "plusieurs", + "pour", + "pourquoi", + "pourra", + "pourraient", + "pourrait", + "pourrez", + "pourrons", + "pourront", + "pouvait", + "pouvez", + "pouvoir", + "pouvons", + "presque", + "près", + "pu", + "puis", + "puisque", + "puisse", + "puissent", + "puissions", + "qu", + "quand", + "quant", + "quarante", + "quatre", + "que", + "quel", + "quelconque", + "quelle", + "quelles", + "quelqu'un", + "quelque", + "quelques", + "quelques-unes", + "quelques-uns", + "quelqu’un", + "quels", + "qui", + "quiconque", + "quid", + "quoi", + "quoique", + "rien", + "sa", + "sans", + "sauf", + "se", + "selon", + "sera", + "serai", + "seraient", + "serais", + "serait", + "seras", + "serez", + "seriez", + "serions", + "serons", + "seront", + "ses", + "si", + "sien", + "sienne", + "siennes", + "siens", + "sinon", + "six", + "soi", + "soi-même", + "soient", + "sois", + "soit", + "sommes", + "son", + "sont", + "sous", + 
"soyez", + "soyons", + "suis", + "sur", + "t-il", + "ta", + "tandis", + "tant", + "tantôt", + "te", + "tel", + "telle", + "telles", + "tes", + "tien", + "toi", + "ton", + "tous", + "tout", + "toute", + "toutes", + "trois", + "tte", + "tu", + "un", + "une", + "unes", + "uns", + "unt", + "va", + "vais", + "van", + "vers", + "versus", + "via", + "voici", + "voilà", + "voir", + "voire", + "vont", + "vos", + "votre", + "vous", + "vous-même", + "vs", + "vu", + "y", + "à", + "á", + "ça", + "étaient", + "étais", + "était", + "étant", + "étiez", + "étions", + "été", + "êtes", + "être", + ], + "hi": [ + "अंदर", + "अत", + "अदि", + "अप", + "अपना", + "अपनि", + "अपनी", + "अपने", + "अभि", + "अभी", + "आदि", + "आप", + "इंहिं", + "इंहें", + "इंहों", + "इतयादि", + "इत्यादि", + "इन", + "इनका", + "इन्हीं", + "इन्हें", + "इन्हों", + "इस", + "इसका", + "इसकि", + "इसकी", + "इसके", + "इसमें", + "इसि", + "इसी", + "इसे", + "उंहिं", + "उंहें", + "उंहों", + "उन", + "उनका", + "उनकि", + "उनकी", + "उनके", + "उनको", + "उन्हीं", + "उन्हें", + "उन्हों", + "उस", + "उसके", + "उसि", + "उसी", + "उसे", + "एक", + "एवं", + "एस", + "एसे", + "ऐसे", + "ओर", + "और", + "कइ", + "कई", + "कर", + "करता", + "करते", + "करना", + "करने", + "करें", + "कहते", + "कहा", + "का", + "काफि", + "काफ़ी", + "कि", + "किंहें", + "किंहों", + "कितना", + "किन्हें", + "किन्हों", + "किया", + "किर", + "किस", + "किसि", + "किसी", + "किसे", + "की", + "कुछ", + "कुल", + "के", + "को", + "कोइ", + "कोई", + "कोन", + "कोनसा", + "कौन", + "कौनसा", + "गया", + "घर", + "जब", + "जहाँ", + "जहां", + "जा", + "जिंहें", + "जिंहों", + "जितना", + "जिधर", + "जिन", + "जिन्हें", + "जिन्हों", + "जिस", + "जिसे", + "जीधर", + "जेसा", + "जेसे", + "जैसा", + "जैसे", + "जो", + "तक", + "तब", + "तरह", + "तिंहें", + "तिंहों", + "तिन", + "तिन्हें", + "तिन्हों", + "तिस", + "तिसे", + "तो", + "था", + "थि", + "थी", + "थे", + "दबारा", + "दवारा", + "दिया", + "दुसरा", + "दुसरे", + "दूसरे", + "दो", + "द्वारा", + "न", + "नहिं", + "नहीं", + "ना", + "निचे", + "निहायत", + "नीचे", + "ने", 
+ "पर", + "पहले", + "पुरा", + "पूरा", + "पे", + "फिर", + "बनि", + "बनी", + "बहि", + "बही", + "बहुत", + "बाद", + "बाला", + "बिलकुल", + "भि", + "भितर", + "भी", + "भीतर", + "मगर", + "मानो", + "मे", + "में", + "यदि", + "यह", + "यहाँ", + "यहां", + "यहि", + "यही", + "या", + "यिह", + "ये", + "रखें", + "रवासा", + "रहा", + "रहे", + "ऱ्वासा", + "लिए", + "लिये", + "लेकिन", + "व", + "वगेरह", + "वरग", + "वर्ग", + "वह", + "वहाँ", + "वहां", + "वहिं", + "वहीं", + "वाले", + "वुह", + "वे", + "वग़ैरह", + "संग", + "सकता", + "सकते", + "सबसे", + "सभि", + "सभी", + "साथ", + "साबुत", + "साभ", + "सारा", + "से", + "सो", + "हि", + "ही", + "हुअ", + "हुआ", + "हुइ", + "हुई", + "हुए", + "हे", + "हें", + "है", + "हैं", + "हो", + "होता", + "होति", + "होती", + "होते", + "होना", + "होने", + ], + "id": [ + "Anda", + "ada", + "adakah", + "adalah", + "adanya", + "adapaun", + "adapun", + "agar", + "akan", + "akau", + "akhirnya", + "akibat", + "akibatnya", + "aku", + "alias", + "anda", + "aneka", + "antar", + "antara", + "antaranya", + "apa", + "apabila", + "apakah", + "apalagi", + "apapun", + "asal", + "atas", + "atau", + "ataukah", + "ataupun", + "bagai", + "bagaimana", + "bagaimanakah", + "bagaimanapun", + "bagi", + "bagi-nya", + "bahkan", + "bahwa", + "bahwasanya", + "baik", + "bakal", + "balik", + "banyak", + "banyaknya", + "baru", + "bawah", + "beberapa", + "begini", + "beginilah", + "begitu", + "belakang", + "beliau", + "belum", + "beragam", + "berapa", + "berapakah", + "berbagai", + "berberapa", + "berdasar", + "berdasarkan", + "berdiri", + "berdirinya", + "berikut", + "berkat", + "bersama", + "bersamanya", + "berupa", + "beserta", + "betapa", + "bila", + "bilamana", + "bisa", + "boleh", + "buah", + "buat", + "bukan", + "bukankah", + "bukanlah", + "bukannya", + "buruh", + "cara", + "dalam", + "dalamnya", + "dan", + "dapat", + "dari", + "darimana", + "daripada", + "dekat", + "demi", + "demikian", + "dengan", + "dengannya", + "depan", + "dg", + "di", + "dia", + "diantara", + "diantaranya", + 
"diatas", + "dibalik", + "dibandingkan", + "dibawah", + "dibawahnya", + "dibeberapa", + "dibelakang", + "diberbagai", + "didalam", + "didalamnya", + "diluar", + "dimana", + "diri", + "dirinya", + "disaat", + "disamping", + "disebelah", + "disekeliling", + "diseluruh", + "disini", + "ditepi", + "dng", + "dr", + "engkau", + "gambar", + "gimana", + "hadap", + "hai", + "hanya", + "harus", + "hei", + "ia", + "ialah", + "ini", + "inikah", + "inilah", + "inipun", + "isi", + "isinya", + "itu", + "itua", + "itulah", + "itupun", + "iye", + "jadi", + "jangan", + "jauh", + "jelang", + "jenis", + "jika", + "juga", + "kah", + "kalau", + "kalian", + "kalo", + "kami", + "kamilah", + "kamu", + "kan", + "kapan", + "kapankah", + "karena", + "karenanya", + "kau", + "ke", + "kebanyakan", + "kecuali", + "kedalam", + "kedepan", + "kedua", + "keduanya", + "keliling", + "keluar", + "kemudian", + "kena", + "kenapa", + "kendati", + "kepada", + "kepadaku", + "kepadamu", + "kepadanya", + "kepusatnya", + "kerana", + "keseluruhan", + "keseluruhannya", + "kesemuanya", + "ketika", + "ketimbang", + "khususnya", + "kira", + "kita", + "kok", + "koq", + "kpd", + "ku", + "la", + "lagi", + "lah", + "lain", + "lainnya", + "lalu", + "lama", + "lantaran", + "lantas", + "layak", + "layaknya", + "lengah", + "lewat", + "loh", + "luar", + "macam", + "maka", + "makanya", + "maksud", + "maksudnya", + "malahan", + "mampu", + "mana", + "manakah", + "manakala", + "manapun", + "masa", + "masing", + "masing-masing", + "maupun", + "mayoritas", + "melainkan", + "melalui", + "melawan", + "melewati", + "menajak", + "menbeli", + "mengajak", + "mengapa", + "mengenai", + "mengenainya", + "menjadi", + "menjelang", + "menuju", + "menurut", + "menurutmu", + "mereka", + "merekapun", + "merupakan", + "meski", + "meskipn", + "meskipun", + "misalkan", + "misalnya", + "msl", + "mulai", + "mungkin", + "namun", + "nya", + "oleh", + "olehnya", + "orang", + "pada", + "padahal", + "padanya", + "para", + "pasca", + "pd", + "per", + 
"perihal", + "perlu", + "pula", + "pun", + "saat", + "saatnya", + "sama", + "sambil", + "sampai", + "sampai-sampai", + "samping", + "sana", + "sang", + "satu", + "satu-satunya", + "satunya", + "saya", + "seakan", + "seandainya", + "seantero", + "sebab", + "sebagai", + "sebagaimana", + "sebagian", + "sebaliknya", + "sebangsa", + "sebanyak", + "sebelah", + "sebelum", + "sebelumnya", + "seberang", + "seberat", + "sebesar", + "sebuah", + "secara", + "sedang", + "sedangkan", + "sedangkkan", + "sedari", + "sedikit", + "sedikitnya", + "seekor", + "segala", + "segenap", + "seharusnya", + "sehingga", + "sehubungan", + "seiring", + "sejak", + "sejauh", + "sejenis", + "sejumlah", + "sekali", + "sekaligus", + "sekalipun", + "sekitar", + "sekitarnya", + "selain", + "selaku", + "selama", + "selesai", + "seluas", + "seluruh", + "semacam", + "semasa", + "semenjak", + "sementara", + "sempat", + "semua", + "semuanya", + "sendiri", + "senilai", + "seorang", + "sepanjang", + "sepasang", + "sepeninggal", + "seperti", + "sepertinya", + "sepeti", + "sepucuk", + "seputar", + "serangkaian", + "seraya", + "serta", + "sesampai", + "sesampainya", + "seseorang", + "sesuai", + "sesuatu", + "sesudah", + "setebal", + "setelah", + "setelahnya", + "setengah", + "setiap", + "setinggi", + "seusai", + "sewaktu", + "si", + "siapa", + "siapakah", + "siapapun", + "silakan", + "sini", + "sinilah", + "situ", + "soal", + "suatu", + "sudah", + "supaya", + "tak", + "tan", + "tangguh", + "tanpa", + "tapi", + "tatkala", + "telah", + "tempat", + "tengah", + "tengahnya", + "tentang", + "tepat", + "tepatnya", + "teratas", + "terhadap", + "terhadapnya", + "termasuk", + "ternyata", + "tersebut", + "tertentu", + "terutama", + "tesebut", + "tetap", + "tetapi", + "tiada", + "tiap", + "tidak", + "tidakkah", + "tidaklah", + "tidaknya", + "tsb", + "tt", + "ttg", + "tuh", + "tujuh", + "untuk", + "untukmu", + "untuknya", + "untung", + "usah", + "usai", + "via", + "waktu", + "walau", + "walaupun", + "ya", + "yaitu", + 
"yakni", + "yang", + "yg", + ], + "mr": [ + "अधिक", + "अनेक", + "अशी", + "असलयाचे", + "असलेल्या", + "असा", + "असून", + "असे", + "आज", + "आणि", + "आता", + "आपल्या", + "आला", + "आली", + "आले", + "आहे", + "आहेत", + "एक", + "एका", + "कमी", + "करणयात", + "करून", + "का", + "काम", + "काय", + "काही", + "किवा", + "की", + "केला", + "केली", + "केले", + "कोटी", + "गेल्या", + "घेऊन", + "जात", + "झाला", + "झाली", + "झाले", + "झालेल्या", + "टा", + "डॉ", + "तर", + "तरी", + "तसेच", + "ता", + "ती", + "तीन", + "ते", + "तो", + "त्या", + "त्याचा", + "त्याची", + "त्याच्या", + "त्याना", + "त्यानी", + "त्यामुळे", + "त्री", + "दिली", + "दोन", + "न", + "नाही", + "निर्ण्य", + "पण", + "पम", + "परयतन", + "पाटील", + "म", + "मात्र", + "माहिती", + "मी", + "मुबी", + "म्हणजे", + "म्हणाले", + "म्हणून", + "या", + "याचा", + "याची", + "याच्या", + "याना", + "यानी", + "येणार", + "येत", + "येथील", + "येथे", + "लाख", + "व", + "व्यकत", + "सर्व", + "सागित्ले", + "सुरू", + "हजार", + "हा", + "ही", + "हे", + "होणार", + "होत", + "होता", + "होती", + "होते", + ], + "pt": [ + "a", + "a cabo de", + "a caminho de", + "a despeito de", + "a favor de", + "a fim de", + "a menos que", + "a não ser", + "a não ser que", + "a partir de", + "a propósito", + "a respeito de", + "a título de", + "abaixo de", + "acima", + "acima de", + "afinal", + "afora", + "agora", + "agora que", + "ai", + "ainda", + "ainda mais", + "algo", + "algum", + "alguma", + "algumas", + "alguns", + "alguém", + "além", + "além de", + "ambas", + "ambos", + "andar", + "andou", + "ante", + "antes", + "anti", + "antre", + "ao", + "ao cabo de", + "ao invés de", + "ao lado", + "ao longo de", + "ao passo que", + "ao redor de", + "aos cuidados de", + "apenas", + "apesar de", + "apesar de que", + "após", + "aquela", + "aquelas", + "aquele", + "aqueles", + "aquilo", + "as", + "assim", + "assim como", + "assim que", + "atras", + "através", + "através de", + "atráis", + "atrás", + "atrás de", + "até", + "até que", + "auto", + "avante", + "aí", + "bastante", + "bem", 
+ "bem como", + "cada", + "cara a cara", + "caso", + "cerca", + "cima", + "com", + "comigo", + "como", + "como se", + "conforme", + "connosco", + "conosco", + "conquanto", + "consigo", + "consoante", + "contanto", + "contanto que", + "contigo", + "contra", + "contudo", + "convosco", + "cuja", + "cujas", + "cujo", + "cujos", + "d'", + "d.", + "da", + "dada", + "dado", + "dado que", + "dali", + "daquela", + "daquelas", + "daquele", + "daqui", + "daqui a", + "daí", + "de", + "de modo que", + "dela", + "delas", + "dele", + "deles", + "demais", + "dentre", + "dentro", + "dentro de", + "depois", + "depois de", + "desde", + "desde que", + "dessa", + "dessas", + "desse", + "desses", + "desta", + "destas", + "deste", + "destes", + "detrás de", + "deva", + "devam", + "deve", + "devem", + "devemos", + "devendo", + "dever", + "deveria", + "deveriam", + "deverá", + "deverão", + "deviam", + "devido", + "devido a", + "devo", + "diante de", + "disso", + "diversas", + "diversos", + "do que", + "donde", + "doutros", + "dum", + "duma", + "durante", + "e", + "e/ou", + "eba", + "eis", + "ela", + "elas", + "ele", + "eles", + "eles/elas", + "em", + "em cima de", + "em frente a", + "em meio a", + "em nome de", + "em prol de", + "em relação a", + "em torno de", + "em vez de", + "em virtude de", + "em vista de", + "em volta de", + "embaixo de", + "embora", + "enquanto", + "entre", + "entretanto", + "então", + "era", + "eram", + "ergo", + "essa", + "essas", + "esse", + "esses", + "esta", + "estado", + "estamos", + "estando", + "estar", + "estarem", + "estaria", + "estariam", + "estarmos", + "estará", + "estarão", + "estas", + "estava", + "estavam", + "este", + "esteja", + "estejam", + "estes", + "esteve", + "estivemos", + "estiver", + "estiveram", + "estiverem", + "estivesse", + "estivessem", + "estou", + "está", + "estávamos", + "estão", + "eu", + "excepto", + "exceto", + "fica", + "ficado", + "ficamos", + "ficando", + "ficar", + "ficaram", + "ficaria", + "ficou", + "fiquei", + "foi", + 
"fomos", + "for", + "fora", + "fora de", + "foram", + "forem", + "fosse", + "fossem", + "frente a", + "fui", + "fôr", + "gente", + "graças", + "graças a", + "havendo", + "haver", + "haverem", + "havia", + "haviam", + "houver", + "houvesse", + "há", + "i.e.", + "ia", + "iam", + "ido", + "igual a", + "inté", + "invés de", + "ir", + "ireii", + "irem", + "iremos", + "iria", + "iriam", + "irá", + "irão", + "isso", + "isto", + "junto a", + "junto com", + "já", + "já que", + "la", + "las", + "lhe", + "lhes", + "lo", + "logo", + "logo que", + "los", + "lá", + "mais", + "mais de", + "mais do que", + "mais que", + "mal", + "malgrado", + "mas", + "me", + "mediante", + "menos", + "mesma", + "mesmas", + "mesmo", + "mesmo que", + "mesmo se", + "mesmos", + "meu", + "meus", + "mim", + "minha", + "minhas", + "muita", + "muitas", + "muito", + "muito menos", + "muitos", + "muitíssimo", + "n'", + "na", + "na frente de", + "na sequência de", + "nada", + "naquela", + "naquele", + "naqueles", + "naquilo", + "nas", + "nele", + "neles", + "nem", + "nenhum", + "nenhuma", + "nenhumas", + "nenhuns", + "nessa", + "nessas", + "nesse", + "nesses", + "nesta", + "nestas", + "neste", + "nestes", + "ninguém", + "no", + "no que", + "nos", + "nosco", + "nossa", + "nossas", + "nosso", + "nossos", + "num", + "numa", + "nós", + "o", + "o(s)", + "onde", + "onde quer que", + "ora", + "os", + "ou", + "outra", + "outras", + "outrem", + "outro", + "outros", + "outrém", + "oxalá", + "p'ra", + "p/", + "pa", + "para", + "para com", + "para que", + "parece", + "parecer", + "pelo", + "per", + "perante", + "perantes", + "permanece", + "permanecer", + "perto de", + "pode", + "podem", + "podemos", + "podendo", + "poder", + "poderei", + "poderem", + "poderemos", + "poderia", + "poderiam", + "poderá", + "poderão", + "poderíamos", + "podia", + "podiam", + "podíamos", + "pois", + "por", + "por causa de", + "por causa que", + "por conta de", + "por entre", + "por isso", + "por isto", + "por meio de", + "por trás", + "por 
trás de", + "por volta de", + "porquanto", + "porque", + "portanto", + "porém", + "possa", + "possam", + "possamos", + "posso", + "pouca", + "poucas", + "pouco", + "poucos", + "pouquíssimos", + "pra", + "precisam", + "precisar", + "precisaram", + "precisarão", + "precisou", + "prestes a", + "pretender", + "pretendiam", + "pro", + "pré", + "pré-", + "pró", + "pude", + "pudemos", + "puderam", + "puderem", + "pudesse", + "pudessem", + "pós", + "pôde", + "pôr", + "público", + "q.b.", + "quais", + "quaisquer", + "qual", + "qualquer", + "quando", + "quanta", + "quantas", + "quanto", + "quanto a", + "quanto baste", + "quanto mais", + "quantos", + "que", + "quem", + "quer", + "quão", + "quê", + "rente a", + "rente de", + "rumo a", + "se", + "se bem que", + "se e somente se", + "se-", + "segundo", + "seja", + "sejam", + "sem", + "sem falar de", + "sempre que", + "sendo", + "sendo que", + "senão", + "ser", + "serei", + "serem", + "seremos", + "seria", + "seriam", + "sermos", + "será", + "serão", + "seu", + "seus", + "si", + "sido", + "sob", + "sobre", + "somos", + "sou", + "sse", + "sua", + "suas", + "sub", + "são", + "sê", + "só que", + "sôbre", + "ta", + "tais", + "tal", + "tampouco", + "tanta", + "tantas", + "tanto", + "tantos", + "te", + "tem", + "temos", + "tende", + "tendo", + "tenha", + "tenham", + "tenhamos", + "tenho", + "tentado", + "tentar", + "tentaram", + "ter", + "terei", + "terem", + "teremos", + "teria", + "teriam", + "termos", + "terá", + "terão", + "teríamos", + "teu", + "teus", + "teve", + "ti", + "tido", + "tinha", + "tinham", + "tive", + "tivemos", + "tiver", + "tiveram", + "tiverem", + "tivesse", + "tivessem", + "to", + "toda", + "todas", + "todavia", + "todo", + "todos", + "trás", + "tu", + "tua", + "tuas", + "tudo", + "tá", + "tão", + "tão logo", + "té", + "têm", + "tínhamos", + "ultra", + "um", + "uma", + "uma vez que", + "umas", + "uns", + "vai", + "vais", + "vamos", + "varias", + "varios", + "versus", + "via", + "visto", + "visto que", + "voce", + 
"você", + "vocês", + "vos", + "vossa", + "vossas", + "vosso", + "vossos", + "vou", + "vs", + "vá", + "várias", + "vários", + "vão", + "vérsus", + "vós", + "à", + "à beira de", + "à custa de", + "à expensa de", + "à luz de", + "à medida que", + "àquela", + "àqueles", + "às", + "às custas de", + "às expensas de", + "é", + "íamos", + "\u200b\u200bem", + ], + "sw": [ + "akasema", + "alikuwa", + "alisema", + "baada", + "basi", + "bila", + "cha", + "chini", + "hadi", + "hapo", + "hata", + "hivyo", + "hiyo", + "huku", + "huo", + "ili", + "ilikuwa", + "juu", + "kama", + "karibu", + "katika", + "kila", + "kima", + "kisha", + "kubwa", + "kutoka", + "kuwa", + "kwa", + "kwamba", + "kwenda", + "kwenye", + "la", + "lakini", + "mara", + "mdogo", + "mimi", + "mkubwa", + "mmoja", + "moja", + "muda", + "mwenye", + "na", + "naye", + "ndani", + "ng", + "ni", + "nini", + "nonkungu", + "pamoja", + "pia", + "sana", + "sasa", + "sauti", + "tafadhali", + "tena", + "tu", + "vile", + "wa", + "wakati", + "wake", + "walikuwa", + "wao", + "watu", + "wengine", + "wote", + "ya", + "yake", + "yangu", + "yao", + "yeye", + "yule", + "za", + "zaidi", + "zake", + ], + "ur": [ + "آئی", + "آئے", + "آج", + "آخر", + "آخرکبر", + "آدهی", + "آًب", + "آٹھ", + "آیب", + "اة", + "اخبزت", + "اختتبم", + "ادھر", + "ارد", + "اردگرد", + "ارکبى", + "اش", + "اضتعوبل", + "اضتعوبلات", + "اضطرذ", + "اضکب", + "اضکی", + "اضکے", + "اطراف", + "اغیب", + "افراد", + "الگ", + "اور", + "اوًچب", + "اوًچبئی", + "اوًچی", + "اوًچے", + "اى", + "اً", + "اًذر", + "اًہیں", + "اٹھبًب", + "اپٌب", + "اپٌے", + "اچھب", + "اچھی", + "اچھے", + "اکثر", + "اکٹھب", + "اکٹھی", + "اکٹھے", + "اکیلا", + "اکیلی", + "اکیلے", + "اگرچہ", + "اہن", + "ایطے", + "ایک", + "ب", + "ت", + "تبزٍ", + "تت", + "تر", + "ترتیت", + "تریي", + "تعذاد", + "تن", + "تو", + "توبم", + "توہی", + "توہیں", + "تٌہب", + "تک", + "تھب", + "تھوڑا", + "تھوڑی", + "تھوڑے", + "تھی", + "تھے", + "تیي", + "ثب", + "ثبئیں", + "ثبترتیت", + "ثبری", + "ثبرے", + "ثبعث", + "ثبلا", + "ثبلترتیت", + 
"ثبہر", + "ثدبئے", + "ثرآں", + "ثراں", + "ثرش", + "ثعذ", + "ثغیر", + "ثلٌذ", + "ثلٌذوثبلا", + "ثلکہ", + "ثي", + "ثٌب", + "ثٌبرہب", + "ثٌبرہی", + "ثٌبرہے", + "ثٌبًب", + "ثٌذ", + "ثٌذکرو", + "ثٌذکرًب", + "ثٌذی", + "ثڑا", + "ثڑوں", + "ثڑی", + "ثڑے", + "ثھر", + "ثھرا", + "ثھراہوا", + "ثھرپور", + "ثھی", + "ثہت", + "ثہتر", + "ثہتری", + "ثہتریي", + "ثیچ", + "ج", + "خب", + "خبرہب", + "خبرہی", + "خبرہے", + "خبهوظ", + "خبًب", + "خبًتب", + "خبًتی", + "خبًتے", + "خبًٌب", + "خت", + "ختن", + "خجکہ", + "خص", + "خططرذ", + "خلذی", + "خو", + "خواى", + "خوًہی", + "خوکہ", + "خٌبة", + "خگہ", + "خگہوں", + "خگہیں", + "خیطب", + "خیطبکہ", + "در", + "درخبت", + "درخہ", + "درخے", + "درزقیقت", + "درضت", + "دش", + "دفعہ", + "دلچطپ", + "دلچطپی", + "دلچطپیبں", + "دو", + "دور", + "دوراى", + "دوضرا", + "دوضروں", + "دوضری", + "دوضرے", + "دوًوں", + "دکھبئیں", + "دکھبتب", + "دکھبتی", + "دکھبتے", + "دکھبو", + "دکھبًب", + "دکھبیب", + "دی", + "دیب", + "دیتب", + "دیتی", + "دیتے", + "دیر", + "دیٌب", + "دیکھو", + "دیکھٌب", + "دیکھی", + "دیکھیں", + "دے", + "ر", + "راضتوں", + "راضتہ", + "راضتے", + "رریعہ", + "رریعے", + "رکي", + "رکھ", + "رکھب", + "رکھتب", + "رکھتبہوں", + "رکھتی", + "رکھتے", + "رکھی", + "رکھے", + "رہب", + "رہی", + "رہے", + "ز", + "زبصل", + "زبضر", + "زبل", + "زبلات", + "زبلیہ", + "زصوں", + "زصہ", + "زصے", + "زقبئق", + "زقیتیں", + "زقیقت", + "زکن", + "زکویہ", + "زیبدٍ", + "صبف", + "صسیر", + "صفر", + "صورت", + "صورتسبل", + "صورتوں", + "صورتیں", + "ض", + "ضبت", + "ضبتھ", + "ضبدٍ", + "ضبرا", + "ضبرے", + "ضبل", + "ضبلوں", + "ضت", + "ضرور", + "ضرورت", + "ضروری", + "ضلطلہ", + "ضوچ", + "ضوچب", + "ضوچتب", + "ضوچتی", + "ضوچتے", + "ضوچو", + "ضوچٌب", + "ضوچی", + "ضوچیں", + "ضکب", + "ضکتب", + "ضکتی", + "ضکتے", + "ضکٌب", + "ضکی", + "ضکے", + "ضیذھب", + "ضیذھی", + "ضیذھے", + "ضیکٌڈ", + "ضے", + "طرف", + "طریق", + "طریقوں", + "طریقہ", + "طریقے", + "طور", + "طورپر", + "ظبہر", + "ع", + "عذد", + "عظین", + "علاقوں", + "علاقہ", + "علاقے", + "علاوٍ", + "عووهی", + "غبیذ", + "غخص", + "غذ", + "غروع", + "غروعبت", + "غے", 
+ "فرد", + "فی", + "ق", + "قجل", + "قجیلہ", + "قطن", + "لئے", + "لا", + "لازهی", + "لو", + "لوجب", + "لوجی", + "لوجے", + "لوسبت", + "لوسہ", + "لوگ", + "لوگوں", + "لڑکپي", + "لگتب", + "لگتی", + "لگتے", + "لگٌب", + "لگی", + "لگیں", + "لگے", + "لی", + "لیب", + "لیٌب", + "لیں", + "لے", + "ه", + "هتعلق", + "هختلف", + "هسترم", + "هسترهہ", + "هسطوش", + "هسیذ", + "هطئلہ", + "هطئلے", + "هطبئل", + "هطتعول", + "هطلق", + "هعلوم", + "هػتول", + "هلا", + "هوکي", + "هوکٌبت", + "هوکٌہ", + "هٌبضت", + "هڑا", + "هڑًب", + "هڑے", + "هکول", + "هگر", + "هہرثبى", + "هیرا", + "هیری", + "هیرے", + "هیں", + "و", + "وار", + "والے", + "وٍ", + "ًئی", + "ًئے", + "ًب", + "ًبپطٌذ", + "ًبگسیر", + "ًطجت", + "ًقطہ", + "ًو", + "ًوخواى", + "ًکبلٌب", + "ًکتہ", + "ًہ", + "ًہیں", + "ًیب", + "ًے", + "ٓ آش", + "ٹھیک", + "پبئے", + "پبش", + "پبًب", + "پبًچ", + "پر", + "پراًب", + "پطٌذ", + "پل", + "پورا", + "پوچھب", + "پوچھتب", + "پوچھتی", + "پوچھتے", + "پوچھو", + "پوچھوں", + "پوچھٌب", + "پوچھیں", + "پچھلا", + "پھر", + "پہلا", + "پہلی", + "پہلےضی", + "پہلےضے", + "پہلےضےہی", + "پیع", + "چبر", + "چبہب", + "چبہٌب", + "چبہے", + "چلا", + "چلو", + "چلیں", + "چلے", + "چکب", + "چکی", + "چکیں", + "چکے", + "چھوٹب", + "چھوٹوں", + "چھوٹی", + "چھوٹے", + "چھہ", + "چیسیں", + "ڈھوًڈا", + "ڈھوًڈلیب", + "ڈھوًڈو", + "ڈھوًڈًب", + "ڈھوًڈی", + "ڈھوًڈیں", + "ک", + "کئی", + "کئے", + "کب", + "کبفی", + "کبم", + "کت", + "کجھی", + "کرا", + "کرتب", + "کرتبہوں", + "کرتی", + "کرتے", + "کرتےہو", + "کررہب", + "کررہی", + "کررہے", + "کرو", + "کرًب", + "کریں", + "کرے", + "کطی", + "کل", + "کن", + "کوئی", + "کوتر", + "کورا", + "کوروں", + "کورٍ", + "کورے", + "کوطي", + "کوى", + "کوًطب", + "کوًطی", + "کوًطے", + "کھولا", + "کھولو", + "کھولٌب", + "کھولی", + "کھولیں", + "کھولے", + "کہ", + "کہب", + "کہتب", + "کہتی", + "کہتے", + "کہو", + "کہوں", + "کہٌب", + "کہی", + "کہیں", + "کہے", + "کی", + "کیب", + "کیطب", + "کیطرف", + "کیطے", + "کیلئے", + "کیوًکہ", + "کیوں", + "کیے", + "کے", + "کےثعذ", + "کےرریعے", + "گئی", + "گئے", + "گب", + "گرد", + "گروٍ", + "گروپ", 
+ "گروہوں", + "گٌتی", + "گی", + "گیب", + "گے", + "ہر", + "ہن", + "ہو", + "ہوئی", + "ہوئے", + "ہوا", + "ہوبرا", + "ہوبری", + "ہوبرے", + "ہوتب", + "ہوتی", + "ہوتے", + "ہورہب", + "ہورہی", + "ہورہے", + "ہوضکتب", + "ہوضکتی", + "ہوضکتے", + "ہوًب", + "ہوًی", + "ہوًے", + "ہوچکب", + "ہوچکی", + "ہوچکے", + "ہوگئی", + "ہوگئے", + "ہوگیب", + "ہوں", + "ہی", + "ہیں", + "ہے", + "ی", + "یقیٌی", + "یہ", + "یہبں", + ], + "vi": [ + "bên", + "bấy nhiêu", + "bằng", + "bởi", + "cc", + "chao", + "cho", + "cho dù", + "chán", + "chính", + "chút", + "chứ", + "các", + "cái", + "còn", + "có", + "có vẻ", + "cùng", + "cơ mà", + "cả", + "của", + "do", + "do vậy", + "do đó", + "duy", + "dù", + "dù sao", + "dù vậy", + "dưới", + "dường như", + "dạ", + "dẫu", + "dẫu vậy", + "giữa", + "gì", + "hay", + "hay là", + "hoặc", + "hơn nữa", + "hả", + "hầu hết", + "hết", + "hề", + "hễ", + "không những", + "l", + "là", + "lên", + "lại nữa", + "lẫn", + "lắm", + "mà", + "mà còn", + "mấy", + "mặc dù", + "mặt khác", + "mọi", + "mỗi", + "một chút", + "một nửa", + "một số", + "một vài", + "một ít", + "ngay", + "ngoài", + "ngoài ra", + "ngược lại", + "nhá", + "nhân", + "nhé", + "như", + "như vậy", + "nhưng", + "nhất là", + "nhằm", + "nhỉ", + "nhờ", + "những", + "nào", + "này", + "nè", + "nên", + "nếu", + "nếu như", + "nửa", + "nữa", + "phía", + "phần lớn", + "qua", + "quả", + "ra", + "riêng", + "rùi", + "rằng", + "rồi", + "sang", + "sao", + "sau", + "song", + "thay", + "theo", + "thiệt", + "thì", + "thí dụ", + "thôi", + "thật", + "thế", + "thế là", + "thế mà", + "thế nhưng", + "toàn", + "toàn bộ", + "toàn thể", + "trong", + "trên", + "trước", + "trời", + "tuy", + "tuy nhiên", + "tuy vậy", + "tóm lại", + "tại", + "tất cả", + "tận", + "tổ", + "tới", + "tức", + "tức là", + "từ", + "ui", + "và", + "vài", + "vài ba", + "vào", + "vì", + "vì thế", + "vì vậy", + "ví dụ", + "vô", + "vô số", + "vô vàn", + "vậy", + "vậy là", + "vậy mà", + "về", + "với", + "xuống", + "à", + "đa số", + "đi", + "đâu", + "đây", + "đó", + "đôi", + 
"được", + "đấy", + "đến", + "để", + "đối với", + "ạ", + "ấy", + "ở", + ], + "yo": [ + "a", + "an", + "bá", + "bí", + "bẹ̀rẹ̀", + "fún", + "fẹ́", + "gbogbo", + "inú", + "jù", + "jẹ", + "jẹ́", + "kan", + "kì", + "kí", + "kò", + "láti", + "lè", + "lọ", + "mi", + "mo", + "máa", + "mọ̀", + "ni", + "náà", + "ní", + "nígbà", + "nítorí", + "nǹkan", + "o", + "padà", + "pé", + "púpọ̀", + "pẹ̀lú", + "rẹ̀", + "sì", + "sí", + "sínú", + "ṣ", + "ti", + "tí", + "wà", + "wá", + "wọn", + "wọ́n", + "yìí", + "àti", + "àwọn", + "é", + "í", + "òun", + "ó", + "ń", + "ńlá", + "ṣe", + "ṣé", + "ṣùgbọ́n", + "ẹmọ́", + "ọjọ́", + "ọ̀pọ̀lọpọ̀", + ], + "zh": [ + "", + "一", + "一争", + "一些", + "一切", + "一旦", + "一点", + "一爭", + "上", + "上前", + "上表", + "下", + "不", + "不仅", + "不会", + "不但", + "不僅", + "不光", + "不关", + "不准", + "不单", + "不可", + "不單", + "不够", + "不夠", + "不应", + "不得", + "不想", + "不愿", + "不應", + "不是", + "不會", + "不準", + "不用", + "不管", + "不經", + "不肯", + "不能", + "不要", + "不該", + "不論", + "不论", + "不该", + "不過", + "不需", + "不願", + "与", + "与其", + "且", + "且是", + "並", + "並且", + "並非", + "个", + "个人", + "中", + "临", + "为", + "为了", + "为人", + "为什么", + "主", + "乃至", + "之", + "之上", + "之下", + "之中", + "之內", + "之内", + "之初", + "之前", + "之后", + "之外", + "之後", + "之所以", + "之时", + "之時", + "之間", + "之间", + "也", + "也是", + "书", + "了", + "争辩", + "事", + "于", + "井", + "亚", + "亞", + "亦为", + "亦是", + "亦為", + "亭", + "亲", + "人", + "人人", + "人家", + "什么", + "什麼", + "今", + "仍是", + "仍算", + "从", + "他", + "他们", + "他俩", + "他倆", + "他們", + "代", + "令", + "以", + "以上", + "以下", + "以为", + "以來", + "以前", + "以北", + "以及", + "以后", + "以外", + "以往", + "以後", + "以来", + "以為", + "以爲", + "以至", + "们", + "价", + "任", + "任何", + "众", + "会", + "传", + "伪", + "似乎", + "似的", + "但", + "但是", + "位", + "低", + "住", + "体", + "何", + "何方", + "佛", + "作", + "作为", + "作為", + "你", + "你们", + "你們", + "你自己", + "你门", + "佬", + "併", + "使", + "來", + "供", + "依", + "依据", + "依據", + "依照", + "依靠", + "侠", + "侧", + "侨", + "侯", + "便是", + "係", + "保存", + "保級", + "保级", + "俠", + "信", + "修复", + "修復", + "個", + 
"個人", + "們", + "倘若", + "借助", + "借由", + "借着", + "值", + "假使", + "假如", + "偏", + "做", + "側", + "偽", + "傳", + "傻", + "像", + "像是", + "僑", + "價", + "儘管", + "元", + "先", + "光", + "光棍", + "党", + "內", + "內外", + "全", + "全体", + "全副", + "全套", + "全部", + "全體", + "公", + "关", + "关于", + "关心", + "兵", + "其", + "其中", + "其他", + "其余", + "其它", + "其餘", + "典", + "兼", + "内", + "内外", + "军", + "冠", + "冢", + "冲", + "冷", + "准", + "准备", + "减慢", + "几", + "凭", + "凭借", + "出手", + "刀", + "分", + "分布", + "列", + "则为", + "则是", + "初", + "別", + "別人", + "别", + "别人", + "别的", + "到", + "到处", + "制", + "券", + "剂", + "則是", + "則為", + "前", + "前任", + "前后", + "前後", + "剑", + "剧", + "副", + "劇", + "劍", + "劑", + "力", + "办", + "办学", + "功", + "加", + "劣", + "努力", + "包", + "包裹", + "化", + "区", + "医", + "區", + "半", + "单", + "卡", + "卫", + "即", + "即使", + "即便", + "却是", + "卻", + "卻是", + "卿", + "厂", + "厅", + "历届", + "压", + "原", + "去", + "县", + "又", + "又或", + "又是", + "及", + "友", + "发展", + "发育", + "变", + "变得", + "口", + "古", + "另", + "另外", + "只是", + "只有", + "只能", + "只要", + "可", + "可以", + "可是", + "可能", + "台", + "史", + "叶", + "号", + "司", + "吃", + "各", + "各个", + "各位", + "各個", + "各天", + "各州", + "各式", + "各樣", + "各种", + "各种各样", + "各種", + "各種各樣", + "各类", + "各級", + "各级", + "各自", + "各項", + "各類", + "各项", + "同", + "同年", + "名", + "后", + "向", + "吗", + "君", + "否", + "吧", + "呀", + "员", + "呢", + "周", + "味", + "和", + "和美", + "咱们", + "品", + "哈尔滨", + "哈爾濱", + "員", + "哪", + "哪个", + "哪些", + "哪個", + "哪儿", + "哪兒", + "哪怕", + "哪裏", + "哪裡", + "哪里", + "唯有", + "商", + "啊", + "啦", + "喇", + "喜", + "喜欢", + "喜歡", + "單", + "單憑", + "嗎", + "嗬", + "嘛", + "嘴", + "器", + "回", + "因", + "因为", + "因应", + "因應", + "因此", + "因為", + "团", + "园", + "围", + "国", + "图", + "圆", + "圈", + "國", + "圍", + "園", + "圓", + "圖", + "團", + "土", + "圣", + "在", + "在內", + "在内", + "地", + "场", + "坊", + "坟", + "坡", + "型", + "埋", + "城", + "埤", + "執政", + "基", + "基于", + "基於", + "堂", + "堡", + "堤", + "報", + "場", + "塔", + "塘", + "墓", + "墙", + "增長", + "增长", + "墟", + "墳", + "壓", + "士", + "处", + "外", + "多", + "多少", + 
"多次", + "夜", + "够", + "夠", + "夢", + "大", + "大家", + "天", + "头", + "夹", + "夾", + "奏", + "奖", + "套", + "女", + "女士们", + "女士门", + "奸", + "她", + "她们", + "她俩", + "她倆", + "她們", + "好", + "好了", + "好像", + "如", + "如何", + "如同", + "如果", + "妃", + "妇", + "妳", + "妹", + "始", + "娘", + "婆", + "婦", + "子", + "孔", + "字", + "季", + "学", + "學", + "宁愿", + "它", + "它们", + "它們", + "安全", + "宏", + "宗", + "官", + "实属", + "审", + "客", + "室", + "宫", + "宮", + "家", + "宽", + "富", + "實屬", + "審", + "寬", + "对", + "对于", + "对方", + "对此", + "寺", + "将", + "將", + "對", + "對方", + "對於", + "對此", + "小", + "尖", + "就", + "就是", + "就算", + "尸", + "尽管", + "局", + "层", + "屋", + "屍", + "展", + "属", + "層", + "屬", + "屯", + "山", + "屿", + "岗", + "岛", + "岩", + "岭", + "岸", + "峡", + "峰", + "島", + "峽", + "崖", + "崗", + "嶺", + "嶼", + "川", + "州", + "工", + "左右", + "差", + "巷", + "币", + "市", + "布", + "师", + "希望", + "帝", + "带", + "師", + "席", + "帮", + "帶", + "帽", + "幣", + "幫", + "年", + "并", + "并且", + "并非", + "幾", + "庄", + "床", + "庐", + "库", + "应", + "应当", + "应该", + "底", + "店", + "庙", + "府", + "度", + "座", + "庫", + "庭", + "廟", + "廠", + "廬", + "廳", + "廷", + "建基於", + "开口", + "开始", + "式", + "弯", + "張", + "強", + "弹", + "强", + "彈", + "彎", + "当", + "当中", + "当届", + "录", + "形", + "形容", + "形成", + "影响", + "影響", + "彼此", + "往", + "径", + "待", + "很多", + "後", + "徑", + "徒", + "得", + "得宠", + "得寵", + "從", + "御", + "微", + "徽", + "心", + "必", + "必須", + "必须", + "志", + "快", + "态", + "怎么样", + "怎樣", + "怎麼", + "怕", + "性", + "怪", + "总", + "恆", + "恋", + "恒", + "您", + "想", + "愛", + "感", + "感到", + "感覺", + "感觉", + "愿意", + "態", + "憑", + "憑藉", + "懂", + "懂得", + "應", + "應當", + "應該", + "懒得", + "戀", + "戏", + "我", + "我们", + "我們", + "我自己", + "我门", + "或", + "或是", + "或者", + "战", + "截止", + "截至", + "戰", + "戲", + "戶", + "户", + "房", + "所", + "所以", + "所有", + "手", + "才是", + "打", + "执政", + "把", + "报", + "拖", + "持續", + "按", + "按照", + "挡", + "损失", + "据", + "排行", + "接唱", + "接触", + "接觸", + "控制", + "推进", + "推進", + "描述", + "損失", + "擋", + "據", + "支", + "教", + "敢", + "数", + "整", + "整个", + "整個", + 
"整场", + "整块", + "整場", + "整塊", + "整套", + "整所", + "整架", + "整片", + "整顆", + "整颗", + "數", + "文", + "斋", + "斗", + "新", + "方", + "於", + "族", + "旗", + "无论", + "既", + "既是", + "既然", + "日", + "日趋", + "日趨", + "旧", + "时", + "星", + "是", + "是否", + "是否是", + "是次", + "显", + "显得", + "時", + "晚", + "暖", + "暗", + "暨", + "曲", + "更为", + "更是", + "更為", + "更趋", + "更趨", + "書", + "替", + "會", + "會不會", + "月", + "有", + "有些", + "有关", + "有的", + "有關", + "服", + "朝", + "期", + "期間", + "期间", + "未能", + "末", + "本", + "本人", + "本地", + "本屆", + "本届", + "本班", + "本身", + "术", + "机", + "权", + "杆", + "材", + "村", + "束", + "来", + "杯", + "板", + "林", + "枪", + "架", + "某", + "某个", + "某些", + "某個", + "某种", + "某種", + "染色", + "柜", + "树", + "校", + "株", + "核", + "根据", + "根據", + "格", + "案", + "档", + "桥", + "桨", + "桿", + "梁", + "梁耀忠", + "梦", + "棍", + "棒", + "棚", + "椭", + "業", + "楼", + "榜", + "槍", + "槳", + "樂", + "樂意", + "樓", + "樹", + "橋", + "橙", + "機", + "橢", + "檔", + "櫃", + "權", + "次", + "欲", + "款", + "歌", + "正", + "正如", + "正是", + "此", + "此套", + "此次", + "此种", + "此種", + "此等", + "此类", + "此項", + "此類", + "此项", + "歷", + "歷屆", + "死", + "段", + "殿", + "母", + "毎年", + "每", + "每个", + "每位", + "每個", + "每元", + "每升", + "每卡", + "每周", + "每天", + "每幅", + "每年", + "每座", + "每当", + "每戶", + "每户", + "每所", + "每日", + "每枚", + "每次", + "每段", + "每片", + "每秒", + "每組", + "每组", + "每边", + "每週", + "每邊", + "每間", + "每间", + "每队", + "每隊", + "每集", + "每首", + "毒", + "比", + "比如說", + "比起", + "氏", + "气", + "氣", + "水", + "永保", + "江", + "池", + "沒", + "沒有", + "沒能", + "沟", + "没", + "没有", + "没能", + "河", + "治军", + "治軍", + "沼", + "沿", + "沿着", + "沿著", + "況且", + "泉", + "法", + "波", + "洋", + "洞", + "洲", + "派", + "流沙", + "浅", + "浊", + "浓", + "浦", + "海", + "涉世", + "涌", + "液", + "淡", + "深", + "深感", + "混", + "淺", + "清", + "減慢", + "渡", + "港", + "湖", + "湾", + "準", + "準備", + "溝", + "溥仪", + "溥儀", + "溪", + "满", + "满洲", + "滩", + "滿", + "滿洲", + "潮", + "澡", + "澳", + "濁", + "濃", + "灘", + "灣", + "火", + "炉", + "炎", + "炮", + "点", + "為", + "為了", + "為人", + "烃", + "烟", + "热", + "烴", + "無", + "無論", + 
"煙", + "熟", + "熱", + "營", + "爐", + "爭取", + "爭辯", + "爱", + "爲", + "父", + "爷", + "爺", + "牆", + "片", + "版", + "牌", + "牠", + "牠們", + "物", + "犯", + "状", + "狀", + "狂", + "狗", + "狮", + "猫", + "獅", + "獎", + "獲利", + "率", + "王", + "班", + "球", + "琴", + "甚么", + "甚至", + "甚至是", + "甚麼", + "甚麽", + "生", + "用", + "由", + "由于", + "由於", + "电", + "男", + "町", + "画", + "界", + "畔", + "畫", + "當", + "當中", + "當屆", + "病", + "症", + "癌", + "癖", + "發展", + "發育", + "的", + "的話", + "的话", + "皮", + "盃", + "监管", + "盖因", + "監管", + "目", + "直到", + "直至", + "相对", + "相對", + "相比", + "省", + "看", + "看似", + "看得", + "眼", + "眾", + "眾多", + "着", + "督", + "瞭", + "短", + "石", + "矿", + "码", + "砲", + "硅", + "碑", + "碱", + "碼", + "礁", + "礦", + "礼", + "社", + "祂", + "神", + "祠", + "禮", + "离", + "离开", + "秀", + "私交", + "秋", + "种", + "科", + "秤", + "稅", + "税", + "種", + "突感", + "窑", + "窟", + "窯", + "站", + "端", + "競選", + "符", + "笨", + "等", + "管", + "管理", + "箱", + "節", + "篇", + "籍", + "米", + "类", + "粉", + "精", + "糖", + "系", + "紀", + "紅", + "紋", + "純", + "紙", + "級", + "素", + "組", + "結", + "給", + "綉", + "經", + "經由", + "經過", + "綜", + "綫", + "綱", + "網", + "線", + "緣", + "縣", + "縱使", + "總", + "繞", + "繼", + "红", + "级", + "纪", + "纯", + "纲", + "纵使", + "纸", + "纹", + "线", + "组", + "经", + "经由", + "经过", + "结", + "绕", + "给", + "绣", + "继", + "综", + "网", + "罩", + "罪", + "署", + "羊", + "美", + "群", + "翁", + "老", + "者", + "而", + "而且", + "而已", + "而是", + "而非", + "聖", + "肉", + "肯", + "肺", + "胎", + "胚", + "胶", + "能", + "能否", + "能够", + "能夠", + "脚", + "脸", + "腔", + "腳", + "腿", + "膜", + "膠", + "臉", + "臨", + "自", + "自从", + "自家", + "自己", + "自從", + "自我", + "自身", + "至", + "至于", + "至於", + "臺", + "與", + "與其", + "舊", + "舞", + "舟", + "舰", + "舱", + "船", + "艇", + "艙", + "艦", + "色", + "节", + "花", + "若", + "若是", + "茶", + "药", + "莊", + "获利", + "菌", + "菜", + "营", + "葉", + "著", + "蓋因", + "蓝", + "藉", + "藉助", + "藉由", + "藉著", + "藍", + "藤", + "藥", + "藩", + "處", + "號", + "虽", + "虽则", + "虽然", + "蛙", + "行", + "術", + "街", + "衛", + "衣", + "表", + "表现", + "表現", + "表示", + "被", + "装", + 
"裏", + "裔", + "裙", + "裝", + "裡", + "裡面", + "裤", + "製", + "褲", + "要", + "要不要", + "要么", + "要是", + "要求", + "親", + "覺得", + "觀", + "观", + "觉得", + "角", + "計劃", + "記", + "詞", + "試圖", + "詩", + "話", + "該", + "該屆", + "該批", + "該族", + "該條", + "該段", + "該組", + "該集", + "該項", + "誌", + "認為", + "認識", + "語", + "誤信", + "說", + "誰", + "課", + "請", + "論", + "諸", + "諸如", + "謂", + "證", + "譜", + "變", + "變得", + "认为", + "认识", + "记", + "许多", + "许许多多", + "论", + "证", + "词", + "诗", + "话", + "该", + "该届", + "该批", + "该族", + "该条", + "该段", + "该组", + "该集", + "语", + "误信", + "说", + "请", + "诸", + "诸如", + "课", + "谁", + "谓", + "谱", + "谷", + "豆", + "象", + "貓", + "負債", + "費", + "資", + "賣", + "質", + "賽", + "负债", + "质", + "费", + "资", + "赛", + "起", + "起伏", + "起来", + "趁", + "超", + "趋", + "趋于", + "趨", + "趨於", + "距", + "距离", + "距離", + "跟", + "路", + "躁", + "身", + "車", + "軍", + "軒", + "軟", + "軸", + "較", + "輕", + "车", + "轩", + "软", + "轴", + "轻", + "较", + "辦", + "辦學", + "边", + "达到", + "过", + "过后", + "运作", + "近", + "还", + "还是", + "还有", + "这", + "这些", + "这儿", + "这养", + "这样", + "这次", + "这种", + "这里", + "远", + "连", + "连任", + "连同", + "迷", + "追溯", + "透过", + "透過", + "這", + "這些", + "這個", + "這兒", + "這樣", + "這樣子", + "這次", + "這種", + "這裏", + "這裡", + "這邊", + "這麼", + "通", + "通过", + "通過", + "逢", + "連", + "連任", + "連同", + "週", + "運作", + "過", + "過後", + "道", + "達到", + "遠", + "選舉", + "還是", + "邊", + "那", + "那个", + "那些", + "那儿", + "那兒", + "那样", + "那樣", + "那裏", + "那裡", + "那邊", + "那里", + "邦", + "邨", + "郎", + "郡", + "部", + "都", + "都是", + "鄉", + "配", + "酒", + "酸", + "醣", + "醫", + "里", + "里面", + "重", + "量", + "金", + "針", + "針對", + "銘", + "鋼", + "錄", + "錦", + "鍋", + "鍵", + "鎊", + "鎮", + "鏈", + "鏡", + "鐵", + "鑒於", + "针", + "针对", + "钢", + "铁", + "铭", + "链", + "锅", + "锦", + "键", + "镇", + "镜", + "長", + "长", + "門", + "開口", + "開始", + "間", + "閣", + "閣下", + "關", + "關心", + "關於", + "门", + "间", + "阁", + "队", + "阶", + "际", + "陆", + "降解", + "院", + "除", + "除了", + "除外", + "除非", + "陵", + "陸", + "隊", + "階", + "随", + "随同", + "隔", + "際", + "隨", + "隨同", + "难过", + "集", 
+ "雖", + "雖則", + "雖然", + "離", + "離開", + "難過", + "電", + "需", + "需要", + "非", + "靠", + "面", + "音", + "頂", + "須", + "頭", + "頭個", + "題", + "額", + "願意", + "類", + "顯", + "顯得", + "顶", + "须", + "题", + "额", + "風", + "风", + "飯", + "餅", + "餐", + "館", + "饃", + "首先", + "點", + ], +}