diff --git a/app.py b/app.py
index a498d3f5e9cfd97ecc0495facae59fd74a7a3bb7..7a5aa14fc72c35ad31be5a4180839685451928b3 100644
--- a/app.py
+++ b/app.py
@@ -13,6 +13,8 @@ import numpy as np
 
 import matplotlib.pyplot as plt
 
+from filtering import Filtering
+
 
 class Visualization:
     def __init__(
@@ -390,6 +392,9 @@ class Visualization:
                 ax.set_ylabel("frequency in the documents")
                 st.pyplot(fig)
 
+    def check_personal_doc(self):  # Stub: invoked with the other visualization steps.
+        pass  # Placeholder only -- no logic is implemented here yet.
+
     def download_data(self):
         st.header("Download data")
 
@@ -408,6 +413,7 @@ class Visualization:
         self.filtering_of_words()
         self.plot_distributions_filtering_parameters()
         #self.plot_zipf_law()
+        self.check_personal_doc()
         self.download_data()
 
 
diff --git a/badwords.py b/badwords.py
new file mode 100644
index 0000000000000000000000000000000000000000..64f1c200ef867bcaac1eee2645e0381a0fcee439
--- /dev/null
+++ b/badwords.py
@@ -0,0 +1,2682 @@
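+# Word lists merged from the following sources:
+# - https://github.com/zacanger/profane-words
+# - https://github.com/thisandagain/washyourmouthoutwithsoap/blob/develop/data/build.json
+# - https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
+#
+# `english_badwords` is the base list reused by the per-language `badwords` entries.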
+
+
+english_badwords = [
+    "abuse",
+    "anal",
+    "anilingus",
+    "anus",
+    "aroused",
+    "arse",
+    "arsehole",
+    "ass",
+    "asses",
+    "assfuck",
+    "asshat",
+    "asshole",
+    "assholes",
+    "autoerotic",
+    "bangbros",
+    "banging",
+    "bareback",
+    "bastard",
+    "bastards",
+    "bazongas",
+    "bbw",
+    "bdsm",
+    "biatch",
+    "bicurious",
+    "bigass",
+    "bigtits",
+    "bimbo",
+    "bimbos",
+    "bitch",
+    "bitches",
+    "bitching",
+    "blowjob",
+    "blowjobs",
+    "boche",
+    "boner",
+    "boners",
+    "boob",
+    "boobies",
+    "boobs",
+    "booty",
+    "brothel",
+    "buceta",
+    "bugger",
+    "buggered",
+    "buggery",
+    "bukkake",
+    "bule",
+    "buttcheeks",
+    "buttfuck",
+    "butthead",
+    "butthole",
+    "buttplug",
+    "cameltoe",
+    "camgirl",
+    "camwhore",
+    "chink",
+    "chinks",
+    "cialis",
+    "clit",
+    "clitoris",
+    "clits",
+    "clitty",
+    "clusterfuck",
+    "cock",
+    "cock-head",
+    "cockblock",
+    "cockfight",
+    "cockhead",
+    "cocks",
+    "cocksman",
+    "cocksucker",
+    "cocksucking",
+    "coital",
+    "coitus",
+    "coochie",
+    "cooly",
+    "coon",
+    "coons",
+    "copulate",
+    "cowgirl",
+    "crabs",
+    "creampie",
+    "cum",
+    "cumming",
+    "cums",
+    "cumshot",
+    "cumshots",
+    "cumslut",
+    "cunnilingus",
+    "cunny",
+    "cunt",
+    "cunts",
+    "cybersex",
+    "darkey",
+    "darkie",
+    "darkies",
+    "darky",
+    "deepthroat",
+    "deepthroating",
+    "dick",
+    "dickhole",
+    "dicks",
+    "dildo",
+    "dildos",
+    "dogging",
+    "doggy-style",
+    "doggystyle",
+    "dominatrix",
+    "dommes",
+    "dong",
+    "dp",
+    "dupa",
+    "dyke",
+    "dykes",
+    "ecchi",
+    "ejaculate",
+    "ejaculated",
+    "ejaculates",
+    "ejaculating",
+    "ejaculation",
+    "ejaculations",
+    "enema",
+    "erect",
+    "erection",
+    "ero",
+    "erotic",
+    "erotism",
+    "escort",
+    "fag",
+    "fagging",
+    "faggot",
+    "fagot",
+    "fagots",
+    "fags",
+    "felch",
+    "fellate",
+    "fellatio",
+    "femdom",
+    "fetish",
+    "figging",
+    "fingerbang",
+    "fingering",
+    "fisted",
+    "fister",
+    "fisting",
+    "floozy",
+    "fondle",
+    "footfetish",
+    "footjob",
+    "foreskin",
+    "fornicate",
+    "foursome",
+    "fuck",
+    "fuckable",
+    "fuckbook",
+    "fuckboy",
+    "fuckbuddy",
+    "fucked",
+    "fucker",
+    "fuckers",
+    "fuckfest",
+    "fuckhole",
+    "fuckin",
+    "fucking",
+    "fucks",
+    "fuk",
+    "fukin",
+    "fuking",
+    "g-spot",
+    "gangbang",
+    "gangbanged",
+    "gangbanger",
+    "gangbangs",
+    "genital",
+    "genitals",
+    "gigolo",
+    "glans",
+    "gonad",
+    "gonads",
+    "gook",
+    "gringo",
+    "gringos",
+    "grope",
+    "gspot",
+    "guido",
+    "handjob",
+    "haole",
+    "hapa",
+    "hardcore",
+    "hardon",
+    "harem",
+    "hentai",
+    "hindoo",
+    "hoe",
+    "hoes",
+    "honky",
+    "hooker",
+    "hookers",
+    "hooter",
+    "hooters",
+    "hori",
+    "horndog",
+    "horney",
+    "horniest",
+    "horny",
+    "humped",
+    "humper",
+    "humping",
+    "hussy",
+    "hymen",
+    "ikey",
+    "incest",
+    "injun",
+    "intercourse",
+    "interracial",
+    "jack-off",
+    "jackoff",
+    "jailbait",
+    "jerk-off",
+    "jerkoff",
+    "jiggy",
+    "jism",
+    "jizz",
+    "jizzed",
+    "kaffir",
+    "kafir",
+    "kike",
+    "kikes",
+    "kinkster",
+    "kinky",
+    "kkk",
+    "klan",
+    "kraut",
+    "labia",
+    "lapdance",
+    "libido",
+    "licker",
+    "licking",
+    "limey",
+    "lingerie",
+    "livesex",
+    "lolita",
+    "lovemaking",
+    "lust",
+    "lusting",
+    "masochist",
+    "masterbate",
+    "masterbating",
+    "masterbation",
+    "masturbate",
+    "masturbating",
+    "masturbation",
+    "milf",
+    "minge",
+    "missionary",
+    "molest",
+    "molestation",
+    "molester",
+    "munging",
+    "muschi",
+    "nads",
+    "naked",
+    "necked",
+    "necro",
+    "negress",
+    "negro",
+    "negroes",
+    "negroid",
+    "negros",
+    "nig",
+    "nigar",
+    "nigga",
+    "niggas",
+    "niggaz",
+    "nigger",
+    "niggers",
+    "nigra",
+    "nipple",
+    "nipples",
+    "nookie",
+    "nooky",
+    "nooner",
+    "nude",
+    "nudie",
+    "nudity",
+    "nymph",
+    "nympho",
+    "nymphomania",
+    "orgasim",
+    "orgasm",
+    "orgasms",
+    "orgies",
+    "orgy",
+    "orifice",
+    "p0rn",
+    "paedophile",
+    "pantie",
+    "panties",
+    "panty",
+    "pastie",
+    "pecker",
+    "pedo",
+    "pedophile",
+    "pedophilia",
+    "pedophiliac",
+    "peeper",
+    "peepshow",
+    "pegging",
+    "penetrate",
+    "penetration",
+    "penile",
+    "penis",
+    "penises",
+    "penus",
+    "perv",
+    "phallic",
+    "phonesex",
+    "pickaninnies",
+    "pimp",
+    "playboy",
+    "playgirl",
+    "poontang",
+    "porn",
+    "porno",
+    "pornography",
+    "pornos",
+    "pr0n",
+    "premature",
+    "preteen",
+    "pron",
+    "prostitute",
+    "pube",
+    "pubes",
+    "pubic",
+    "pubis",
+    "punani",
+    "pussies",
+    "pussy",
+    "pussys",
+    "pusy",
+    "puta",
+    "puto",
+    "queef",
+    "quickie",
+    "quicky",
+    "quim",
+    "randy",
+    "rape",
+    "raped",
+    "raper",
+    "raping",
+    "rapist",
+    "rectum",
+    "redneck",
+    "rednecks",
+    "redskin",
+    "redskins",
+    "rimjob",
+    "rimming",
+    "russki",
+    "s&m",
+    "sadism",
+    "sadist",
+    "sambo",
+    "santorum",
+    "schlong",
+    "scissoring",
+    "semen",
+    "sex",
+    "sexed",
+    "sexi",
+    "sexing",
+    "sexo",
+    "sexpot",
+    "sextoy",
+    "sexual",
+    "sexually",
+    "sexx",
+    "sexxx",
+    "sexxxy",
+    "sexxy",
+    "sexy",
+    "sh!t",
+    "sh1t",
+    "shagging",
+    "shemale",
+    "sissy",
+    "skank",
+    "skanks",
+    "slapper",
+    "slut",
+    "sluts",
+    "slutting",
+    "slutty",
+    "smut",
+    "smutty",
+    "sodomise",
+    "sodomite",
+    "sodomize",
+    "sodomy",
+    "spank",
+    "sperm",
+    "spic",
+    "spick",
+    "splooge",
+    "spooge",
+    "squaw",
+    "squirting",
+    "steamy",
+    "stiffy",
+    "strapon",
+    "suck",
+    "sucked",
+    "sucker",
+    "sucking",
+    "sucks",
+    "swallow",
+    "swallower",
+    "swinger",
+    "teabagging",
+    "testical",
+    "testicle",
+    "testicles",
+    "testis",
+    "threesome",
+    "threeway",
+    "titfuck",
+    "titjob",
+    "tits",
+    "tittie",
+    "titties",
+    "titty",
+    "tittyfuck",
+    "tity",
+    "toots",
+    "topless",
+    "trannie",
+    "tranny",
+    "tribadism",
+    "twat",
+    "twats",
+    "undies",
+    "undressing",
+    "upskirt",
+    "vag",
+    "vagina",
+    "vaginal",
+    "viagra",
+    "vibrator",
+    "virgin",
+    "vixen",
+    "voyeur",
+    "vulva",
+    "wank",
+    "wanker",
+    "wanking",
+    "wazoo",
+    "wedgie",
+    "wench",
+    "wetback",
+    "whore",
+    "whored",
+    "whorehouse",
+    "whores",
+    "whoring",
+    "wigger",
+    "willie",
+    "willies",
+    "willy",
+    "wog",
+    "wop",
+    "x-rated",
+    "xxx",
+    "xxxxxx",
+    "yaoi",
+    "yid",
+    "zoophile",
+    "zoophilia",
+]
+
+badwords = {
+    "ar": english_badwords
+    + [
+        "احتلام",
+        "اغتصاب",
+        "بز",
+        "بزاز",
+        "بظر",
+        "بيضان",
+        "تمص",
+        "ثدي",
+        "جماع",
+        "حلمة",
+        "خنثي",
+        "خول",
+        "زب",
+        "سحاق",
+        "سحاقية",
+        "سكس",
+        "شاذ",
+        "شرج",
+        "شرموطة",
+        "شهوة",
+        "طيز",
+        "عاهرة",
+        "عرص",
+        "فرج",
+        "قحبة",
+        "قضيب",
+        "كس",
+        "لبوة",
+        "لحس",
+        "لعق",
+        "لواط",
+        "لوطي",
+        "مبادل",
+        "متناك",
+        "متناكة",
+        "مص",
+        "مفلقسة",
+        "نيك",
+    ],
+    "ca": english_badwords
+    + [
+        "avortament",
+        "anal",
+        "anus",
+        "cul",
+        "ass-fucker",
+        "asss",
+        "asshole",
+        "assholes",
+        "bolera",
+        "boles",
+        "bastardo",
+        "bellend",
+        "bestial",
+        "bestialitat",
+        "puta",
+        "femelles",
+        "picant",
+        "sagnant",
+        "mamada",
+        "bollok",
+        "boob",
+        "pits",
+        "buceta",
+        "bum",
+        "culata",
+        "catifa muncher",
+        "picar",
+        "cipa",
+        "clitoris",
+        "polla",
+        "galletejador",
+        "gallines",
+        "coon",
+        "merda",
+        "cum",
+        "correguda",
+        "cunillingus",
+        "boig",
+        "maleït",
+        "consolador",
+        "consoladors",
+        "dink",
+        "canalla",
+        "duche",
+        "dique",
+        "ejaculació",
+        "ejaculat",
+        "ejacula",
+        "ejaculant",
+        "fag",
+        "fagging",
+        "fagot",
+        "fagots",
+        "fanny",
+        "felching",
+        "fel.lació",
+        "brida",
+        "follar",
+        "follat",
+        "escuradents",
+        "follant",
+        "folles",
+        "fucks",
+        "empacadora de llaminadures",
+        "déu maldit",
+        "deu meu",
+        "infern",
+        "hore",
+        "córrer",
+        "retrocés",
+        "kock",
+        "llavis",
+        "lujuria",
+        "lució",
+        "masoquista",
+        "masturbarse",
+        "puta mare",
+        "nazi",
+        "nigger",
+        "negres",
+        "orgasim",
+        "orgasme",
+        "orgasmes",
+        "pecker",
+        "penis",
+        "piss",
+        "mossegat",
+        "pisser",
+        "pisses",
+        "pissing",
+        "treure de polleguera",
+        "caca",
+        "porno",
+        "pornografia",
+        "picades",
+        "pube",
+        "coques",
+        "gatet",
+        "violació",
+        "violador",
+        "recte",
+        "retard",
+        "rimming",
+        "sàdic",
+        "cargolar",
+        "escrot",
+        "semen",
+        "sexe",
+        "shag",
+        "borratxos",
+        "transsexual",
+        "mossegar",
+        "shitted",
+        "skank",
+        "smegma",
+        "smut",
+        "arrebat",
+        "fill de puta",
+        "spac",
+        "spunk",
+        "testicle",
+        "tit",
+        "tetas",
+        "titt",
+        "turd",
+        "vagina",
+        "viagra",
+        "vulva",
+        "wang",
+        "wank",
+        "x classificat",
+        "xxx",
+    ],
+    "en": english_badwords,
+    "es": english_badwords
+    + [
+        "Asesinato",
+        "Bollera",
+        "Cabrón",
+        "Caca",
+        "Chupada",
+        "Chupapollas",
+        "Chupetón",
+        "Concha de tu madre",
+        "Coprofagía",
+        "Coño",
+        "Culo",
+        "Drogas",
+        "Esperma",
+        "Fiesta de salchichas",
+        "Follador",
+        "Follar",
+        "Gilipichis",
+        "Gilipollas",
+        "Hacer una paja",
+        "Haciendo el amor",
+        "Heroína",
+        "Hija de puta",
+        "Hijaputa",
+        "Hijo de puta",
+        "Hijoputa",
+        "Idiota",
+        "Imbécil",
+        "Jilipollas",
+        "Kapullo",
+        "Lameculos",
+        "Maciza",
+        "Macizorra",
+        "Mamada",
+        "Marica",
+        "Mariconazo",
+        "Maricón",
+        "Mierda",
+        "Nazi",
+        "Orina",
+        "Pedo",
+        "Pendejo",
+        "Pervertido",
+        "Pezón",
+        "Pinche",
+        "Pis",
+        "Prostituta",
+        "Puta",
+        "Racista",
+        "Ramera",
+        "Semen",
+        "Sexo",
+        "Sexo oral",
+        "Soplagaitas",
+        "Soplapollas",
+        "Sádico",
+        "Tetas grandes",
+        "Travesti",
+        "Trio",
+        "Tía buena",
+        "Verga",
+        "Vulva",
+        "aborto",
+        "agallas",
+        "anal",
+        "ano",
+        "arrebatar",
+        "asno",
+        "atornillar",
+        "bastardo",
+        "bestial",
+        "bestialidad",
+        "bolas",
+        "bollok",
+        "bolsa de pelota",
+        "brida",
+        "buceta",
+        "cabron",
+        "cagadas",
+        "cagado",
+        "cagando",
+        "campana",
+        "carajo",
+        "chupar la polla",
+        "cipa",
+        "clítoris",
+        "concha",
+        "consolador",
+        "consoladores",
+        "corrida",
+        "coño",
+        "coños",
+        "culo",
+        "culos",
+        "cunillingus",
+        "córneo",
+        "de mierda",
+        "dique",
+        "duche",
+        "enojado",
+        "escroto",
+        "espacio",
+        "estúpido",
+        "extremo",
+        "eyacula",
+        "eyaculación",
+        "eyaculado",
+        "eyacular",
+        "fagging",
+        "felación",
+        "felching",
+        "folla",
+        "follada",
+        "follador de culo",
+        "folladores",
+        "follar",
+        "fudge packer",
+        "gallos",
+        "grieta",
+        "hacerse una paja",
+        "hijo de puta",
+        "hore",
+        "infierno",
+        "kock",
+        "labios vaginales",
+        "los pechos",
+        "lujuria",
+        "madre folladora",
+        "maldita sea",
+        "maldito",
+        "maldito sea",
+        "mamada",
+        "mapache",
+        "maricones",
+        "maricón",
+        "martillo",
+        "masoquista",
+        "masturbarse",
+        "mear",
+        "mierda",
+        "molesto",
+        "muncher alfombra",
+        "nazi",
+        "negro",
+        "niggers",
+        "orgasimo",
+        "orgasmo",
+        "orgasmos",
+        "orinando",
+        "pelusa",
+        "pene",
+        "perra",
+        "perras",
+        "perro follador",
+        "pinchazo",
+        "pinchazos",
+        "pisser",
+        "polla",
+        "porno",
+        "pornografía",
+        "pube",
+        "puta",
+        "putas",
+        "pájaro carpintero",
+        "quejas",
+        "recto",
+        "retardar",
+        "rimming",
+        "sangriento",
+        "semen",
+        "sexo",
+        "skank",
+        "smegma",
+        "sádico",
+        "testículo",
+        "teta",
+        "tetas",
+        "tirón",
+        "tizón",
+        "tonto",
+        "transexual",
+        "vagina",
+        "vete a la mierda",
+        "viagra",
+        "violación",
+        "violador",
+        "vulva",
+        "wang",
+        "x clasificado",
+        "xxx",
+        "zurullo",
+    ],
+    "eu": english_badwords
+    + [
+        "abortu",
+        "anal",
+        "ipurdi",
+        "kabroi",
+        "puta",
+        "clitoris",
+        "cunillingus",
+        "madarikatu",
+        "zakil",
+        "hazia isuri",
+        "arraio",
+        "izorratu",
+        "infernu",
+        "emagaldu",
+        "lizunkeri",
+        "lizun",
+        "masokista",
+        "masturbatu",
+        "nazi",
+        "beltz",
+        "orgasmo",
+        "pixa",
+        "porno",
+        "pornografia",
+        "alu",
+        "bortxaketa",
+        "bortxatzaile",
+        "sadista",
+        "ipurzulo",
+        "hazi",
+        "semen",
+        "sexu",
+        "kaka",
+        "putaseme",
+        "barrabil",
+        "titi",
+        "bagina",
+        "viagra",
+    ],
+    "fr": english_badwords
+    + [
+        "MALPT",
+        "anal",
+        "anus",
+        "arracher",
+        "avortement",
+        "baise",
+        "baiser",
+        "baiseur de chien",
+        "baiseurs",
+        "baisée",
+        "bander",
+        "bellend",
+        "bestial",
+        "bestialité",
+        "bigornette",
+        "bite",
+        "bitte",
+        "bloblos",
+        "bollok",
+        "boob",
+        "bordel",
+        "bourré",
+        "bourrée",
+        "bout",
+        "brackmard",
+        "branlage",
+        "branler",
+        "branlette",
+        "branleur",
+        "branleuse",
+        "bride",
+        "brouter le cresson",
+        "buceta",
+        "caca",
+        "chatte",
+        "chattes",
+        "chiasse",
+        "chienne",
+        "chiennes",
+        "chier",
+        "chiottes",
+        "chié",
+        "cipa",
+        "clito",
+        "clitoris",
+        "clochard",
+        "cochonneries",
+        "con",
+        "connard",
+        "connards",
+        "connasse",
+        "conne",
+        "convoitise",
+        "coq",
+        "coqs",
+        "corné",
+        "couilles",
+        "cramouille",
+        "cran",
+        "cul",
+        "culs",
+        "cunillingus",
+        "damné",
+        "des balles",
+        "digue",
+        "duché",
+        "déconne",
+        "déconner",
+        "emballeur de fudge",
+        "emmerdant",
+        "emmerder",
+        "emmerdeur",
+        "emmerdeuse",
+        "enculer",
+        "enculeur",
+        "enculeurs",
+        "enculé",
+        "enculée",
+        "enfer",
+        "enfoiré",
+        "enfoirée",
+        "espacer",
+        "fagging",
+        "fagot",
+        "fagots",
+        "faire chier",
+        "fellation",
+        "fente",
+        "fille de pute",
+        "fils de pute",
+        "folle",
+        "foutre",
+        "fuckings",
+        "gerbe",
+        "gerber",
+        "godemiché",
+        "godes",
+        "gouine",
+        "grande folle",
+        "grogniasse",
+        "gueule",
+        "hore",
+        "jouir",
+        "kock",
+        "la putain de ta mère",
+        "les lèvres",
+        "les seins",
+        "luxure",
+        "masochiste",
+        "masturber",
+        "merde",
+        "merdeuse",
+        "merdeux",
+        "merdique",
+        "meuf",
+        "mère enculée",
+        "ménage à trois",
+        "mésange",
+        "nazi",
+        "negro",
+        "nique ta mère",
+        "nique ta race",
+        "nègre",
+        "nègres",
+        "orgasim",
+        "orgasme",
+        "orgasmes",
+        "palucher",
+        "penchant",
+        "pipe",
+        "pipi",
+        "piquer",
+        "piqûres",
+        "pisse",
+        "pisser",
+        "porno",
+        "pornographie",
+        "pouffiasse",
+        "pousse-crotte",
+        "pube",
+        "putain",
+        "putain de",
+        "pute",
+        "pédale",
+        "pédé",
+        "pénis",
+        "péter",
+        "queue",
+        "quéquette",
+        "ramoner",
+        "rectum",
+        "retard",
+        "rimming",
+        "râpé",
+        "sac de billes",
+        "sac à foutre",
+        "sac à merde",
+        "sadique",
+        "salaud",
+        "salope",
+        "salopes",
+        "sanglant",
+        "scrotum",
+        "se branler",
+        "seins",
+        "sexe",
+        "skank",
+        "smegma",
+        "sperme",
+        "suce",
+        "suceuse",
+        "tanche",
+        "tapette",
+        "tapis muncher",
+        "testicule",
+        "teuch",
+        "titt",
+        "transexuelle",
+        "tremper",
+        "tringler",
+        "trique",
+        "troncher",
+        "trou du cul",
+        "turlute",
+        "vagin",
+        "viagra",
+        "violeur",
+        "vulve",
+        "wang",
+        "x évalué",
+        "xxx",
+        "zigounette",
+        "zizi",
+        "zut",
+        "éjaculant",
+        "éjaculation",
+        "éjacule",
+        "éjaculer",
+        "éjaculé",
+        "étron",
+    ],
+    "hi": english_badwords
+    + [
+        "aand",
+        "aandu",
+        "balatkar",
+        "balatkari",
+        "behen chod",
+        "beti chod",
+        "bhadva",
+        "bhadve",
+        "bhandve",
+        "bhangi",
+        "bhootni ke",
+        "bhosad",
+        "bhosadi ke",
+        "bitching",
+        "blowjob",
+        "bollok",
+        "boobe",
+        "buceta",
+        "chakke",
+        "chinaal",
+        "chinki",
+        "chod",
+        "chodu",
+        "chodu bhagat",
+        "chooche",
+        "choochi",
+        "choope",
+        "choot",
+        "choot ke baal",
+        "chootia",
+        "chootiya",
+        "chuche",
+        "chuchi",
+        "chudaap",
+        "chudai khanaa",
+        "chudam chudai",
+        "chude",
+        "chut",
+        "chut ka chuha",
+        "chut ka churan",
+        "chut ka mail",
+        "chut ke baal",
+        "chut ke dhakkan",
+        "chut maarli",
+        "chutad",
+        "chutadd",
+        "chutan",
+        "chutia",
+        "chutiya",
+        "cipa",
+        "cunillingus",
+        "dink",
+        "duche",
+        "ejaculated",
+        "ejaculates",
+        "ejaculating",
+        "fagging",
+        "fagots",
+        "felching",
+        "fuckers",
+        "fuckings",
+        "fucks",
+        "gaand",
+        "gaandfat",
+        "gaandmasti",
+        "gaandufad",
+        "gandfattu",
+        "gandu",
+        "gashti",
+        "gasti",
+        "ghassa",
+        "ghasti",
+        "gucchi",
+        "gucchu",
+        "harami",
+        "haramzade",
+        "hawas",
+        "hawas ke pujari",
+        "hijda",
+        "hijra",
+        "jhant",
+        "jhant chaatu",
+        "jhant ka keeda",
+        "jhant ke baal",
+        "jhant ke pissu",
+        "jhantu",
+        "kamine",
+        "kaminey",
+        "kanjar",
+        "kutta",
+        "kutta kamina",
+        "kutte ki aulad",
+        "kutte ki jat",
+        "kuttiya",
+        "loda",
+        "lodu",
+        "lund",
+        "lund choos",
+        "lund ka bakkal",
+        "lund khajoor",
+        "lundtopi",
+        "lundure",
+        "lusting",
+        "maa ki chut",
+        "maal",
+        "madar chod",
+        "madarchod",
+        "madhavchod",
+        "masochist",
+        "mooh mein le",
+        "mutth",
+        "mutthal",
+        "najayaz",
+        "najayaz aulaad",
+        "najayaz paidaish",
+        "orgasim",
+        "paki",
+        "pataka",
+        "patakha",
+        "pisser",
+        "pisses",
+        "pissing",
+        "pube",
+        "pussies",
+        "raand",
+        "randaap",
+        "randi",
+        "randi rona",
+        "rimming",
+        "saala",
+        "saala kutta",
+        "saali kutti",
+        "saali randi",
+        "shagging",
+        "shite",
+        "shitted",
+        "shitting",
+        "shitty",
+        "skank",
+        "sluts",
+        "spac",
+        "suar",
+        "suar ke lund",
+        "suar ki aulad",
+        "tatte",
+        "tatti",
+        "teri maa ka bhosada",
+        "teri maa ka boba chusu",
+        "teri maa ki behenchod ",
+        "teri maa ki chut",
+        "tharak",
+        "tharki",
+        "titt",
+        "tu chuda",
+        "turd",
+        "wank",
+        "xxx",
+        "अंडकोश की थैली",
+        "अंडा",
+        "अरे नहीं",
+        "अश्लील",
+        "उल्लू",
+        "एक्स रेटेड",
+        "ओगाज़्म",
+        "कमबख्त",
+        "काम करना",
+        "कामोद्दीपक चित्र",
+        "कालीन का चूरा",
+        "किन्नर",
+        "कुतिया",
+        "कुत्ते-कमीने",
+        "कून",
+        "कॉक",
+        "गड़बड़",
+        "गधा कमीने",
+        "गधे",
+        "गर्भपात",
+        "गुदा",
+        "गेंद का थैला",
+        "गेंदों",
+        "गोली चलाने की आवाज़",
+        "घटिया इंसान",
+        "चाकलेट का रंग",
+        "चिंक",
+        "चुभन",
+        "चूची",
+        "चूतड़",
+        "चोंच",
+        "छीनना",
+        "जी में आये करो",
+        "झटका बंद",
+        "ठगना पैकर",
+        "डिल्डो",
+        "दुष्ट",
+        "दूर जाने का अभद्र संकेत देना",
+        "धत् तेरे की",
+        "नरक",
+        "नाजी",
+        "निकला हुआ किनारा",
+        "नितंब",
+        "पंगा लेना",
+        "पिछाड़ी",
+        "पीड़न कामुक",
+        "पेशाब",
+        "पॉर्न",
+        "फटना",
+        "फूहड़",
+        "बकवास",
+        "बट",
+        "बलात्कार",
+        "बहुत मदहोश",
+        "बांध",
+        "बिल्ली",
+        "बेल अंत",
+        "बेवकूफों",
+        "बोल पड़ना",
+        "भगवान-शापित",
+        "भगशेफ",
+        "मल",
+        "मलाशय",
+        "माँ कमीने",
+        "मुखमैथुन",
+        "मुर्गा",
+        "मुर्गा के",
+        "मुर्गा चूसने वाला",
+        "मूर्ख",
+        "मैल",
+        "योनि",
+        "योनी",
+        "यौन-संबंध",
+        "रक्तरंजित",
+        "लानत है",
+        "लिंग",
+        "लुटेरा",
+        "लेबिया",
+        "वहशी",
+        "वहशीता",
+        "वियाग्रा",
+        "वीर्य",
+        "वेश्या",
+        "वैंग",
+        "वो साले",
+        "शिफ़्ट को",
+        "शिश्नमल",
+        "संभोग सुख",
+        "सह",
+        "सह शॉट",
+        "साहस",
+        "सिगरेट",
+        "सींग का बना हुआ",
+        "स्तन",
+        "स्तनों",
+        "हवस",
+        "हस्तमैथुन",
+        "होमोसेक्सुअल",
+        "होर",
+    ],
+    "id": english_badwords
+    + [
+        "abortus",
+        "anal",
+        "dubur",
+        "pantat",
+        "bajingan",
+        "keledai",
+        "keparat",
+        "tas bola",
+        "bola",
+        "bellend",
+        "kejam",
+        "kebinatangan",
+        "menggerutu",
+        "pelacur",
+        "berdarah",
+        "blowjob",
+        "bollok",
+        "dada",
+        "payudara",
+        "buceta",
+        "gelandangan",
+        "pengunyah karpet",
+        "celah",
+        "cipa",
+        "kelentit",
+        "kokang",
+        "pengisap ayam",
+        "ayam",
+        "coon",
+        "sampah",
+        "air mani",
+        "cumshot",
+        "cunillingus",
+        "vagina",
+        "mengutuk",
+        "kontol",
+        "dildo",
+        "dink",
+        "anjing-keparat",
+        "duche",
+        "tanggul",
+        "berejakulasi",
+        "ejakulasi",
+        "homo",
+        "fagging",
+        "kayu bakar",
+        "penggemar",
+        "felching",
+        "fellatio",
+        "flens",
+        "brengsek",
+        "kacau",
+        "sialan",
+        "persetan",
+        "pengepakan fudge",
+        "terkutuk",
+        "ya tuhan",
+        "neraka",
+        "hore",
+        "terangsang",
+        "kock",
+        "labia",
+        "nafsu",
+        "bernafsu",
+        "masokis",
+        "masturbasi",
+        "keparat ibu",
+        "nazi",
+        "orang negro",
+        "negro",
+        "orgasim",
+        "orgasme",
+        "cotok",
+        "penis",
+        "kencing",
+        "kesal",
+        "pisser",
+        "bikin",
+        "buritan",
+        "porno",
+        "pornografi",
+        "tusukan",
+        "menusuk",
+        "pube",
+        "pussies",
+        "memperkosa",
+        "pemerkosa",
+        "memperlambat",
+        "rimming",
+        "sadis",
+        "meniduri",
+        "skrotum",
+        "seks",
+        "bercinta",
+        "waria",
+        "kotoran",
+        "shite",
+        "kengerian",
+        "dikirim",
+        "buang hajat",
+        "menyebalkan",
+        "smegma",
+        "jelaga",
+        "merebut",
+        "dasar bajingan",
+        "ruang",
+        "keberanian",
+        "buah pelir",
+        "titt",
+        "viagra",
+        "vulva",
+        "wang",
+        "terima kasih",
+        "x diberi peringkat",
+        "xxx",
+    ],
+    "kn": english_badwords
+    + [
+        "ಗರ್ಭಪಾತ",
+        "ಗುದ",
+        "ಗುದದ್ವಾರ",
+        "ಕತ್ತೆ",
+        "ಆಶ್-ಫಕರ್",
+        "ಅಸ್ಹೋಲ್",
+        "ಅಸೋಲೆಸ್",
+        "ಬಾಲ್ಬಾಗ್",
+        "ಚೆಂಡುಗಳು",
+        "ಬಾಸ್ಟರ್ಡ್",
+        "ಬೆಲೆಂಡ್",
+        "ಮೃದ್ವಂಗಿ",
+        "ಪ್ರಾಣಿಜನ್ಯತೆ",
+        "ಬಿಚ್",
+        "ಬಿಟ್ಚಿಸ್",
+        "ಬೆಚಿಂಗ್",
+        "ರಕ್ತಸಿಕ್ತ",
+        "ಬ್ಲೋಜಾಬ್",
+        "ಬೊಲ್ಲೊಕ್",
+        "ಕುರುಚಲು ಗಿಡ",
+        "ಬೂಬಿಗಳು",
+        "ಸ್ತನಗಳನ್ನು",
+        "ಬುಕೆಟಾ",
+        "ತಿಕ",
+        "ಬಟ್",
+        "ಕಾರ್ಪೆಟ್ ಮಂಚರ್",
+        "ಚಿಂಕ್",
+        "ಸಿಪಾ",
+        "ಚಂದ್ರನಾಡಿ",
+        "ಕೋಳಿ",
+        "ಕೋಳಿ ಸಕ್ಕರ್",
+        "ಕಾಕ್ಸ್",
+        "ಕೂನ್",
+        "ಅಮೇಧ್ಯ",
+        "ಕಮ್",
+        "ಕಮ್ಶಾಟ್",
+        "ಕುನಿಲ್ಲಸ್",
+        "ಕಂಟ್",
+        "ಡ್ಯಾಮ್",
+        "ಡಿಕ್",
+        "ದ್ವಿಧ್ರುವಿ",
+        "dildos",
+        "ಡಿಂಕ್",
+        "ನಾಯಿ-ಫಕರ್",
+        "ಡಚೆ",
+        "ಡೈಕ್",
+        "ಹೊರಹೊಮ್ಮಿಸು",
+        "ಸ್ಫೂರ್ತಿ",
+        "ಎಜಾಕ್ಯುಲೇಟ್ಸ್",
+        "ಇಜಲಲೇಟಿಂಗ್",
+        "ಉದ್ಗಾರ",
+        "ತಮಾಷೆ",
+        "ಮಂದಗತಿ",
+        "ಮಬ್ಬು",
+        "fagots",
+        "ಫ್ಯಾನಿ",
+        "ಹೊಡೆತ",
+        "ಪತನ",
+        "ಚಾಚುಪಟ್ಟಿ",
+        "ಫಕ್",
+        "ನಾಶವಾಗಿದ್ದನು",
+        "ಫಕರ್",
+        "fuckers",
+        "ಫಕಿಂಗ್",
+        "ಫಕಿಂಗ್ಸ್",
+        "ಇಷ್ಟಪಡುತ್ತಾನೆ",
+        "ಮಿಠಾಯಿ ಪ್ಯಾಕರ್",
+        "ದೇವರನ್ನು ಹಾನಿಗೊಳಗಾಯಿತು",
+        "ಗಾಡ್ಡಮ್",
+        "ನರಕ",
+        "ಹೋರ್",
+        "ಮೊನಚಾದ",
+        "ಜರ್ಕ್-ಆಫ್",
+        "ಕೋಕ್",
+        "ಯೋನಿಯ",
+        "ಕಾಮ",
+        "ಕಾಮುಕ",
+        "ಮಾಸೋಚಿಸ್ಟ್",
+        "ಹಸ್ತಮೈಥುನ ಮಾಡು",
+        "ತಾಯಿ ಫಕರ್",
+        "ನಾಜಿ",
+        "ನಿಗರ್",
+        "ನಿಗ್ಗರ್ಗಳು",
+        "ಒರಾಸಿಮ್",
+        "ಪರಾಕಾಷ್ಠೆ",
+        "ಪರಾಕಾಷ್ಠೆಗಳನ್ನು",
+        "ಪೆಕರ್",
+        "ಶಿಶ್ನ",
+        "ಮೂತ್ರ ವಿಸರ್ಜಿಸು",
+        "ನಿರುತ್ಸಾಹಗೊಂಡಿದೆ",
+        "ಪಿಸರ್",
+        "ಮೂತ್ರಪಿಂಡಗಳು",
+        "pissing",
+        "ಪಿಸ್ಸಾಫ್",
+        "ಪೂಪ್",
+        "ಅಶ್ಲೀಲತೆ",
+        "ಅಶ್ಲೀಲ",
+        "ಚುಚ್ಚು",
+        "ಪ್ರಿಕ್ಸ್",
+        "ಪಬ್",
+        "ಪುಸಿಗಳು",
+        "ಪುಸಿ",
+        "ಅತ್ಯಾಚಾರ",
+        "ಅತ್ಯಾಚಾರಿ",
+        "ಗುದನಾಳದ",
+        "ರಿಟಾರ್ಡ್",
+        "ಹಚ್ಚುವುದು",
+        "ದುಃಖಗಾರ",
+        "ತಿರುಗಿಸುವುದು",
+        "ಸ್ಕ್ರೋಟಮ್",
+        "ವೀರ್ಯ",
+        "ಲೈಂಗಿಕತೆ",
+        "ಶಾಗ್",
+        "ಶಾಗ್ಗಿಂಗ್",
+        "ಶೆಮೇಲ್",
+        "ಶಿಟ್",
+        "ಷೈಟ್",
+        "ಶಿಟ್ಸ್",
+        "shitted",
+        "ಅಲುಗಾಡುವಿಕೆ",
+        "ಅಸಹ್ಯ",
+        "ಸ್ಕಾಂಕ್",
+        "ಸೂಳೆ",
+        "ಸ್ಲಟ್ಗಳು",
+        "ಸ್ಮೆಗ್ಮಾ",
+        "ಕೊಳೆತ",
+        "ಸ್ನ್ಯಾಚ್",
+        "ಮಗ-ಆಫ್-ಬಿಚ್",
+        "spac",
+        "ಉಬ್ಬು",
+        "ವೃಷಣ",
+        "ಟಿಟ್",
+        "ಚೇಕಡಿ ಹಕ್ಕಿಗಳು",
+        "turd",
+        "ಯೋನಿ",
+        "ವಯಾಗ್ರ",
+        "ವಾಂಗ್",
+        "ಮುಷ್ಕರ",
+        "x ರೇಟೆಡ್",
+        "xxx",
+    ],
+    "ml": english_badwords
+    + [
+        "ഗർഭഛിദ്രം",
+        "വിശപ്പ്",
+        "മലദ്വാരം",
+        "കഴുത",
+        "അസി ഫക്കർ",
+        "കഴുതകളെ",
+        "ആസ്ഹോൾ",
+        "അശ്ളീലങ്ങൾ",
+        "ബോൾബാഗ്",
+        "പന്തുകൾ",
+        "തന്തയില്ലാത്തവൻ",
+        "ബെല്ലെൻഡ്",
+        "മൃഗീയമായ",
+        "മൃഗീയത",
+        "ബിച്ച്",
+        "ബിച്ചുകൾ",
+        "ബിപിഡിംഗ്",
+        "രക്തരൂക്ഷിതമായ",
+        "ആശ്വാസം",
+        "ബലോക്ക്",
+        "ബോബ്",
+        "പൂക്കൾ",
+        "സ്തനങ്ങൾ",
+        "ബ്യൂട്ടാ",
+        "ബം",
+        "മയക്കുമരുന്ന്",
+        "പരവതാനി മാൻച്ചർ",
+        "ചുംബ്",
+        "സിപാ",
+        "ക്ലോറിസിസ്",
+        "കോക്ക്",
+        "കോക്ക് സക്കർ",
+        "കോക്സ്",
+        "കോൺ",
+        "ക്രാപ്പ്",
+        "ശുക്ലം",
+        "പുരുഷാരം",
+        "സി",
+        "മുഷിഞ്ഞ",
+        "കഷ്ടം",
+        "ഡിക്ക്",
+        "ഡിൽഡോ",
+        "dildos",
+        "ഡൈൻ",
+        "നായ-ഫക്കർ",
+        "ഡച്ച്",
+        "ഡൈകെ",
+        "ശമിപ്പിക്കുക",
+        "മോഷ്ടിച്ചു",
+        "വികാരങ്ങൾ",
+        "വിരസത",
+        "മടി",
+        "ക്ഷീണിപ്പിക്കുക",
+        "fagot",
+        "വഞ്ചന",
+        "ഫാനി",
+        "വേദന",
+        "flange",
+        "ഊമ്പി",
+        "സംഭോഗം ചെയ്യുക",
+        "ഫക്കർ",
+        "നർമ്മം",
+        "ഫഡ്ജ് പാക്കർ",
+        "ദൈവം-കൊള്ളിത",
+        "ഗോഡ്ഡം",
+        "നരകം",
+        "വയ്ക്കുക",
+        "വൃത്തികെട്ട",
+        "ജെർക് ഓഫ്",
+        "കിക്ക്",
+        "ലാബിയ",
+        "മോഹം",
+        "മോഹഭംഗം",
+        "മാസോച്ചിസ്റ്റ്",
+        "സ്വയംഭോഗം ചെയ്യുക",
+        "അമ്മ ഫക്കർ",
+        "നാസി",
+        "നിഗർ",
+        "മയക്കുമരുന്നുകൾ",
+        "രതിമൂർച്ഛ",
+        "പെക്കർ",
+        "ലിംഗം",
+        "മൂത്രമൊഴിക്കുക",
+        "കുഴഞ്ഞുവീഴുന്നു",
+        "പിസ്സർ",
+        "പിസ്സകൾ",
+        "pissing",
+        "പിസ്സോഫ്",
+        "poop",
+        "അശ്ലീലം",
+        "അശ്ലീലത",
+        "പ്രാവി",
+        "വിസർജ്യങ്ങൾ",
+        "പ്യൂബ്",
+        "pussies",
+        "pussy",
+        "ബലാൽസംഗം",
+        "ബലാത്സംഗം",
+        "മലാശയം",
+        "തുടരുക",
+        "റിമ്മിംഗ്",
+        "സചിസ്റ്റ്",
+        "വഞ്ചി",
+        "പുല്ല്",
+        "ബീജം",
+        "ശവം",
+        "ഷാഗിംഗ്",
+        "അവൾ",
+        "ഷീറ്റ്",
+        "ഷെയ്റ്റ്",
+        "shits",
+        "തിന്നിട്ടില്ല",
+        "ഷോർട്ട്",
+        "ഷൈറ്റി",
+        "സ്കാൻ",
+        "മന്ദഹസരം",
+        "സ്നെഗമാ",
+        "പുഞ്ചിരി",
+        "പിടിക്കുക",
+        "വെറുക്കപ്പെട്ടയാൾ",
+        "സ്പെയ്ക്",
+        "തുളച്ച്",
+        "വൃഷണം",
+        "പേ",
+        "ടിത്ത്",
+        "കുഴപ്പമില്ല",
+        "യോനി",
+        "വരാഗ്ര",
+        "വാൽവ",
+        "വാങ്",
+        "വാൻ",
+        "വേശ്യ",
+        "x റേറ്റുചെയ്തു",
+        "xxx",
+    ],
+    "mr": english_badwords
+    + [
+        "गर्भपात",
+        "गुदा",
+        "गाढव",
+        "गांडुळ",
+        "asses",
+        "asshole",
+        "assholes",
+        "ballbag",
+        "चेंडू",
+        "बॅस्टर्ड",
+        "बेलेंड",
+        "बेस्टियल",
+        "प्राण्यांबरोबर",
+        "कुत्री",
+        "बिट्स",
+        "खूनी",
+        "blowjob",
+        "बोलोक",
+        "बोब",
+        "स्तन",
+        "बसीटा",
+        "बम",
+        "बट",
+        "कार्पेट मुन्चर",
+        "चिंक",
+        "सिपा",
+        "क्लिटोरिस",
+        "मुर्ख",
+        "मांसाहारी",
+        "कॉक्स",
+        "कॉनन",
+        "बकवास",
+        "सह",
+        "cumshot",
+        "कनिलिंगस",
+        "कांट",
+        "धिक्कार",
+        "डिक",
+        "dildo",
+        "डिल्डो",
+        "डंक",
+        "duche",
+        "डाईक",
+        "उद्गार",
+        "उत्साही",
+        "ejaculates",
+        "उत्सुकता",
+        "स्खलन",
+        "फॅग",
+        "फॅगिंग",
+        "फॅगॉट",
+        "फॅगॉट्स",
+        "फॅनी",
+        "फेलिंग",
+        "फॅलेटीओ",
+        "निकला",
+        "fucked",
+        "गुप्तचर",
+        "fuckers",
+        "fucking",
+        "fuckings",
+        "fucks",
+        "फडगे पॅकर",
+        "देव-शापित",
+        "देव",
+        "नरक",
+        "होरे",
+        "शिंग",
+        "झटका बंद",
+        "कॉक",
+        "लॅबिया",
+        "वासना",
+        "मासोचिस्ट",
+        "हस्तमैथुन करा",
+        "आई माकड",
+        "नाझी",
+        "निगर",
+        "निगार",
+        "ऑर्गॅसिम",
+        "संभोग",
+        "orgasms",
+        "चापटी",
+        "पुरुषाचे जननेंद्रिय",
+        "पेशी",
+        "pissed",
+        "पिसर",
+        "pisses",
+        "पिसिंग",
+        "पिसोफ",
+        "घाट",
+        "अश्लील",
+        "पोर्नोग्राफी",
+        "मुरुम",
+        "प्रिक्स",
+        "प्यूब",
+        "pussies",
+        "मांजर",
+        "बलात्कार",
+        "गुदाशय",
+        "मंद",
+        "rimming",
+        "दुःखी",
+        "screwing",
+        "स्क्रोटम",
+        "वीर्य",
+        "लिंग",
+        "शेग",
+        "shagging",
+        "शेमले",
+        "विचित्र",
+        "shite",
+        "shits",
+        "shitted",
+        "shitting",
+        "shitty",
+        "घाणेरडा",
+        "फट",
+        "sluts",
+        "सुगंध",
+        "स्मट",
+        "छेडछाड",
+        "मुलगा-एक-कुत्री",
+        "spac",
+        "तिरस्कार",
+        "परीक्षक",
+        "शीर्षक",
+        "टिट",
+        "टर्ड",
+        "योनी",
+        "वियाग्रा",
+        "वल्वा",
+        "वांग",
+        "विंक",
+        "वेश्या",
+        "एक्स रेट केले",
+        "xxx",
+    ],
+    "pt": english_badwords
+    + [
+        "aborto",
+        "amador",
+        "anal",
+        "aparafusar",
+        "aranha",
+        "ariano",
+        "arrebatar",
+        "ass-filho da puta",
+        "asses",
+        "balalao",
+        "bastardo",
+        "bate uma",
+        "bellend",
+        "bestial",
+        "bestialidade",
+        "bicha",
+        "bichano",
+        "bichanos",
+        "bichas",
+        "biscate",
+        "bissexual",
+        "boceta",
+        "bolas",
+        "bollok",
+        "boob",
+        "boquete",
+        "bosta",
+        "braulio de borracha",
+        "buceta",
+        "bumbum",
+        "bunda",
+        "burro",
+        "cabrao",
+        "cacete",
+        "cadela",
+        "cadelas",
+        "cagando",
+        "cagar",
+        "calçado",
+        "camisinha",
+        "caralho",
+        "cerveja",
+        "chochota",
+        "chupar",
+        "cipa",
+        "clitoris",
+        "clitóris",
+        "cobiçoso",
+        "cocaína",
+        "cocô",
+        "coito",
+        "colhoes",
+        "com tesão",
+        "comedor de tapetes",
+        "comer",
+        "cona",
+        "consolo",
+        "coon",
+        "coragem",
+        "corno",
+        "cu",
+        "cunillingus",
+        "dar o rabo",
+        "desgraçado",
+        "dildo",
+        "dildos",
+        "dink",
+        "dog-filho da puta",
+        "droga",
+        "duche",
+        "dum raio",
+        "ejacula",
+        "ejaculado",
+        "ejacular",
+        "ejaculação",
+        "empacotador de fudge",
+        "escroto",
+        "esporra",
+        "estuprador",
+        "estupro",
+        "fagging",
+        "fanny",
+        "fecal",
+        "felação",
+        "felching",
+        "fenda",
+        "filho da puta",
+        "filhos da puta",
+        "foda",
+        "foda-se",
+        "fode",
+        "foder",
+        "fodido",
+        "frango assado",
+        "galo",
+        "galos",
+        "gozada",
+        "gozar",
+        "grelho",
+        "heroína",
+        "homem gay",
+        "homoerótico",
+        "homosexual",
+        "hore",
+        "idiota",
+        "idiotas",
+        "inferno",
+        "kock",
+        "lolita",
+        "luxúria",
+        "lábios",
+        "lésbica",
+        "maldito",
+        "mama",
+        "masoquista",
+        "masturbar",
+        "merda",
+        "merdas",
+        "mesa",
+        "mijando",
+        "mijar",
+        "nazista",
+        "negro",
+        "niggers",
+        "não me chateies",
+        "orgasim",
+        "orgasmo",
+        "orgasmos",
+        "otário",
+        "paneleiro",
+        "passar um cheque",
+        "pau",
+        "peidar",
+        "peitos",
+        "peituda",
+        "pica",
+        "picadas",
+        "pinto",
+        "pisser",
+        "porcaria",
+        "porno",
+        "pornografia",
+        "pornô",
+        "porra",
+        "prostituta",
+        "pube",
+        "punheta",
+        "puta",
+        "puta que pariu",
+        "puta que te pariu",
+        "putaria",
+        "puto",
+        "pênis",
+        "queca",
+        "retardar",
+        "reto",
+        "rimming",
+        "sacanagem",
+        "saco",
+        "saco de bola",
+        "sangrento",
+        "sapatona",
+        "sexo",
+        "shite",
+        "skank",
+        "smegma",
+        "spac",
+        "sujeira",
+        "sádico",
+        "sêmen",
+        "testículo",
+        "tetas",
+        "titt",
+        "torneira",
+        "transando",
+        "transar",
+        "transsexual",
+        "trepada",
+        "vadia",
+        "vadias",
+        "vagabunda",
+        "vagabundo",
+        "vagina",
+        "vai tomar no cu",
+        "vai-te foder",
+        "veado",
+        "viagra",
+        "vibrador",
+        "vulva",
+        "wang",
+        "x avaliado",
+        "xana",
+        "xixi",
+        "xochota",
+        "xxx",
+        "ânus",
+    ],
+    "te": english_badwords
+    + [
+        "గర్భస్రావం",
+        "అంగ",
+        "పాయువు",
+        "గాడిద",
+        "గాడిద-fucker",
+        "asses",
+        "assholes",
+        "బాల్బ్యాగ్",
+        "బంతుల్లో",
+        "బాస్టర్డ్",
+        "బెల్లెండ్",
+        "మృగ",
+        "బెస్టియాలిటీ",
+        "బిచ్",
+        "bitches",
+        "బిట్చింగ్",
+        "బ్లడీ",
+        "blowjob",
+        "బోల్లక",
+        "బూబ్",
+        "వక్షోజాలను",
+        "ఛాతీ",
+        "buceta",
+        "బం",
+        "బట్",
+        "కార్పెట్ ముంచర్",
+        "చింక్",
+        "cipa",
+        "స్త్రీగుహ్యాంకురము",
+        "ఆత్మవిశ్వాసం",
+        "కాక్-సక్కర్",
+        "కాక్స్",
+        "కూన్",
+        "చెత్త",
+        "కం",
+        "cumshot",
+        "క్యునిల్లింగస్",
+        "కంట్",
+        "తిట్టు",
+        "డిక్",
+        "లైంగిక సంతృప్తి కోసం స్త్రీలు ఉపయోగించే పురుషాంగము వంటి పరికరము",
+        "డిల్డోస్",
+        "dink",
+        "కుక్క-fucker",
+        "డూష్",
+        "డైక్",
+        "స్ఖలించు",
+        "ఎజాక్యులేటెడ్",
+        "ఎజాక్యులేట్స్",
+        "ఎరాక్యులేటింగ్",
+        "స్ఖలనం",
+        "నవుకరు",
+        "ఫాగ్గింగ్",
+        "ఫాగాట్",
+        "ఫగాట్స్",
+        "fanny",
+        "ఫెల్చింగ్",
+        "కుడుచుట",
+        "అచ్చు",
+        "ఫక్",
+        "ఇబ్బంది పెట్టాడు",
+        "fucker",
+        "ఫకర్స్",
+        "ఫకింగ్",
+        "ఫకింగ్స్",
+        "ఫక్స్",
+        "ఫడ్జ్ ప్యాకర్",
+        "దేవతలా మంచిది",
+        "గాడ్డామ్",
+        "నరకం",
+        "హోర్",
+        "horny",
+        "జెర్క్-ఆఫ్",
+        "కాక్",
+        "పెదవి",
+        "కామం",
+        "మనసు పడ్డట్లు చిత్రించారు",
+        "masochist",
+        "హస్తప్రయోగం",
+        "తల్లి ఫెకర్",
+        "నాజీ",
+        "నిగ్గర్",
+        "నిగ్గర్స్",
+        "ఆర్గాసిమ్",
+        "స్కలనం",
+        "orgasms",
+        "pecker",
+        "పురుషాంగం",
+        "విసర్జన",
+        "pissed",
+        "పిస్సర్",
+        "పిస్సీస్",
+        "పిస్సింగ్",
+        "పిస్సాఫ్",
+        "poop",
+        "శృంగార",
+        "పోర్నో",
+        "అశ్లీల",
+        "బుడతడు",
+        "ప్రిక్స్",
+        "ప్యూబ్",
+        "pussies",
+        "పుస్సీ",
+        "రేప్",
+        "ఉన్నప్పటికీ బలాత్కారం",
+        "పురీషనాళం",
+        "రిటార్డ్",
+        "రిమ్మింగ్",
+        "పీడన కాముకత",
+        "screwing",
+        "స్క్రోటమ్",
+        "వీర్యం",
+        "సెక్స్",
+        "బొచ్చు",
+        "షగ్గింగ్",
+        "షీమేల్",
+        "ఒంటి",
+        "షైట్",
+        "షిట్స్",
+        "షిట్టెడ్",
+        "షిట్టింగ్",
+        "shitty",
+        "స్కాన్క్",
+        "నీతి",
+        "స్లట్స్",
+        "శిశ్న",
+        "స్మట్",
+        "స్నాచ్",
+        "ఒక బిచ్ కుమారుడు ఆఫ్",
+        "spac",
+        "స్పంక్",
+        "వృషణాలు",
+        "తునక",
+        "టిట్స్",
+        "టిట్",
+        "turd",
+        "యోని",
+        "వయాగ్రా",
+        "జననాంగం",
+        "వాంగ్",
+        "వ్యాంక్",
+        "వేశ్య",
+        "x రేట్",
+        "xxx",
+    ],
+    "vi": english_badwords
+    + [
+        "sự phá thai",
+        "hậu môn",
+        "mông",
+        "đồ ngu",
+        "lừa",
+        "lỗ đít",
+        "túi bóng",
+        "những quả bóng",
+        "đồ khốn",
+        "tuyệt vời",
+        "mục sư",
+        "lòng tốt",
+        "chó cái",
+        "dính máu",
+        "công việc thổi",
+        "bollok",
+        "boob",
+        "ngực",
+        "buceta",
+        "ăn mày",
+        "thảm muncher",
+        "sứt mẻ",
+        "cipa",
+        "âm vật",
+        "gà",
+        "gà hút",
+        "gà trống",
+        "coon",
+        "tào lao",
+        "kiêm",
+        "cum",
+        "cunillingus",
+        "lồn",
+        "chỉ trích",
+        "tinh ranh",
+        "dương vật giả",
+        "dink",
+        "chó-chó",
+        "duche",
+        "đê",
+        "xuất tinh",
+        "fag",
+        "đóng băng",
+        "fagot",
+        "đồ ăn vặt",
+        "người hâm mộ",
+        "nỉ",
+        "thất bại",
+        "mặt bích",
+        "chết tiệt",
+        "quái",
+        "đụ",
+        "ôm",
+        "đóng gói fudge",
+        "địa ngục",
+        "có",
+        "sừng",
+        "giật",
+        "kock",
+        "môi âm",
+        "ham muốn",
+        "khổ dâm",
+        "thủ dâm",
+        "mẹ kiếp",
+        "nazi",
+        "người da đen",
+        "người mách nước",
+        "cực khoái",
+        "người mổ",
+        "dương vật",
+        "đi tiểu",
+        "bực mình",
+        "đái",
+        "phân",
+        "khiêu dâm",
+        "nội dung khiêu dâm",
+        "châm",
+        "chích",
+        "pube",
+        "pussies",
+        "âm hộ",
+        "hiếp dâm",
+        "trực tràng",
+        "chậm phát triển",
+        "xé",
+        "người tàn bạo",
+        "vặn vít",
+        "bìu",
+        "tinh dịch",
+        "tình dục",
+        "lông",
+        "xáo trộn",
+        "đồng tính",
+        "cứt",
+        "shite",
+        "ván trượt",
+        "đĩ",
+        "quần lót",
+        "smegma",
+        "xì trum",
+        "con trai",
+        "spac",
+        "spunk",
+        "tinh hoàn",
+        "ăn miếng trả miếng",
+        "titt",
+        "cỏ",
+        "âm đạo",
+        "viagra",
+        "âm môn",
+        "wang",
+        "đã ngủ",
+        "con điếm",
+        "x đánh giá",
+        "xxx",
+    ],
+    "zh": english_badwords
+    + [
+        "13.",
+        "13点",
+        "㞗",
+        "三级片",
+        "下三烂",
+        "下贱",
+        "个老子的",
+        "九游",
+        "乳",
+        "乳交",
+        "乳头",
+        "乳房",
+        "乳波臀浪",
+        "交配",
+        "仆街",
+        "仆街",
+        "他奶奶",
+        "他奶奶的",
+        "他奶娘的",
+        "他妈",
+        "他妈ㄉ王八蛋",
+        "他妈地",
+        "他妈的",
+        "他娘",
+        "他马的",
+        "你个傻比",
+        "你他马的",
+        "你全家",
+        "你奶奶的",
+        "你她马的",
+        "你妈",
+        "你妈的",
+        "你娘",
+        "你娘卡好",
+        "你娘咧",
+        "你它妈的",
+        "你它马的",
+        "你是鸡",
+        "你是鸭",
+        "你老味",
+        "你老母",
+        "你老闆",
+        "你马的",
+        "做爱",
+        "傻比",
+        "傻逼",
+        "册那",
+        "冚家拎",
+        "冚家鏟",
+        "军妓",
+        "几八",
+        "几叭",
+        "几巴",
+        "几芭",
+        "刚度",
+        "刚瘪三",
+        "包皮",
+        "十三点",
+        "卖B",
+        "卖比",
+        "卖淫",
+        "卵",
+        "卵子",
+        "双峰微颤",
+        "口交",
+        "口肯",
+        "叫床",
+        "吃屎",
+        "后庭",
+        "吹箫",
+        "咸家伶",
+        "咸家鏟",
+        "塞你公",
+        "塞你娘",
+        "塞你母",
+        "塞你爸",
+        "塞你老师",
+        "塞你老母",
+        "处女",
+        "外阴",
+        "大卵子",
+        "大卵泡",
+        "大鸡巴",
+        "奶",
+        "奶奶的熊",
+        "奶子",
+        "奸",
+        "奸你",
+        "她妈地",
+        "她妈的",
+        "她马的",
+        "妈B",
+        "妈个B",
+        "妈个比",
+        "妈个老比",
+        "妈妈的",
+        "妈比",
+        "妈的",
+        "妈的B",
+        "妈逼",
+        "妓",
+        "妓女",
+        "妓院",
+        "妳她妈的",
+        "妳妈的",
+        "妳娘的",
+        "妳老母的",
+        "妳马的",
+        "姘头",
+        "姣西",
+        "姦",
+        "娘个比",
+        "娘的",
+        "婊子",
+        "婊子养的",
+        "嫖娼",
+        "嫖客",
+        "它妈地",
+        "它妈的",
+        "密洞",
+        "射你",
+        "射精",
+        "小乳头",
+        "小卵子",
+        "小卵泡",
+        "小瘪三",
+        "小肉粒",
+        "小骚比",
+        "小骚货",
+        "小鸡巴",
+        "小鸡鸡",
+        "尻",
+        "屁眼",
+        "屁股",
+        "屄",
+        "屌",
+        "屎忽",
+        "巨乳",
+        "干x娘",
+        "干七八",
+        "干你",
+        "干你妈",
+        "干你娘",
+        "干你老母",
+        "干你良",
+        "干妳妈",
+        "干妳娘",
+        "干妳老母",
+        "干妳马",
+        "干您娘",
+        "干机掰",
+        "干死CS",
+        "干死GM",
+        "干死你",
+        "干死客服",
+        "幹",
+        "强奸",
+        "强奸你",
+        "性",
+        "性交",
+        "性器",
+        "性无能",
+        "性爱",
+        "情色",
+        "想上你",
+        "懆您妈",
+        "懆您娘",
+        "懒8",
+        "懒八",
+        "懒叫",
+        "懒教",
+        "成人",
+        "我操你祖宗十八代",
+        "扒光",
+        "打炮",
+        "打飞机",
+        "抽插",
+        "招妓",
+        "插你",
+        "插死你",
+        "撒尿",
+        "撚",
+        "操你",
+        "操你全家",
+        "操你奶奶",
+        "操你妈",
+        "操你娘",
+        "操你祖宗",
+        "操你老妈",
+        "操你老母",
+        "操妳",
+        "操妳全家",
+        "操妳妈",
+        "操妳娘",
+        "操妳祖宗",
+        "操机掰",
+        "操比",
+        "操逼",
+        "放荡",
+        "日他娘",
+        "日你",
+        "日你妈",
+        "日你老娘",
+        "日你老母",
+        "日批",
+        "月经",
+        "机八",
+        "机巴",
+        "机机歪歪",
+        "杂种",
+        "柒",
+        "浪叫",
+        "淫",
+        "淫乱",
+        "淫妇",
+        "淫棍",
+        "淫水",
+        "淫秽",
+        "淫荡",
+        "淫西",
+        "湿透的内裤",
+        "激情",
+        "灨你娘",
+        "烂货",
+        "烂逼",
+        "爛",
+        "狗屁",
+        "狗日",
+        "狗狼养的",
+        "玉杵",
+        "王八蛋",
+        "瓜娃子",
+        "瓜婆娘",
+        "瓜批",
+        "瘪三",
+        "白烂",
+        "白痴",
+        "白癡",
+        "硬膠",
+        "祖宗",
+        "私服",
+        "笨實",
+        "笨蛋",
+        "粉腸",
+        "精子",
+        "老二",
+        "老味",
+        "老母",
+        "老瘪三",
+        "老骚比",
+        "老骚货",
+        "肉壁",
+        "肉棍子",
+        "肉棒",
+        "肉缝",
+        "肏",
+        "肛交",
+        "肥西",
+        "色情",
+        "花柳",
+        "荡妇",
+        "賤",
+        "贝肉",
+        "贱B",
+        "贱人",
+        "贱货",
+        "贼你妈",
+        "赛你老母",
+        "赛妳阿母",
+        "赣您娘",
+        "躝癱",
+        "轮奸",
+        "迷药",
+        "逼",
+        "逼样",
+        "野鸡",
+        "閪",
+        "阳具",
+        "阳萎",
+        "阴唇",
+        "阴户",
+        "阴核",
+        "阴毛",
+        "阴茎",
+        "阴道",
+        "阴部",
+        "陰莖",
+        "雞巴",
+        "靠北",
+        "靠母",
+        "靠爸",
+        "靠背",
+        "靠腰",
+        "驶你公",
+        "驶你娘",
+        "驶你母",
+        "驶你爸",
+        "驶你老师",
+        "驶你老母",
+        "骚比",
+        "骚货",
+        "骚逼",
+        "鬼公",
+        "鳩",
+        "鸡8",
+        "鸡八",
+        "鸡叭",
+        "鸡吧",
+        "鸡奸",
+        "鸡巴",
+        "鸡芭",
+        "鸡鸡",
+        "龟儿子",
+        "龟头",
+    ],
+}
diff --git a/en.arpa.bin b/en.arpa.bin
new file mode 100644
index 0000000000000000000000000000000000000000..41880f89fbf5b3d8e64fb2ab5d3e70753ca3c1ed
--- /dev/null
+++ b/en.arpa.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e90c9b25af01dcaa2667ed45d012d891269760fc6eccfe8dbbd161eb20e01d7d
+size 4403509656
diff --git a/en.sp.model b/en.sp.model
new file mode 100644
index 0000000000000000000000000000000000000000..937daf7e94e4808d7babd5739bb0d048474a9c5e
--- /dev/null
+++ b/en.sp.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:262c0b0bd4ebc592e439453bc7e006d0ed12d1914e206a1fb8c7fba091f52c4d
+size 1389058
diff --git a/filtering.py b/filtering.py
new file mode 100644
index 0000000000000000000000000000000000000000..c75664e1ca452d10d2c42b7a0db5a4f2455b83b9
--- /dev/null
+++ b/filtering.py
@@ -0,0 +1,879 @@
+import re
+
+import numpy as np
+
+import fasttext
+
+import sentencepiece
+import kenlm
+
+import pathlib
+
+from languages_id import langs_id
+from parameters_filtering import parameters_filtering
+from normalization import normalization
+from stopwords import stopwords
+from badwords import badwords
+
+
+class LoadParameters:
+    """Load the per-language resources used by the filtering functions."""
+
+    @staticmethod
+    def _resource_id(lang_dataset_id, column):
+        """Return the `column` value of the first `langs_id` row whose
+        "dataset_id" equals `lang_dataset_id` (`.iloc[0]` presumably raises
+        when no row matches; verify against `langs_id`)."""
+        matching_rows = langs_id["dataset_id"] == lang_dataset_id
+        return langs_id.loc[matching_rows, column].iloc[0]
+
+    @staticmethod
+    def load_parameters(lang_dataset_id):
+        """Return the filtering parameters of the language, or the
+        "default" entry when the language has no dedicated parameters."""
+        key = lang_dataset_id if lang_dataset_id in parameters_filtering else "default"
+        return parameters_filtering[key]
+
+    @staticmethod
+    def load_stopwords(lang_dataset_id):
+        """Return the set of stopwords of the language, or None when
+        `langs_id` holds no (truthy) stopwords id for it."""
+        stopwords_id = LoadParameters._resource_id(lang_dataset_id, "stopwords_id")
+        if not stopwords_id:
+            return None
+        return set(stopwords[stopwords_id])
+
+    @staticmethod
+    def load_badwords(lang_dataset_id):
+        """Return the set of badwords of the language, or None when
+        `langs_id` holds no (truthy) badwords id for it."""
+        badwords_id = LoadParameters._resource_id(lang_dataset_id, "badwords_id")
+        if not badwords_id:
+            return None
+        return set(badwords[badwords_id])
+
+    @staticmethod
+    def load_model_lang_id(lang_dataset_id, path_fasttext_model):
+        """Load the fastText model stored at `path_fasttext_model`, or
+        return None when `langs_id` holds no (truthy) fasttext id."""
+        fasttext_id = LoadParameters._resource_id(lang_dataset_id, "fasttext_id")
+        if not fasttext_id:
+            return None
+        return fasttext.load_model(path_fasttext_model)
+
+    @staticmethod
+    def load_sentencepiece_model(lang_dataset_id, path_sentencepiece_model):
+        """Load the SentencePiece model stored at `path_sentencepiece_model`,
+        or return None when `langs_id` holds no (truthy) sentencepiece id."""
+        sp_id = LoadParameters._resource_id(lang_dataset_id, "sentencepiece_id")
+        if not sp_id:
+            return None
+        processor = sentencepiece.SentencePieceProcessor()
+        processor.load(path_sentencepiece_model)
+        return processor
+
+    @staticmethod
+    def load_kenlm_model(lang_dataset_id, path_kenlm_model):
+        """Load the KenLM model stored at `path_kenlm_model`, or
+        return None when `langs_id` holds no (truthy) kenlm id."""
+        kenlm_id = LoadParameters._resource_id(lang_dataset_id, "kenlm_id")
+        if not kenlm_id:
+            return None
+        return kenlm.Model(path_kenlm_model)
+
+
+class ModifyingDocuments:
+    """Static helpers that normalize, tokenize and clean text documents."""
+
+    @staticmethod
+    def remove_empty_el_from_list(list_):
+        """Return a new list holding only the truthy elements of `list_`."""
+        return [el for el in list_ if el]
+
+    @staticmethod
+    def remove_non_printing_characters(document, non_printing_characters_re):
+        """Delete every match of `non_printing_characters_re` from `document`."""
+        return non_printing_characters_re.sub("", document)
+
+    @staticmethod
+    def uniform_whitespace(
+        document,
+        whitespace=[
+            " ",
+            " ",
+            " ",
+            " ",
+            " ",
+            "　",
+            " ",
+            " ",
+            " ",
+            " ",
+            "￼",
+            "",
+        ],
+    ):
+        """Replace every character of `document` listed in `whitespace`
+        by a regular space. The default list is only read, never mutated."""
+        whitespace = set(whitespace)
+        return "".join(" " if char in whitespace else char for char in document)
+
+    @staticmethod
+    def replace_digits_with_zeros(document, digits_re):
+        """Replace every match of `digits_re` in `document` with "0"."""
+        return digits_re.sub("0", document)
+
+    @staticmethod
+    def replace_unicode_punctuation(document, unicode_punctuation):
+        """Replace each character that is a key of `unicode_punctuation`."""
+        return "".join(unicode_punctuation.get(c, c) for c in document)
+
+    @staticmethod
+    def normalization(
+        document,
+        remove_non_printing_characters,
+        strip,
+        lower_case,
+        uniform_whitespace,
+        replace_digits_with_zeros,
+        replace_unicode_punctuation,
+        non_printing_characters_re=normalization["non_printing_characters_re"],
+        digits_re=normalization["digits_re"],
+        unicode_punctuation=normalization["unicode_punctuation"],
+    ):
+        """Apply each enabled step, in the order of the parameters, and
+        return the result (early when the document is empty after the strip
+        step). The `*_re` / mapping defaults come from the `normalization` dict."""
+        if remove_non_printing_characters:
+            document = ModifyingDocuments.remove_non_printing_characters(
+                document, non_printing_characters_re
+            )
+        if strip:
+            document = document.strip()
+        if not document:
+            return document
+        if lower_case:
+            document = document.lower()
+        if uniform_whitespace:
+            document = ModifyingDocuments.uniform_whitespace(document)
+        if replace_digits_with_zeros:
+            document = ModifyingDocuments.replace_digits_with_zeros(document, digits_re)
+        if replace_unicode_punctuation:
+            document = ModifyingDocuments.replace_unicode_punctuation(
+                document, unicode_punctuation
+            )
+        return document
+
+    @staticmethod
+    def tokenization(document, sentencepiece_model, join_on_whitespace):
+        """Tokenize `document` with `sentencepiece_model.encode_as_pieces`.
+        The pieces are returned as they are, or joined with spaces into a
+        single string when `join_on_whitespace` is true."""
+        pieces = sentencepiece_model.encode_as_pieces(document)
+        return " ".join(pieces) if join_on_whitespace else pieces
+
+    @staticmethod
+    def split_on_whitespace(document, new_line=False, tab=False):
+        """Split `document` on spaces, plus newlines when `new_line` is set
+        and tabs when `tab` is set. Empty pieces are dropped, so this
+        method also removes concatenated separators."""
+        separators = [" "]
+        if new_line:
+            separators.append("\n")
+        if tab:
+            separators.append("\t")
+        pieces = re.split("|".join(separators), document)
+        return ModifyingDocuments.remove_empty_el_from_list(pieces)
+
+    @staticmethod
+    def strip(document, strip_characters):
+        """Remove the leading and trailing characters of `document` that
+        belong to `strip_characters`.
+
+        Way faster than document.strip(strip_characters) when
+        strip_characters is a set holding a lot of elements
+        (all the emojis), instead of a str."""
+        if not document:
+            return document
+        beg_ind = 0
+        end_ind = len(document)
+        # Advance past the leading strip characters.
+        while beg_ind < end_ind and document[beg_ind] in strip_characters:
+            beg_ind += 1
+        # Move back over the trailing ones, never crossing beg_ind.
+        while end_ind > beg_ind and document[end_ind - 1] in strip_characters:
+            end_ind -= 1
+        return document[beg_ind:end_ind]
+
+    @staticmethod
+    def get_words_from_document(
+        document, sentencepiece_model_tok, lower_case, strip_characters
+    ):
+        """Get words from a document. Non reversible since the document
+        is split on multiple characters, words are stripped of
+        special characters and characters are converted to lower case.
+        Useful to compute ratios, like the stopwords ratio."""
+        if sentencepiece_model_tok:
+            # NOTE: this branch always lower-cases, whatever `lower_case` is.
+            document_normalized = ModifyingDocuments.normalization(
+                document=document,
+                remove_non_printing_characters=True,
+                strip=True,
+                lower_case=True,
+                uniform_whitespace=True,
+                replace_digits_with_zeros=True,
+                replace_unicode_punctuation=True,
+            )
+            words = ModifyingDocuments.tokenization(
+                document_normalized, sentencepiece_model_tok, join_on_whitespace=False
+            )
+        else:
+            words = ModifyingDocuments.split_on_whitespace(
+                document, new_line=True, tab=True
+            )
+        if lower_case:
+            words = [word.lower() for word in words]
+        if strip_characters:
+            words = [ModifyingDocuments.strip(word, strip_characters) for word in words]
+            words = ModifyingDocuments.remove_empty_el_from_list(words)
+        return words
+
+    @staticmethod
+    def words_augmentation(words, group_size, join_char):
+        """Return every run of `group_size` consecutive words joined with
+        `join_char`. Augments words, especially for Chinese (without a space
+        between words) and Vietnamese (with a space between syllables)."""
+        last_start = len(words) - group_size
+        return [
+            join_char.join(words[i : i + group_size]) for i in range(last_start + 1)
+        ]
+
+    @staticmethod
+    def split_on_newline_tab_whitespace(document):
+        """Split on newlines, then on tabs, then on spaces.
+
+        Returns one list per line, holding one list of words per
+        tab-separated chunk of that line."""
+        split = ModifyingDocuments.split_on_whitespace
+        return [
+            [split(subsentence) for subsentence in sentence.split("\t")]
+            for sentence in document.split("\n")
+        ]
+
+    @staticmethod
+    def merge_on_whitespace_tab_newline(sentences):
+        """Invert the method split_on_newline_tab_whitespace.
+        Empty word lists and empty lines are skipped, which removes
+        concatenated separators. Returns "" when nothing is left."""
+        lines = [
+            [" ".join(subsentence) for subsentence in sentence if subsentence]
+            for sentence in sentences
+        ]
+        return "\n".join("\t".join(line) for line in lines if line)
+
+    @staticmethod
+    def should_keep_word_with_incorrect_substrings(
+        word, strip_characters, incorrect_word_substrings
+    ):
+        """Return True when the word, once stripped of `strip_characters`,
+        contains none of the `incorrect_word_substrings`."""
+        word = ModifyingDocuments.strip(word, strip_characters)
+        # Generator instead of a list: stop at the first offending substring.
+        return not any(i_substr in word for i_substr in incorrect_word_substrings)
+
+    @staticmethod
+    def _filter_words(document, should_keep):
+        """Return `document` without the words for which `should_keep(word)`
+        is false. Words are delimited by newlines, tabs and spaces."""
+        sentences = ModifyingDocuments.split_on_newline_tab_whitespace(document)
+        sentences = [
+            [
+                [word for word in subsentence if should_keep(word)]
+                for subsentence in sentence
+            ]
+            for sentence in sentences
+        ]
+        return ModifyingDocuments.merge_on_whitespace_tab_newline(sentences)
+
+    @staticmethod
+    def remove_words_with_incorrect_substrings(
+        document,
+        strip_characters,
+        incorrect_word_substrings,
+    ):
+        """Remove from `document` the words that, once stripped of
+        `strip_characters`, contain one of `incorrect_word_substrings`."""
+        return ModifyingDocuments._filter_words(
+            document,
+            lambda word: ModifyingDocuments.should_keep_word_with_incorrect_substrings(
+                word, strip_characters, incorrect_word_substrings
+            ),
+        )
+
+    @staticmethod
+    def should_keep_long_word(word, strip_characters, length_word_max_cutoff):
+        """Return True when `word` has at most `length_word_max_cutoff`
+        characters, either as is or once stripped of `strip_characters`.
+
+        A too long word made only of strip characters is rejected. Words
+        glued together by a punctuation mark get no special pass here."""
+        if len(word) <= length_word_max_cutoff:
+            return True
+        word = ModifyingDocuments.strip(word, strip_characters)
+        # An empty result means the word consisted only of strip characters.
+        return bool(word) and len(word) <= length_word_max_cutoff
+
+    @staticmethod
+    def remove_long_words(
+        document,
+        strip_characters,
+        length_word_max_cutoff,
+    ):
+        """Remove from `document` the words rejected by
+        `should_keep_long_word` for the given cutoff."""
+        return ModifyingDocuments._filter_words(
+            document,
+            lambda word: ModifyingDocuments.should_keep_long_word(
+                word, strip_characters, length_word_max_cutoff
+            ),
+        )
+
+    @staticmethod
+    def modifying_documents(
+        document,
+        cond_uniform_whitespace,
+        cond_replace_unicode_punctuation,
+        cond_remove_words_with_incorrect_substrings,
+        strip_characters,
+        incorrect_word_substrings,
+        cond_remove_long_words,
+        length_word_max_cutoff,
+    ):
+        """Strip `document`, apply the whitespace / unicode punctuation
+        normalizations enabled by the `cond_*` flags, then optionally remove
+        the words with incorrect substrings and the words that are too long."""
+        document = ModifyingDocuments.normalization(
+            document=document,
+            remove_non_printing_characters=False,
+            strip=True,
+            lower_case=False,
+            uniform_whitespace=cond_uniform_whitespace,
+            replace_digits_with_zeros=False,
+            replace_unicode_punctuation=cond_replace_unicode_punctuation,
+        )
+        if cond_remove_words_with_incorrect_substrings:
+            document = ModifyingDocuments.remove_words_with_incorrect_substrings(
+                document,
+                strip_characters,
+                incorrect_word_substrings,
+            )
+        if cond_remove_long_words:
+            document = ModifyingDocuments.remove_long_words(
+                document,
+                strip_characters,
+                length_word_max_cutoff,
+            )
+        return document
+
+
+class FunctionDatasetModifyingDocuments:
+    """Callable that rewrites the "text" field of a dataset example."""
+
+    def __init__(self, lang_dataset_id):
+        self.lang_dataset_id = lang_dataset_id
+        self.param = LoadParameters.load_parameters(lang_dataset_id)
+
+    def __call__(self, example):
+        forwarded = (
+            "cond_uniform_whitespace",
+            "cond_replace_unicode_punctuation",
+            "cond_remove_words_with_incorrect_substrings",
+            "strip_characters",
+            "incorrect_word_substrings",
+            "cond_remove_long_words",
+            "length_word_max_cutoff",
+        )
+        options = {key: self.param[key] for key in forwarded}
+        text = ModifyingDocuments.modifying_documents(example["text"], **options)
+        example["text"] = text
+        return example
+
+    def __reduce__(self):
+        return (self.__class__, (self.lang_dataset_id,))
+
+
+class Filtering:
+    @staticmethod
+    def check_number_words(
+        document,
+        sentencepiece_model_tok,
+        strip_characters,
+        number_words_min_cutoff,
+        number_words_max_cutoff,
+    ):
+        """Check that the word count of `document` lies within both cutoffs
+        (bounds included)."""
+        words = ModifyingDocuments.get_words_from_document(
+            document,
+            sentencepiece_model_tok,
+            lower_case=False,
+            strip_characters=strip_characters,
+        )
+        num_words = len(words)
+        return number_words_min_cutoff <= num_words <= number_words_max_cutoff
+
+    @staticmethod
+    def compute_repetitions_ratio(document, repetitions_length):
+        """Measure how much of `document` is made of repeated n-grams.
+
+        Returns the share of all n-gram occurrences (n being
+        `repetitions_length`) held by the most frequent n-grams, or 0 when
+        the document is too short to contain any n-gram."""
+        occurrences = {}
+        for start in range(len(document) - repetitions_length + 1):
+            ngram = document[start : start + repetitions_length]
+            occurrences[ngram] = occurrences.get(ngram, 0) + 1
+        if not occurrences:
+            return 0
+        counts = sorted(occurrences.values(), reverse=True)
+        # The top sqrt(#distinct n-grams) n-grams are taken as the repetitions.
+        num_rep_ngrams = int(np.sqrt(len(counts)))
+        return sum(counts[:num_rep_ngrams]) / sum(counts)
+
+    @staticmethod
+    def check_repetitions_removal(
+        document,
+        repetitions_length,
+        repetitions_max_cutoff,
+    ):
+        """Accept `document` when its repetitions ratio does not exceed
+        `repetitions_max_cutoff`."""
+        return repetitions_max_cutoff >= Filtering.compute_repetitions_ratio(
+            document, repetitions_length
+        )
+
+    @staticmethod
+    def compute_special_characters_ratio(document, special_characters):
+        """Return the ratio of special characters in `document` (0 if empty)."""
+        if not document:  # Empty document: avoid a ZeroDivisionError
+            return 0
+        return sum(char in special_characters for char in document) / len(document)
+
+    @staticmethod
+    def check_special_characters(
+        document,
+        special_characters,
+        special_characters_max_cutoff,
+    ):
+        """Accept `document` when its ratio of special characters does not
+        exceed `special_characters_max_cutoff`."""
+        return special_characters_max_cutoff >= (
+            Filtering.compute_special_characters_ratio(document, special_characters)
+        )
+
+    @staticmethod
+    def compute_stopwords_ratio(
+        document,
+        sentencepiece_model_tok,
+        strip_characters,
+        cond_words_augmentation,
+        words_augmentation_group_sizes,
+        words_augmentation_join_char,
+        stopwords,
+    ):
+        """Return the number of stopwords found in `document` divided by its
+        number of words, capped at 1.0 (0 for a document without words).
+
+        With `cond_words_augmentation`, the augmented words built for each
+        group size are searched for stopwords as well."""
+        words = ModifyingDocuments.get_words_from_document(
+            document,
+            sentencepiece_model_tok,
+            lower_case=True,
+            strip_characters=strip_characters,
+        )
+        if not words:
+            return 0
+        candidates = list(words)
+        if cond_words_augmentation:
+            for group_size in words_augmentation_group_sizes:
+                candidates += ModifyingDocuments.words_augmentation(
+                    words, group_size, words_augmentation_join_char
+                )
+        num_stopwords = sum(1 for word in candidates if word in stopwords)
+        # Augmented words can push the count above the number of words, hence
+        # the cap at 1.0.
+        return min(num_stopwords / len(words), 1.0)
+
+    @staticmethod
+    def check_stopwords(
+        document,
+        sentencepiece_model_tok,
+        strip_characters,
+        cond_words_augmentation,
+        words_augmentation_group_sizes,
+        words_augmentation_join_char,
+        stopwords,
+        stopwords_min_cutoff,
+    ):
+        """Accept `document` if its stopwords ratio reaches `stopwords_min_cutoff`."""
+        if not stopwords:  # No stopwords available: nothing to check
+            return True
+        stopwords_ratio = Filtering.compute_stopwords_ratio(
+            document,
+            sentencepiece_model_tok,
+            strip_characters,
+            cond_words_augmentation,
+            words_augmentation_group_sizes,
+            words_augmentation_join_char,
+            stopwords,
+        )
+        return stopwords_ratio >= stopwords_min_cutoff
+
+    @staticmethod
+    def compute_badwords_ratio(
+        document,
+        sentencepiece_model_tok,
+        strip_characters,
+        cond_words_augmentation,
+        words_augmentation_group_sizes,
+        words_augmentation_join_char,
+        badwords,
+    ):
+        """Return the number of badwords found in `document` divided by its
+        number of words, capped at 1.0 (0 for a document without words).
+
+        With `cond_words_augmentation`, the augmented words built for each
+        group size are searched for badwords as well.
+        """
+        words = ModifyingDocuments.get_words_from_document(
+            document,
+            sentencepiece_model_tok,
+            lower_case=True,
+            strip_characters=strip_characters,
+        )
+        if not words:
+            return 0
+        augmentation = []
+        if cond_words_augmentation:
+            augmentation = [
+                ModifyingDocuments.words_augmentation(
+                    words, group_size, words_augmentation_join_char
+                )
+                for group_size in words_augmentation_group_sizes
+            ]
+            augmentation = [word for augm in augmentation for word in augm]
+        num_badwords = sum(1 for word in words + augmentation if word in badwords)
+        # Augmented words can push the count above the number of words.
+        return min(num_badwords / len(words), 1.0)
+
+    @staticmethod
+    def check_badwords(
+        document,
+        sentencepiece_model_tok,
+        strip_characters,
+        cond_words_augmentation,
+        words_augmentation_group_sizes,
+        words_augmentation_join_char,
+        badwords,
+        badwords_max_cutoff,
+    ):
+        """Accept `document` if its badwords ratio is within `badwords_max_cutoff`."""
+        if not badwords:  # No badwords available: nothing to check
+            return True
+        badwords_ratio = Filtering.compute_badwords_ratio(
+            document,
+            sentencepiece_model_tok,
+            strip_characters,
+            cond_words_augmentation,
+            words_augmentation_group_sizes,
+            words_augmentation_join_char,
+            badwords,
+        )
+        return badwords_ratio <= badwords_max_cutoff
+
+    @staticmethod
+    def compute_lang_id_pred_score(document, model_lang_id):
+        """Predict the language of `document` with `model_lang_id`.
+
+        Returns the dataset id matching the predicted fastText label
+        ("unknown" when `langs_id` has no such label) and the score."""
+        # Lower-case the text and turn newlines into spaces before predicting.
+        pred = model_lang_id.predict(document.lower().replace("\n", " "))
+        # Drop the "__label__" prefix from the first predicted label.
+        fasttext_id = pred[0][0].replace("__label__", "")
+        # Map the fastText id to the dataset id through the `langs_id` table.
+        matches = langs_id.loc[langs_id["fasttext_id"] == fasttext_id, "dataset_id"]
+        lang_pred_dataset_id = matches.iloc[0] if len(matches) > 0 else "unknown"
+        return lang_pred_dataset_id, pred[1][0]
+
+    @staticmethod
+    def check_lang_id(
+        document,
+        lang_dataset_id,
+        model_lang_id,
+        lang_id_min_cutoff,
+    ):
+        """Accept `document` when it is predicted to be in `lang_dataset_id`
+        with a score of at least `lang_id_min_cutoff`; always accept when
+        there is no `model_lang_id`."""
+        if not model_lang_id:
+            return True
+        lang, score = Filtering.compute_lang_id_pred_score(document, model_lang_id)
+        if lang != lang_dataset_id:
+            return False
+        return score >= lang_id_min_cutoff
+
+    @staticmethod
+    def compute_perplexity_score(document, sentencepiece_model, kenlm_model):
+        """Return the perplexity score of `document`, rounded to one decimal.
+        The document is normalized and tokenized first, then every line is
+        scored with `kenlm_model`."""
+        document = ModifyingDocuments.normalization(
+            document=document,
+            remove_non_printing_characters=True,
+            strip=True,
+            lower_case=True,
+            uniform_whitespace=True,
+            replace_digits_with_zeros=True,
+            replace_unicode_punctuation=True,
+        )
+        document = ModifyingDocuments.tokenization(
+            document, sentencepiece_model, join_on_whitespace=True
+        )
+        total_log_score, total_length = 0, 0
+        for line in document.split("\n"):
+            total_log_score += kenlm_model.score(line)
+            # One more than the token count; presumably the end of sentence.
+            total_length += len(line.split()) + 1
+        return round(10.0 ** (-total_log_score / total_length), 1)
+
+    @staticmethod
+    def check_perplexity(
+        document,
+        sentencepiece_model,
+        kenlm_model,
+        perplexity_max_cutoff,
+    ):
+        """Accept `document` when its perplexity score does not exceed
+        `perplexity_max_cutoff`; always accept without a `kenlm_model`."""
+        if not kenlm_model:
+            return True
+        return perplexity_max_cutoff >= Filtering.compute_perplexity_score(
+            document, sentencepiece_model, kenlm_model
+        )
+
+    @staticmethod
+    def filtering(
+        document,
+        cond_check_number_words,
+        sentencepiece_model_tok,
+        strip_characters,
+        number_words_min_cutoff,
+        number_words_max_cutoff,
+        cond_check_repetitions_removal,
+        repetitions_length,
+        repetitions_max_cutoff,
+        cond_check_special_characters,
+        special_characters,
+        special_characters_max_cutoff,
+        cond_words_augmentation,
+        words_augmentation_group_sizes,
+        words_augmentation_join_char,
+        cond_check_stopwords,
+        stopwords,
+        stopwords_min_cutoff,
+        cond_check_badwords,
+        badwords,
+        badwords_max_cutoff,
+        cond_check_lang_id,
+        lang_dataset_id,
+        model_lang_id,
+        lang_id_min_cutoff,
+        cond_check_perplexity,
+        sentencepiece_model,
+        kenlm_model,
+        perplexity_max_cutoff,
+    ):
+        """Tell whether `document` passes every enabled check.
+
+        Each `cond_check_*` flag switches one check on. The enabled checks run
+        in the order number of words, repetitions, special characters,
+        stopwords, badwords, language identification and perplexity; the
+        first failing check makes the function return False.
+        """
+        if cond_check_number_words and not Filtering.check_number_words(
+            document,
+            sentencepiece_model_tok,
+            strip_characters,
+            number_words_min_cutoff,
+            number_words_max_cutoff,
+        ):
+            return False
+        if cond_check_repetitions_removal and not Filtering.check_repetitions_removal(
+            document,
+            repetitions_length,
+            repetitions_max_cutoff,
+        ):
+            return False
+        if cond_check_special_characters and not Filtering.check_special_characters(
+            document,
+            special_characters,
+            special_characters_max_cutoff,
+        ):
+            return False
+        if cond_check_stopwords and not Filtering.check_stopwords(
+            document,
+            sentencepiece_model_tok,
+            strip_characters,
+            cond_words_augmentation,
+            words_augmentation_group_sizes,
+            words_augmentation_join_char,
+            stopwords,
+            stopwords_min_cutoff,
+        ):
+            return False
+        if cond_check_badwords and not Filtering.check_badwords(
+            document,
+            sentencepiece_model_tok,
+            strip_characters,
+            cond_words_augmentation,
+            words_augmentation_group_sizes,
+            words_augmentation_join_char,
+            badwords,
+            badwords_max_cutoff,
+        ):
+            return False
+        if cond_check_lang_id and not Filtering.check_lang_id(
+            document,
+            lang_dataset_id,
+            model_lang_id,
+            lang_id_min_cutoff,
+        ):
+            return False
+        if cond_check_perplexity and not Filtering.check_perplexity(
+            document,
+            sentencepiece_model,
+            kenlm_model,
+            perplexity_max_cutoff,
+        ):
+            return False
+        return True
+
+
+class FunctionDatasetFiltering:  # Callable wrapping `Filtering.filtering` for one language
+    def __init__(
+        self,
+        lang_dataset_id,
+        path_fasttext_model,
+        path_sentencepiece_model,
+        path_kenlm_model,
+    ):
+        self.lang_dataset_id = lang_dataset_id
+        self.path_fasttext_model = path_fasttext_model  # Paths are kept for `__reduce__`
+        self.path_sentencepiece_model = path_sentencepiece_model
+        self.path_kenlm_model = path_kenlm_model
+
+        self.param = LoadParameters.load_parameters(lang_dataset_id)
+        self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
+        self.badwords = LoadParameters.load_badwords(lang_dataset_id)
+        self.model_lang_id = LoadParameters.load_model_lang_id(
+            lang_dataset_id, path_fasttext_model
+        )
+        self.sentencepiece_model = LoadParameters.load_sentencepiece_model(
+            lang_dataset_id, path_sentencepiece_model
+        )
+        self.sentencepiece_model_tok = (  # Tokenization model, only set when enabled
+            self.sentencepiece_model if self.param["tokenization"] else None
+        )
+        self.kenlm_model = LoadParameters.load_kenlm_model(
+            lang_dataset_id, path_kenlm_model
+        )
+
+    def __call__(self, example):  # Returns the verdict of `Filtering.filtering`
+        keep_example = Filtering.filtering(
+            document=example["text"],
+            cond_check_number_words=self.param["cond_check_number_words"],
+            sentencepiece_model_tok=self.sentencepiece_model_tok,
+            strip_characters=self.param["strip_characters"],
+            number_words_min_cutoff=self.param["number_words_min_cutoff"],
+            number_words_max_cutoff=self.param["number_words_max_cutoff"],
+            # NOTE: this parameter key has no "cond_" prefix, unlike its keyword
+            cond_check_repetitions_removal=self.param["check_repetitions_removal"],
+            repetitions_length=self.param["repetitions_length"],
+            repetitions_max_cutoff=self.param["repetitions_max_cutoff"],
+            cond_check_special_characters=self.param["cond_check_special_characters"],
+            special_characters=self.param["special_characters"],
+            special_characters_max_cutoff=self.param["special_characters_max_cutoff"],
+            cond_words_augmentation=self.param["cond_words_augmentation"],
+            words_augmentation_group_sizes=self.param["words_augmentation_group_sizes"],
+            words_augmentation_join_char=self.param["words_augmentation_join_char"],
+            cond_check_stopwords=self.param["cond_check_stopwords"],
+            stopwords=self.stopwords,
+            stopwords_min_cutoff=self.param["stopwords_min_cutoff"],
+            cond_check_badwords=self.param["cond_check_badwords"],
+            badwords=self.badwords,
+            badwords_max_cutoff=self.param["badwords_max_cutoff"],
+            cond_check_lang_id=self.param["cond_check_lang_id"],
+            lang_dataset_id=self.lang_dataset_id,
+            model_lang_id=self.model_lang_id,
+            lang_id_min_cutoff=self.param["lang_id_min_cutoff"],
+            cond_check_perplexity=self.param["cond_check_perplexity"],
+            sentencepiece_model=self.sentencepiece_model,
+            kenlm_model=self.kenlm_model,
+            perplexity_max_cutoff=self.param["perplexity_max_cutoff"],
+        )
+        return keep_example
+
+    def __reduce__(self):  # Pickle as constructor arguments; `__init__` reloads the rest
+        return (
+            self.__class__,
+            (
+                self.lang_dataset_id,
+                self.path_fasttext_model,
+                self.path_sentencepiece_model,
+                self.path_kenlm_model,
+            ),
+        )
+
+
+class DatasetFiltering:
+    """Run the document modifications and the filtering over a dataset."""
+    def __init__(
+        self,
+        dataset,
+        lang_dataset_id,
+        path_fasttext_model,
+        path_sentencepiece_model,
+        path_kenlm_model,
+        num_proc,
+        path_dir_save_dataset,
+    ):
+        self.ds = dataset
+        self.lang_dataset_id = lang_dataset_id
+        self.path_fasttext_model = path_fasttext_model
+        self.path_sentencepiece_model = path_sentencepiece_model
+        self.path_kenlm_model = path_kenlm_model
+        self.num_proc = num_proc
+        self.path_dir_save_dataset = path_dir_save_dataset
+
+    def modifying_documents(self):
+        """Apply the document modifications to every example."""
+        modify = FunctionDatasetModifyingDocuments(self.lang_dataset_id)
+        self.ds = self.ds.map(modify, num_proc=self.num_proc)
+
+    def filtering(self):
+        """Keep only the examples accepted by the filters."""
+        keep_example = FunctionDatasetFiltering(
+            self.lang_dataset_id,
+            self.path_fasttext_model,
+            self.path_sentencepiece_model,
+            self.path_kenlm_model,
+        )
+        self.ds = self.ds.filter(keep_example, num_proc=self.num_proc)
+
+    def save_dataset(self):
+        """Save the dataset under `path_dir_save_dataset`/`lang_dataset_id`."""
+        pathlib.Path(self.path_dir_save_dataset).mkdir(parents=True, exist_ok=True)
+        target_dir = pathlib.PurePath(self.path_dir_save_dataset, self.lang_dataset_id)
+        pathlib.Path(target_dir).mkdir(parents=True, exist_ok=True)
+        self.ds.save_to_disk(target_dir)
diff --git a/languages_id.py b/languages_id.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b7747e93374668bccdd282b736cee321735b9f5
--- /dev/null
+++ b/languages_id.py
@@ -0,0 +1,231 @@
+import pandas as pd
+
+
+langs_id = [
+    {
+        "lang": "Afrikaans",
+        "dataset_id": "af",
+        "stopwords_id": "af",
+        "badwords_id": None,
+        "fasttext_id": "af",
+        "sentencepiece_id": "af",
+        "kenlm_id": "af",
+    },
+    {
+        "lang": "Arabic",
+        "dataset_id": "ar",
+        "stopwords_id": "ar",
+        "badwords_id": "ar",
+        "fasttext_id": "ar",
+        "sentencepiece_id": "ar",
+        "kenlm_id": "ar",
+    },
+    {
+        "lang": "Egyptian Arabic",
+        "dataset_id": "arz",
+        "stopwords_id": None,
+        "badwords_id": None,
+        "fasttext_id": "arz",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Assamese",
+        "dataset_id": "as",
+        "stopwords_id": None,
+        "badwords_id": None,
+        "fasttext_id": "as",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Bengali",
+        "dataset_id": "bn",
+        "stopwords_id": "bn",
+        "badwords_id": None,
+        "fasttext_id": "bn",
+        "sentencepiece_id": "bn",
+        "kenlm_id": "bn",
+    },
+    {
+        "lang": "Catalan",
+        "dataset_id": "ca",
+        "stopwords_id": "ca",
+        "badwords_id": "ca",
+        "fasttext_id": "ca",
+        "sentencepiece_id": "ca",
+        "kenlm_id": "ca",
+    },
+    {
+        "lang": "English",
+        "dataset_id": "en",
+        "stopwords_id": "en",
+        "badwords_id": "en",
+        "fasttext_id": "en",
+        "sentencepiece_id": "en",
+        "kenlm_id": "en",
+    },
+    {
+        "lang": "Spanish",
+        "dataset_id": "es",
+        "stopwords_id": "es",
+        "badwords_id": "es",
+        "fasttext_id": "es",
+        "sentencepiece_id": "es",
+        "kenlm_id": "es",
+    },
+    {
+        "lang": "Basque",
+        "dataset_id": "eu",
+        "stopwords_id": "eu",
+        "badwords_id": "eu",
+        "fasttext_id": "eu",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "French",
+        "dataset_id": "fr",
+        "stopwords_id": "fr",
+        "badwords_id": "fr",
+        "fasttext_id": "fr",
+        "sentencepiece_id": "fr",
+        "kenlm_id": "fr",
+    },
+    {
+        "lang": "Gujarati",
+        "dataset_id": "gu",
+        "stopwords_id": None,
+        "badwords_id": None,
+        "fasttext_id": "gu",
+        "sentencepiece_id": "gu",
+        "kenlm_id": "gu",
+    },
+    {
+        "lang": "Hindi",
+        "dataset_id": "hi",
+        "stopwords_id": "hi",
+        "badwords_id": "hi",
+        "fasttext_id": "hi",
+        "sentencepiece_id": "hi",
+        "kenlm_id": "hi",
+    },
+    {
+        "lang": "Indonesian",
+        "dataset_id": "id",
+        "stopwords_id": "id",
+        "badwords_id": "id",
+        "fasttext_id": "id",
+        "sentencepiece_id": "id",
+        "kenlm_id": "id",
+    },
+    {
+        "lang": "Kannada",
+        "dataset_id": "kn",
+        "stopwords_id": None,
+        "badwords_id": "kn",
+        "fasttext_id": "kn",
+        "sentencepiece_id": "kn",
+        "kenlm_id": "kn",
+    },
+    {
+        "lang": "Malayalam",
+        "dataset_id": "ml",
+        "stopwords_id": None,
+        "badwords_id": "ml",
+        "fasttext_id": "ml",
+        "sentencepiece_id": "ml",
+        "kenlm_id": "ml",
+    },
+    {
+        "lang": "Marathi",
+        "dataset_id": "mr",
+        "stopwords_id": "mr",
+        "badwords_id": "mr",
+        "fasttext_id": "mr",
+        "sentencepiece_id": "mr",
+        "kenlm_id": "mr",
+    },
+    {
+        "lang": "Portuguese",
+        "dataset_id": "pt",
+        "stopwords_id": "pt",
+        "badwords_id": "pt",
+        "fasttext_id": "pt",
+        "sentencepiece_id": "pt",
+        "kenlm_id": "pt",
+    },
+    {
+        "lang": "Somali",
+        "dataset_id": "so",
+        "stopwords_id": "so",
+        "badwords_id": None,
+        "fasttext_id": "so",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Swahili",
+        "dataset_id": "sw",
+        "stopwords_id": "sw",
+        "badwords_id": None,
+        "fasttext_id": "sw",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Tamil",
+        "dataset_id": "ta",
+        "stopwords_id": None,
+        "badwords_id": None,
+        "fasttext_id": "ta",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Telugu",
+        "dataset_id": "te",
+        "stopwords_id": None,
+        "badwords_id": "te",
+        "fasttext_id": "te",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Urdu",
+        "dataset_id": "ur",
+        "stopwords_id": "ur",
+        "badwords_id": None,
+        "fasttext_id": "ur",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Vietnamese",
+        "dataset_id": "vi",
+        "stopwords_id": "vi",
+        "badwords_id": "vi",
+        "fasttext_id": "vi",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Yoruba",
+        "dataset_id": "yo",
+        "stopwords_id": "yo",
+        "badwords_id": None,
+        "fasttext_id": "yo",
+        "sentencepiece_id": None,
+        "kenlm_id": None,
+    },
+    {
+        "lang": "Chinese",
+        "dataset_id": "zh",
+        "stopwords_id": "zh",
+        "badwords_id": "zh",
+        "fasttext_id": "zh",
+        "sentencepiece_id": "zh",
+        "kenlm_id": "zh",
+    },
+]
+langs_id = pd.DataFrame(langs_id)  # Rebinds the name: the list of dicts becomes a DataFrame
diff --git a/lid.176.bin b/lid.176.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f8707035ea3cc86ac248a4e31fa6368cd845476a
--- /dev/null
+++ b/lid.176.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e
+size 131266198
diff --git a/normalization.py b/normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..652e810fb5019c5177f6fd0abf9635f322f23927
--- /dev/null
+++ b/normalization.py
@@ -0,0 +1,52 @@
+import re
+from typing import Dict
+
+
+non_printing_characters_re = re.compile(  # One character with code point 0-31 or 127-159
+    f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
+)
+
+digits_re: re.Pattern = re.compile(r"\d")  # One digit
+
+unicode_punctuation: Dict[str, str] = {  # Unicode punctuation -> ASCII replacement
+    "，": ",",
+    "。": ".",
+    "、": ",",
+    "„": '"',
+    "”": '"',
+    "“": '"',
+    "«": '"',
+    "»": '"',
+    "１": '"',  # NOTE(review): a fullwidth digit one mapped to a double quote; confirm this is intended
+    "」": '"',
+    "「": '"',
+    "《": '"',
+    "》": '"',
+    "´": "'",
+    "∶": ":",
+    "：": ":",
+    "？": "?",
+    "！": "!",
+    "（": "(",
+    "）": ")",
+    "；": ";",
+    "–": "-",
+    "—": " - ",  # Replacement is padded with spaces
+    "．": ". ",  # Replacement carries a trailing space
+    "～": "~",
+    "’": "'",
+    "…": "...",
+    "━": "-",
+    "〈": "<",
+    "〉": ">",
+    "【": "[",
+    "】": "]",
+    "％": "%",
+    "►": "-",
+}
+
+normalization = {  # Bundles the module's regexes and mapping under string keys
+    "non_printing_characters_re": non_printing_characters_re,
+    "digits_re": digits_re,
+    "unicode_punctuation": unicode_punctuation,
+}
diff --git a/requirements.txt b/packages.txt
similarity index 100%
rename from requirements.txt
rename to packages.txt
diff --git a/parameters_filtering.py b/parameters_filtering.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ac5f1adaae1bf1ffeb4639186551a0181cb4410
--- /dev/null
+++ b/parameters_filtering.py
@@ -0,0 +1,852 @@
+import string
+import emoji
+
+
+main_special_characters = string.punctuation + string.digits + string.whitespace  # one str: ASCII punctuation, digits, whitespace
+other_special_characters = (
+    "    　    ￼’“”–ー一▬…✦�­£​•€«»°·═"
+    "×士＾˘⇓↓↑←→（）§″′´¿−±∈﻿¢ø‚„½¼¾¹²³―⁃，ˌ¸‹›ʺˈʻ¦‐⠀‰‑≤≥‖"
+    "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン：∼⁄・♡✓⊕․．⋅÷１‟；،、¨ाাी्े◦˚"
+    "゜ʼ≖ʼ¤ッツシ℃√！【】‿∞➤～πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬x？▷Г♫∟™ª₪®「—"
+    "❖」﴾》"
+)
+emoji = list(emoji.UNICODE_EMOJI["en"].keys())  # NOTE(review): rebinds the imported module name to a list; UNICODE_EMOJI is reportedly gone in emoji>=2.0 - confirm the pinned version
+
+special_characters_default = set(main_special_characters + other_special_characters)  # set of single characters
+special_characters_default.update(emoji)  # each emoji key is added as one whole element
+
+
+def _make_parameters_filtering(**overrides):
+    """Build one filtering configuration from the shared baseline.
+
+    The baseline carries the values of the "default" configuration; each
+    language passes only the settings it tunes. A new dict and new list
+    objects are created on every call, so configurations never share a
+    mutable list. Both character-set entries reference the module-level
+    ``special_characters_default`` set.
+
+    Args:
+        **overrides: parameter names mapped to the values that replace the
+            baseline ones.
+
+    Returns:
+        A dict holding every filtering parameter, in the baseline key order.
+
+    Raises:
+        KeyError: if an override is not a known parameter name, which would
+            otherwise silently add an unused key.
+    """
+    parameters = {
+        "cond_uniform_whitespace": True,
+        "cond_replace_unicode_punctuation": False,
+        "cond_remove_words_with_incorrect_substrings": False,
+        "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
+        "cond_remove_long_words": False,
+        "length_word_max_cutoff": 50,
+        "cond_check_number_words": True,
+        "tokenization": False,
+        "strip_characters": special_characters_default,
+        "number_words_min_cutoff": 1,
+        "number_words_max_cutoff": 100000,
+        "check_repetitions_removal": True,
+        "repetitions_length": 10,
+        "repetitions_max_cutoff": 0.106,
+        "cond_check_special_characters": True,
+        "special_characters": special_characters_default,
+        "special_characters_max_cutoff": 0.4,
+        "cond_words_augmentation": False,
+        "words_augmentation_group_sizes": [],
+        "words_augmentation_join_char": "",
+        "cond_check_stopwords": False,
+        "stopwords_min_cutoff": 0,
+        "cond_check_badwords": False,
+        "badwords_max_cutoff": 0.2,
+        "cond_check_lang_id": True,
+        "lang_id_min_cutoff": 0.70,
+        "cond_check_perplexity": False,
+        "perplexity_max_cutoff": 3000000,
+    }
+    # Reject misspelled names instead of storing a key nothing reads.
+    unknown = sorted(set(overrides) - set(parameters))
+    if unknown:
+        raise KeyError(f"unknown filtering parameters: {unknown}")
+    # Updating existing keys keeps the baseline key order.
+    parameters.update(overrides)
+    return parameters
+
+
+# Each configuration below spells out the settings tuned per language, even
+# when a value happens to equal the baseline, so it can be read on its own.
+
+parameters_filtering_default = _make_parameters_filtering()
+
+parameters_filtering_af = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=25,
+    special_characters_max_cutoff=0.3,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.6,
+    cond_check_perplexity=True,
+    perplexity_max_cutoff=3000000,
+)
+
+parameters_filtering_ar = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=25,
+    special_characters_max_cutoff=0.45,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=True,
+    perplexity_max_cutoff=1000000,
+)
+
+parameters_filtering_arz = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=25,
+    special_characters_max_cutoff=0.5,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=False,
+    perplexity_max_cutoff=3000000,
+)
+
+parameters_filtering_as = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=25,
+    special_characters_max_cutoff=0.25,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=False,
+    perplexity_max_cutoff=3000000,
+)
+
+parameters_filtering_bn = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=30,
+    special_characters_max_cutoff=0.275,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0.05,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=False,
+    perplexity_max_cutoff=575000,
+)
+
+parameters_filtering_ca = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=30,
+    special_characters_max_cutoff=0.35,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=True,
+    perplexity_max_cutoff=1750000,
+)
+
+parameters_filtering_en = _make_parameters_filtering(
+    cond_remove_words_with_incorrect_substrings=True,
+    cond_remove_long_words=True,
+    length_word_max_cutoff=25,
+    number_words_min_cutoff=20,
+    special_characters_max_cutoff=0.4,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0.3,
+    cond_check_badwords=True,
+    badwords_max_cutoff=0.045,
+    lang_id_min_cutoff=0.80,
+    cond_check_perplexity=True,
+    perplexity_max_cutoff=2500,
+)
+
+parameters_filtering_es = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=30,
+    special_characters_max_cutoff=0.3,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0.2,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=True,
+    perplexity_max_cutoff=2500000,
+)
+
+parameters_filtering_eu = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=35,
+    special_characters_max_cutoff=0.3,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=False,
+    perplexity_max_cutoff=3000000,
+)
+
+parameters_filtering_fr = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=30,
+    special_characters_max_cutoff=0.35,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0.15,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=True,
+    perplexity_max_cutoff=3000000,
+)
+
+parameters_filtering_gu = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=30,
+    special_characters_max_cutoff=0.3,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=True,
+    perplexity_max_cutoff=250000,
+)
+
+parameters_filtering_hi = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=25,
+    special_characters_max_cutoff=0.35,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=True,
+    perplexity_max_cutoff=600000,
+)
+
+parameters_filtering_id = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=30,
+    special_characters_max_cutoff=0.25,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0.25,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=True,
+    perplexity_max_cutoff=2500000,
+)
+
+parameters_filtering_kn = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=50,
+    special_characters_max_cutoff=0.25,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=True,
+    perplexity_max_cutoff=400000,
+)
+
+parameters_filtering_ml = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=50,
+    special_characters_max_cutoff=0.2,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=True,
+    perplexity_max_cutoff=1600000,
+)
+
+parameters_filtering_mr = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=30,
+    special_characters_max_cutoff=0.25,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=True,
+    perplexity_max_cutoff=425000,
+)
+
+parameters_filtering_pt = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=30,
+    special_characters_max_cutoff=0.3,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0.15,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=True,
+    perplexity_max_cutoff=3000000,
+)
+
+parameters_filtering_so = _make_parameters_filtering(
+    cond_remove_long_words=False,
+    length_word_max_cutoff=1000,
+    special_characters_max_cutoff=0.3,
+    cond_check_stopwords=False,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=False,
+    perplexity_max_cutoff=3000000,
+)
+
+parameters_filtering_sw = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=30,
+    special_characters_max_cutoff=0.275,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=False,
+    perplexity_max_cutoff=3000000,
+)
+
+parameters_filtering_ta = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=50,
+    special_characters_max_cutoff=0.25,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=False,
+    perplexity_max_cutoff=3000000,
+)
+
+parameters_filtering_te = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=35,
+    special_characters_max_cutoff=0.25,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=False,
+    perplexity_max_cutoff=3000000,
+)
+
+parameters_filtering_ur = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=30,
+    special_characters_max_cutoff=0.4,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=False,
+    perplexity_max_cutoff=3000000,
+)
+
+parameters_filtering_vi = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=30,
+    special_characters_max_cutoff=0.35,
+    cond_words_augmentation=True,
+    words_augmentation_group_sizes=[2, 3],
+    words_augmentation_join_char=" ",
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=False,
+    perplexity_max_cutoff=3000000,
+)
+
+parameters_filtering_yo = _make_parameters_filtering(
+    cond_remove_long_words=True,
+    length_word_max_cutoff=30,
+    special_characters_max_cutoff=0.3,
+    cond_check_stopwords=True,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=False,
+    perplexity_max_cutoff=3000000,
+)
+
+parameters_filtering_zh = _make_parameters_filtering(
+    cond_remove_long_words=False,
+    length_word_max_cutoff=1000,
+    tokenization=True,
+    special_characters_max_cutoff=0.4,
+    cond_words_augmentation=True,
+    words_augmentation_group_sizes=[2, 3],
+    words_augmentation_join_char="",
+    cond_check_stopwords=False,
+    stopwords_min_cutoff=0,
+    lang_id_min_cutoff=0.75,
+    cond_check_perplexity=False,
+    perplexity_max_cutoff=3000000,
+)
+
+# Registry: configuration id -> its parameter dict.
+parameters_filtering = {
+    "default": parameters_filtering_default,
+    "af": parameters_filtering_af,
+    "ar": parameters_filtering_ar,
+    "arz": parameters_filtering_arz,
+    "as": parameters_filtering_as,
+    "bn": parameters_filtering_bn,
+    "ca": parameters_filtering_ca,
+    "en": parameters_filtering_en,
+    "es": parameters_filtering_es,
+    "eu": parameters_filtering_eu,
+    "fr": parameters_filtering_fr,
+    "gu": parameters_filtering_gu,
+    "hi": parameters_filtering_hi,
+    "id": parameters_filtering_id,
+    "kn": parameters_filtering_kn,
+    "ml": parameters_filtering_ml,
+    "mr": parameters_filtering_mr,
+    "pt": parameters_filtering_pt,
+    "so": parameters_filtering_so,
+    "sw": parameters_filtering_sw,
+    "ta": parameters_filtering_ta,
+    "te": parameters_filtering_te,
+    "ur": parameters_filtering_ur,
+    "vi": parameters_filtering_vi,
+    "yo": parameters_filtering_yo,
+    "zh": parameters_filtering_zh,
+}
diff --git a/stopwords.py b/stopwords.py
new file mode 100644
index 0000000000000000000000000000000000000000..e75bbd4fcf3860add6204b0c413703f37074d9b7
--- /dev/null
+++ b/stopwords.py
@@ -0,0 +1,5395 @@
+# From https://github.com/6/stopwords-json
+# From https://github.com/stopwords-iso/stopwords-iso for Urdu and Vietnamese
+
+
+stopwords = {
+    "af": [
+        "'n",
+        "aan",
+        "af",
+        "al",
+        "as",
+        "baie",
+        "by",
+        "daar",
+        "dag",
+        "dat",
+        "die",
+        "dit",
+        "een",
+        "ek",
+        "en",
+        "gaan",
+        "gesê",
+        "haar",
+        "het",
+        "hom",
+        "hulle",
+        "hy",
+        "in",
+        "is",
+        "jou",
+        "jy",
+        "kan",
+        "kom",
+        "ma",
+        "maar",
+        "met",
+        "my",
+        "na",
+        "nie",
+        "om",
+        "ons",
+        "op",
+        "saam",
+        "sal",
+        "se",
+        "sien",
+        "so",
+        "sy",
+        "te",
+        "toe",
+        "uit",
+        "van",
+        "vir",
+        "was",
+        "wat",
+        "ŉ",
+    ],
+    "ar": [
+        "،",
+        "أ",
+        "ا",
+        "اثر",
+        "اجل",
+        "احد",
+        "اخرى",
+        "اذا",
+        "اربعة",
+        "اطار",
+        "اعادة",
+        "اعلنت",
+        "اف",
+        "اكثر",
+        "اكد",
+        "الا",
+        "الاخيرة",
+        "الان",
+        "الاول",
+        "الاولى",
+        "التى",
+        "التي",
+        "الثاني",
+        "الثانية",
+        "الذاتي",
+        "الذى",
+        "الذي",
+        "الذين",
+        "السابق",
+        "الف",
+        "الماضي",
+        "المقبل",
+        "الوقت",
+        "الى",
+        "اليوم",
+        "اما",
+        "امام",
+        "امس",
+        "ان",
+        "انه",
+        "انها",
+        "او",
+        "اول",
+        "اي",
+        "ايار",
+        "ايام",
+        "ايضا",
+        "ب",
+        "باسم",
+        "بان",
+        "برس",
+        "بسبب",
+        "بشكل",
+        "بعد",
+        "بعض",
+        "بن",
+        "به",
+        "بها",
+        "بين",
+        "تم",
+        "ثلاثة",
+        "ثم",
+        "جميع",
+        "حاليا",
+        "حتى",
+        "حوالى",
+        "حول",
+        "حيث",
+        "حين",
+        "خلال",
+        "دون",
+        "ذلك",
+        "زيارة",
+        "سنة",
+        "سنوات",
+        "شخصا",
+        "صباح",
+        "صفر",
+        "ضد",
+        "ضمن",
+        "عام",
+        "عاما",
+        "عدة",
+        "عدد",
+        "عدم",
+        "عشر",
+        "عشرة",
+        "على",
+        "عليه",
+        "عليها",
+        "عن",
+        "عند",
+        "عندما",
+        "غدا",
+        "غير",
+        "ـ",
+        "ف",
+        "فان",
+        "فى",
+        "في",
+        "فيه",
+        "فيها",
+        "قال",
+        "قبل",
+        "قد",
+        "قوة",
+        "كان",
+        "كانت",
+        "كل",
+        "كلم",
+        "كما",
+        "لا",
+        "لدى",
+        "لقاء",
+        "لكن",
+        "للامم",
+        "لم",
+        "لن",
+        "له",
+        "لها",
+        "لوكالة",
+        "ما",
+        "مايو",
+        "مساء",
+        "مع",
+        "مقابل",
+        "مليار",
+        "مليون",
+        "من",
+        "منذ",
+        "منها",
+        "نحو",
+        "نفسه",
+        "نهاية",
+        "هذا",
+        "هذه",
+        "هناك",
+        "هو",
+        "هي",
+        "و",
+        "و6",
+        "واحد",
+        "واضاف",
+        "واضافت",
+        "واكد",
+        "وان",
+        "واوضح",
+        "وفي",
+        "وقال",
+        "وقالت",
+        "وقد",
+        "وقف",
+        "وكان",
+        "وكانت",
+        "ولا",
+        "ولم",
+        "ومن",
+        "وهو",
+        "وهي",
+        "يكون",
+        "يمكن",
+        "يوم",
+    ],
+    "bn": [
+        "অনেক",
+        "অন্য",
+        "অবশ্য",
+        "আগে",
+        "আছে",
+        "আজ",
+        "আবার",
+        "আমরা",
+        "আমাদের",
+        "আর",
+        "ই",
+        "উত্তর",
+        "উপর",
+        "উপরে",
+        "এ",
+        "এই",
+        "এক্",
+        "এখন",
+        "এত",
+        "এব",
+        "এমন",
+        "এমনি",
+        "এর",
+        "এস",
+        "এসে",
+        "ও",
+        "ওই",
+        "কমনে",
+        "করা",
+        "করে",
+        "কাছে",
+        "কাজ",
+        "কাজে",
+        "কারণ",
+        "কি",
+        "কিছু",
+        "কে",
+        "কেউ",
+        "কেখা",
+        "কেন",
+        "কোটি",
+        "কোনো",
+        "কয়েক",
+        "খুব",
+        "গিয়ে",
+        "গেল",
+        "চার",
+        "চালু",
+        "চেষ্টা",
+        "ছিল",
+        "জানা",
+        "জ্নজন",
+        "টি",
+        "তখন",
+        "তবে",
+        "তা",
+        "তাই",
+        "তো",
+        "থাকা",
+        "থেকে",
+        "দিন",
+        "দু",
+        "দুই",
+        "দেওয়া",
+        "ধামার",
+        "নতুন",
+        "না",
+        "নাগাদ",
+        "নিয়ে",
+        "নেওয়া",
+        "নয়",
+        "পর",
+        "পরে",
+        "পাচ",
+        "পি",
+        "পেয়্র্",
+        "প্রতি",
+        "প্রথম",
+        "প্রযন্ত",
+        "প্রাথমিক",
+        "প্রায়",
+        "বক্তব্য",
+        "বন",
+        "বলা",
+        "বলে",
+        "বলেন",
+        "বহু",
+        "বা",
+        "বি",
+        "বিভিন্ন",
+        "বেশ",
+        "বেশি",
+        "মতো",
+        "মধ্যে",
+        "মনে",
+        "যখন",
+        "যদি",
+        "যা",
+        "যাওয়া",
+        "যে",
+        "র",
+        "রকম",
+        "লক্ষ",
+        "শুধু",
+        "শুরু",
+        "সঙ্গে",
+        "সব",
+        "সহ",
+        "সাধারণ",
+        "সামনে",
+        "সি",
+        "সে",
+        "সেই",
+        "হতে",
+        "হাজার",
+        "হয়",
+    ],
+    "ca": [
+        "a",
+        "abans",
+        "ací",
+        "ah",
+        "així",
+        "això",
+        "al",
+        "aleshores",
+        "algun",
+        "alguna",
+        "algunes",
+        "alguns",
+        "alhora",
+        "allà",
+        "allí",
+        "allò",
+        "als",
+        "altra",
+        "altre",
+        "altres",
+        "amb",
+        "ambdues",
+        "ambdós",
+        "apa",
+        "aquell",
+        "aquella",
+        "aquelles",
+        "aquells",
+        "aquest",
+        "aquesta",
+        "aquestes",
+        "aquests",
+        "aquí",
+        "baix",
+        "cada",
+        "cadascuna",
+        "cadascunes",
+        "cadascuns",
+        "cadascú",
+        "com",
+        "contra",
+        "d'un",
+        "d'una",
+        "d'unes",
+        "d'uns",
+        "dalt",
+        "de",
+        "del",
+        "dels",
+        "des",
+        "després",
+        "dins",
+        "dintre",
+        "donat",
+        "doncs",
+        "durant",
+        "e",
+        "eh",
+        "el",
+        "els",
+        "em",
+        "en",
+        "encara",
+        "ens",
+        "entre",
+        "eren",
+        "es",
+        "esta",
+        "estaven",
+        "esteu",
+        "està",
+        "estàvem",
+        "estàveu",
+        "et",
+        "etc",
+        "ets",
+        "fins",
+        "fora",
+        "gairebé",
+        "ha",
+        "han",
+        "has",
+        "havia",
+        "he",
+        "hem",
+        "heu",
+        "hi",
+        "ho",
+        "i",
+        "igual",
+        "iguals",
+        "ja",
+        "l'hi",
+        "la",
+        "les",
+        "li",
+        "li'n",
+        "llavors",
+        "m'he",
+        "ma",
+        "mal",
+        "malgrat",
+        "mateix",
+        "mateixa",
+        "mateixes",
+        "mateixos",
+        "me",
+        "mentre",
+        "meu",
+        "meus",
+        "meva",
+        "meves",
+        "molt",
+        "molta",
+        "moltes",
+        "molts",
+        "mon",
+        "mons",
+        "més",
+        "n'he",
+        "n'hi",
+        "ne",
+        "ni",
+        "no",
+        "nogensmenys",
+        "només",
+        "nosaltres",
+        "nostra",
+        "nostre",
+        "nostres",
+        "o",
+        "oh",
+        "oi",
+        "on",
+        "pas",
+        "pel",
+        "pels",
+        "per",
+        "perquè",
+        "però",
+        "poc",
+        "poca",
+        "pocs",
+        "poques",
+        "potser",
+        "propi",
+        "qual",
+        "quals",
+        "quan",
+        "quant",
+        "que",
+        "quelcom",
+        "qui",
+        "quin",
+        "quina",
+        "quines",
+        "quins",
+        "què",
+        "s'ha",
+        "s'han",
+        "sa",
+        "semblant",
+        "semblants",
+        "ses",
+        "seu",
+        "seus",
+        "seva",
+        "seves",
+        "si",
+        "sobre",
+        "sobretot",
+        "solament",
+        "sols",
+        "son",
+        "sons",
+        "sota",
+        "sou",
+        "sóc",
+        "són",
+        "t'ha",
+        "t'han",
+        "t'he",
+        "ta",
+        "tal",
+        "també",
+        "tampoc",
+        "tan",
+        "tant",
+        "tanta",
+        "tantes",
+        "teu",
+        "teus",
+        "teva",
+        "teves",
+        "ton",
+        "tons",
+        "tot",
+        "tota",
+        "totes",
+        "tots",
+        "un",
+        "una",
+        "unes",
+        "uns",
+        "us",
+        "va",
+        "vaig",
+        "vam",
+        "van",
+        "vas",
+        "veu",
+        "vosaltres",
+        "vostra",
+        "vostre",
+        "vostres",
+        "érem",
+        "éreu",
+        "és",
+    ],
+    "en": [
+        "a",
+        "a's",
+        "able",
+        "about",
+        "above",
+        "according",
+        "accordingly",
+        "across",
+        "actually",
+        "after",
+        "afterwards",
+        "again",
+        "against",
+        "ain't",
+        "all",
+        "allow",
+        "allows",
+        "almost",
+        "alone",
+        "along",
+        "already",
+        "also",
+        "although",
+        "always",
+        "am",
+        "among",
+        "amongst",
+        "an",
+        "and",
+        "another",
+        "any",
+        "anybody",
+        "anyhow",
+        "anyone",
+        "anything",
+        "anyway",
+        "anyways",
+        "anywhere",
+        "apart",
+        "appear",
+        "appreciate",
+        "appropriate",
+        "are",
+        "aren't",
+        "around",
+        "as",
+        "aside",
+        "ask",
+        "asking",
+        "associated",
+        "at",
+        "available",
+        "away",
+        "awfully",
+        "b",
+        "be",
+        "became",
+        "because",
+        "become",
+        "becomes",
+        "becoming",
+        "been",
+        "before",
+        "beforehand",
+        "behind",
+        "being",
+        "believe",
+        "below",
+        "beside",
+        "besides",
+        "best",
+        "better",
+        "between",
+        "beyond",
+        "both",
+        "brief",
+        "but",
+        "by",
+        "c",
+        "c'mon",
+        "c's",
+        "came",
+        "can",
+        "can't",
+        "cannot",
+        "cant",
+        "cause",
+        "causes",
+        "certain",
+        "certainly",
+        "changes",
+        "clearly",
+        "co",
+        "com",
+        "come",
+        "comes",
+        "concerning",
+        "consequently",
+        "consider",
+        "considering",
+        "contain",
+        "containing",
+        "contains",
+        "corresponding",
+        "could",
+        "couldn't",
+        "course",
+        "currently",
+        "d",
+        "definitely",
+        "described",
+        "despite",
+        "did",
+        "didn't",
+        "different",
+        "do",
+        "does",
+        "doesn't",
+        "doing",
+        "don't",
+        "done",
+        "down",
+        "downwards",
+        "during",
+        "e",
+        "each",
+        "edu",
+        "eg",
+        "eight",
+        "either",
+        "else",
+        "elsewhere",
+        "enough",
+        "entirely",
+        "especially",
+        "et",
+        "etc",
+        "even",
+        "ever",
+        "every",
+        "everybody",
+        "everyone",
+        "everything",
+        "everywhere",
+        "ex",
+        "exactly",
+        "example",
+        "except",
+        "f",
+        "far",
+        "few",
+        "fifth",
+        "first",
+        "five",
+        "followed",
+        "following",
+        "follows",
+        "for",
+        "former",
+        "formerly",
+        "forth",
+        "four",
+        "from",
+        "further",
+        "furthermore",
+        "g",
+        "get",
+        "gets",
+        "getting",
+        "given",
+        "gives",
+        "go",
+        "goes",
+        "going",
+        "gone",
+        "got",
+        "gotten",
+        "greetings",
+        "h",
+        "had",
+        "hadn't",
+        "happens",
+        "hardly",
+        "has",
+        "hasn't",
+        "have",
+        "haven't",
+        "having",
+        "he",
+        "he's",
+        "hello",
+        "help",
+        "hence",
+        "her",
+        "here",
+        "here's",
+        "hereafter",
+        "hereby",
+        "herein",
+        "hereupon",
+        "hers",
+        "herself",
+        "hi",
+        "him",
+        "himself",
+        "his",
+        "hither",
+        "hopefully",
+        "how",
+        "howbeit",
+        "however",
+        "i",
+        "i'd",
+        "i'll",
+        "i'm",
+        "i've",
+        "ie",
+        "if",
+        "ignored",
+        "immediate",
+        "in",
+        "inasmuch",
+        "inc",
+        "indeed",
+        "indicate",
+        "indicated",
+        "indicates",
+        "inner",
+        "insofar",
+        "instead",
+        "into",
+        "inward",
+        "is",
+        "isn't",
+        "it",
+        "it'd",
+        "it'll",
+        "it's",
+        "its",
+        "itself",
+        "j",
+        "just",
+        "k",
+        "keep",
+        "keeps",
+        "kept",
+        "know",
+        "known",
+        "knows",
+        "l",
+        "last",
+        "lately",
+        "later",
+        "latter",
+        "latterly",
+        "least",
+        "less",
+        "lest",
+        "let",
+        "let's",
+        "like",
+        "liked",
+        "likely",
+        "little",
+        "look",
+        "looking",
+        "looks",
+        "ltd",
+        "m",
+        "mainly",
+        "many",
+        "may",
+        "maybe",
+        "me",
+        "mean",
+        "meanwhile",
+        "merely",
+        "might",
+        "more",
+        "moreover",
+        "most",
+        "mostly",
+        "much",
+        "must",
+        "my",
+        "myself",
+        "n",
+        "name",
+        "namely",
+        "nd",
+        "near",
+        "nearly",
+        "necessary",
+        "need",
+        "needs",
+        "neither",
+        "never",
+        "nevertheless",
+        "new",
+        "next",
+        "nine",
+        "no",
+        "nobody",
+        "non",
+        "none",
+        "noone",
+        "nor",
+        "normally",
+        "not",
+        "nothing",
+        "novel",
+        "now",
+        "nowhere",
+        "o",
+        "obviously",
+        "of",
+        "off",
+        "often",
+        "oh",
+        "ok",
+        "okay",
+        "old",
+        "on",
+        "once",
+        "one",
+        "ones",
+        "only",
+        "onto",
+        "or",
+        "other",
+        "others",
+        "otherwise",
+        "ought",
+        "our",
+        "ours",
+        "ourselves",
+        "out",
+        "outside",
+        "over",
+        "overall",
+        "own",
+        "p",
+        "particular",
+        "particularly",
+        "per",
+        "perhaps",
+        "placed",
+        "please",
+        "plus",
+        "possible",
+        "presumably",
+        "probably",
+        "provides",
+        "q",
+        "que",
+        "quite",
+        "qv",
+        "r",
+        "rather",
+        "rd",
+        "re",
+        "really",
+        "reasonably",
+        "regarding",
+        "regardless",
+        "regards",
+        "relatively",
+        "respectively",
+        "right",
+        "s",
+        "said",
+        "same",
+        "saw",
+        "say",
+        "saying",
+        "says",
+        "second",
+        "secondly",
+        "see",
+        "seeing",
+        "seem",
+        "seemed",
+        "seeming",
+        "seems",
+        "seen",
+        "self",
+        "selves",
+        "sensible",
+        "sent",
+        "serious",
+        "seriously",
+        "seven",
+        "several",
+        "shall",
+        "she",
+        "should",
+        "shouldn't",
+        "since",
+        "six",
+        "so",
+        "some",
+        "somebody",
+        "somehow",
+        "someone",
+        "something",
+        "sometime",
+        "sometimes",
+        "somewhat",
+        "somewhere",
+        "soon",
+        "sorry",
+        "specified",
+        "specify",
+        "specifying",
+        "still",
+        "sub",
+        "such",
+        "sup",
+        "sure",
+        "t",
+        "t's",
+        "take",
+        "taken",
+        "tell",
+        "tends",
+        "th",
+        "than",
+        "thank",
+        "thanks",
+        "thanx",
+        "that",
+        "that's",
+        "thats",
+        "the",
+        "their",
+        "theirs",
+        "them",
+        "themselves",
+        "then",
+        "thence",
+        "there",
+        "there's",
+        "thereafter",
+        "thereby",
+        "therefore",
+        "therein",
+        "theres",
+        "thereupon",
+        "these",
+        "they",
+        "they'd",
+        "they'll",
+        "they're",
+        "they've",
+        "think",
+        "third",
+        "this",
+        "thorough",
+        "thoroughly",
+        "those",
+        "though",
+        "three",
+        "through",
+        "throughout",
+        "thru",
+        "thus",
+        "to",
+        "together",
+        "too",
+        "took",
+        "toward",
+        "towards",
+        "tried",
+        "tries",
+        "truly",
+        "try",
+        "trying",
+        "twice",
+        "two",
+        "u",
+        "un",
+        "under",
+        "unfortunately",
+        "unless",
+        "unlikely",
+        "until",
+        "unto",
+        "up",
+        "upon",
+        "us",
+        "use",
+        "used",
+        "useful",
+        "uses",
+        "using",
+        "usually",
+        "uucp",
+        "v",
+        "value",
+        "various",
+        "very",
+        "via",
+        "viz",
+        "vs",
+        "w",
+        "want",
+        "wants",
+        "was",
+        "wasn't",
+        "way",
+        "we",
+        "we'd",
+        "we'll",
+        "we're",
+        "we've",
+        "welcome",
+        "well",
+        "went",
+        "were",
+        "weren't",
+        "what",
+        "what's",
+        "whatever",
+        "when",
+        "whence",
+        "whenever",
+        "where",
+        "where's",
+        "whereafter",
+        "whereas",
+        "whereby",
+        "wherein",
+        "whereupon",
+        "wherever",
+        "whether",
+        "which",
+        "while",
+        "whither",
+        "who",
+        "who's",
+        "whoever",
+        "whole",
+        "whom",
+        "whose",
+        "why",
+        "will",
+        "willing",
+        "wish",
+        "with",
+        "within",
+        "without",
+        "won't",
+        "wonder",
+        "would",
+        "wouldn't",
+        "x",
+        "y",
+        "yes",
+        "yet",
+        "you",
+        "you'd",
+        "you'll",
+        "you're",
+        "you've",
+        "your",
+        "yours",
+        "yourself",
+        "yourselves",
+        "z",
+        "zero",
+    ],
+    "es": [
+        "a",
+        "actualmente",
+        "acuerdo",
+        "adelante",
+        "ademas",
+        "además",
+        "adrede",
+        "afirmó",
+        "agregó",
+        "ahi",
+        "ahora",
+        "ahí",
+        "al",
+        "algo",
+        "alguna",
+        "algunas",
+        "alguno",
+        "algunos",
+        "algún",
+        "alli",
+        "allí",
+        "alrededor",
+        "ambos",
+        "ampleamos",
+        "antano",
+        "antaño",
+        "ante",
+        "anterior",
+        "antes",
+        "apenas",
+        "aproximadamente",
+        "aquel",
+        "aquella",
+        "aquellas",
+        "aquello",
+        "aquellos",
+        "aqui",
+        "aquél",
+        "aquélla",
+        "aquéllas",
+        "aquéllos",
+        "aquí",
+        "arriba",
+        "arribaabajo",
+        "aseguró",
+        "asi",
+        "así",
+        "atras",
+        "aun",
+        "aunque",
+        "ayer",
+        "añadió",
+        "aún",
+        "b",
+        "bajo",
+        "bastante",
+        "bien",
+        "breve",
+        "buen",
+        "buena",
+        "buenas",
+        "bueno",
+        "buenos",
+        "c",
+        "cada",
+        "casi",
+        "cerca",
+        "cierta",
+        "ciertas",
+        "cierto",
+        "ciertos",
+        "cinco",
+        "claro",
+        "comentó",
+        "como",
+        "con",
+        "conmigo",
+        "conocer",
+        "conseguimos",
+        "conseguir",
+        "considera",
+        "consideró",
+        "consigo",
+        "consigue",
+        "consiguen",
+        "consigues",
+        "contigo",
+        "contra",
+        "cosas",
+        "creo",
+        "cual",
+        "cuales",
+        "cualquier",
+        "cuando",
+        "cuanta",
+        "cuantas",
+        "cuanto",
+        "cuantos",
+        "cuatro",
+        "cuenta",
+        "cuál",
+        "cuáles",
+        "cuándo",
+        "cuánta",
+        "cuántas",
+        "cuánto",
+        "cuántos",
+        "cómo",
+        "d",
+        "da",
+        "dado",
+        "dan",
+        "dar",
+        "de",
+        "debajo",
+        "debe",
+        "deben",
+        "debido",
+        "decir",
+        "dejó",
+        "del",
+        "delante",
+        "demasiado",
+        "demás",
+        "dentro",
+        "deprisa",
+        "desde",
+        "despacio",
+        "despues",
+        "después",
+        "detras",
+        "detrás",
+        "dia",
+        "dias",
+        "dice",
+        "dicen",
+        "dicho",
+        "dieron",
+        "diferente",
+        "diferentes",
+        "dijeron",
+        "dijo",
+        "dio",
+        "donde",
+        "dos",
+        "durante",
+        "día",
+        "días",
+        "dónde",
+        "e",
+        "ejemplo",
+        "el",
+        "ella",
+        "ellas",
+        "ello",
+        "ellos",
+        "embargo",
+        "empleais",
+        "emplean",
+        "emplear",
+        "empleas",
+        "empleo",
+        "en",
+        "encima",
+        "encuentra",
+        "enfrente",
+        "enseguida",
+        "entonces",
+        "entre",
+        "era",
+        "eramos",
+        "eran",
+        "eras",
+        "eres",
+        "es",
+        "esa",
+        "esas",
+        "ese",
+        "eso",
+        "esos",
+        "esta",
+        "estaba",
+        "estaban",
+        "estado",
+        "estados",
+        "estais",
+        "estamos",
+        "estan",
+        "estar",
+        "estará",
+        "estas",
+        "este",
+        "esto",
+        "estos",
+        "estoy",
+        "estuvo",
+        "está",
+        "están",
+        "ex",
+        "excepto",
+        "existe",
+        "existen",
+        "explicó",
+        "expresó",
+        "f",
+        "fin",
+        "final",
+        "fue",
+        "fuera",
+        "fueron",
+        "fui",
+        "fuimos",
+        "g",
+        "general",
+        "gran",
+        "grandes",
+        "gueno",
+        "h",
+        "ha",
+        "haber",
+        "habia",
+        "habla",
+        "hablan",
+        "habrá",
+        "había",
+        "habían",
+        "hace",
+        "haceis",
+        "hacemos",
+        "hacen",
+        "hacer",
+        "hacerlo",
+        "haces",
+        "hacia",
+        "haciendo",
+        "hago",
+        "han",
+        "hasta",
+        "hay",
+        "haya",
+        "he",
+        "hecho",
+        "hemos",
+        "hicieron",
+        "hizo",
+        "horas",
+        "hoy",
+        "hubo",
+        "i",
+        "igual",
+        "incluso",
+        "indicó",
+        "informo",
+        "informó",
+        "intenta",
+        "intentais",
+        "intentamos",
+        "intentan",
+        "intentar",
+        "intentas",
+        "intento",
+        "ir",
+        "j",
+        "junto",
+        "k",
+        "l",
+        "la",
+        "lado",
+        "largo",
+        "las",
+        "le",
+        "lejos",
+        "les",
+        "llegó",
+        "lleva",
+        "llevar",
+        "lo",
+        "los",
+        "luego",
+        "lugar",
+        "m",
+        "mal",
+        "manera",
+        "manifestó",
+        "mas",
+        "mayor",
+        "me",
+        "mediante",
+        "medio",
+        "mejor",
+        "mencionó",
+        "menos",
+        "menudo",
+        "mi",
+        "mia",
+        "mias",
+        "mientras",
+        "mio",
+        "mios",
+        "mis",
+        "misma",
+        "mismas",
+        "mismo",
+        "mismos",
+        "modo",
+        "momento",
+        "mucha",
+        "muchas",
+        "mucho",
+        "muchos",
+        "muy",
+        "más",
+        "mí",
+        "mía",
+        "mías",
+        "mío",
+        "míos",
+        "n",
+        "nada",
+        "nadie",
+        "ni",
+        "ninguna",
+        "ningunas",
+        "ninguno",
+        "ningunos",
+        "ningún",
+        "no",
+        "nos",
+        "nosotras",
+        "nosotros",
+        "nuestra",
+        "nuestras",
+        "nuestro",
+        "nuestros",
+        "nueva",
+        "nuevas",
+        "nuevo",
+        "nuevos",
+        "nunca",
+        "o",
+        "ocho",
+        "os",
+        "otra",
+        "otras",
+        "otro",
+        "otros",
+        "p",
+        "pais",
+        "para",
+        "parece",
+        "parte",
+        "partir",
+        "pasada",
+        "pasado",
+        "paìs",
+        "peor",
+        "pero",
+        "pesar",
+        "poca",
+        "pocas",
+        "poco",
+        "pocos",
+        "podeis",
+        "podemos",
+        "poder",
+        "podria",
+        "podriais",
+        "podriamos",
+        "podrian",
+        "podrias",
+        "podrá",
+        "podrán",
+        "podría",
+        "podrían",
+        "poner",
+        "por",
+        "porque",
+        "posible",
+        "primer",
+        "primera",
+        "primero",
+        "primeros",
+        "principalmente",
+        "pronto",
+        "propia",
+        "propias",
+        "propio",
+        "propios",
+        "proximo",
+        "próximo",
+        "próximos",
+        "pudo",
+        "pueda",
+        "puede",
+        "pueden",
+        "puedo",
+        "pues",
+        "q",
+        "qeu",
+        "que",
+        "quedó",
+        "queremos",
+        "quien",
+        "quienes",
+        "quiere",
+        "quiza",
+        "quizas",
+        "quizá",
+        "quizás",
+        "quién",
+        "quiénes",
+        "qué",
+        "r",
+        "raras",
+        "realizado",
+        "realizar",
+        "realizó",
+        "repente",
+        "respecto",
+        "s",
+        "sabe",
+        "sabeis",
+        "sabemos",
+        "saben",
+        "saber",
+        "sabes",
+        "salvo",
+        "se",
+        "sea",
+        "sean",
+        "segun",
+        "segunda",
+        "segundo",
+        "según",
+        "seis",
+        "ser",
+        "sera",
+        "será",
+        "serán",
+        "sería",
+        "señaló",
+        "si",
+        "sido",
+        "siempre",
+        "siendo",
+        "siete",
+        "sigue",
+        "siguiente",
+        "sin",
+        "sino",
+        "sobre",
+        "sois",
+        "sola",
+        "solamente",
+        "solas",
+        "solo",
+        "solos",
+        "somos",
+        "son",
+        "soy",
+        "soyos",
+        "su",
+        "supuesto",
+        "sus",
+        "suya",
+        "suyas",
+        "suyo",
+        "sé",
+        "sí",
+        "sólo",
+        "t",
+        "tal",
+        "tambien",
+        "también",
+        "tampoco",
+        "tan",
+        "tanto",
+        "tarde",
+        "te",
+        "temprano",
+        "tendrá",
+        "tendrán",
+        "teneis",
+        "tenemos",
+        "tener",
+        "tenga",
+        "tengo",
+        "tenido",
+        "tenía",
+        "tercera",
+        "ti",
+        "tiempo",
+        "tiene",
+        "tienen",
+        "toda",
+        "todas",
+        "todavia",
+        "todavía",
+        "todo",
+        "todos",
+        "total",
+        "trabaja",
+        "trabajais",
+        "trabajamos",
+        "trabajan",
+        "trabajar",
+        "trabajas",
+        "trabajo",
+        "tras",
+        "trata",
+        "través",
+        "tres",
+        "tu",
+        "tus",
+        "tuvo",
+        "tuya",
+        "tuyas",
+        "tuyo",
+        "tuyos",
+        "tú",
+        "u",
+        "ultimo",
+        "un",
+        "una",
+        "unas",
+        "uno",
+        "unos",
+        "usa",
+        "usais",
+        "usamos",
+        "usan",
+        "usar",
+        "usas",
+        "uso",
+        "usted",
+        "ustedes",
+        "v",
+        "va",
+        "vais",
+        "valor",
+        "vamos",
+        "van",
+        "varias",
+        "varios",
+        "vaya",
+        "veces",
+        "ver",
+        "verdad",
+        "verdadera",
+        "verdadero",
+        "vez",
+        "vosotras",
+        "vosotros",
+        "voy",
+        "vuestra",
+        "vuestras",
+        "vuestro",
+        "vuestros",
+        "w",
+        "x",
+        "y",
+        "ya",
+        "yo",
+        "z",
+        "él",
+        "ésa",
+        "ésas",
+        "ése",
+        "ésos",
+        "ésta",
+        "éstas",
+        "éste",
+        "éstos",
+        "última",
+        "últimas",
+        "último",
+        "últimos",
+    ],
+    "eu": [
+        "al",
+        "anitz",
+        "arabera",
+        "asko",
+        "baina",
+        "bat",
+        "batean",
+        "batek",
+        "bati",
+        "batzuei",
+        "batzuek",
+        "batzuetan",
+        "batzuk",
+        "bera",
+        "beraiek",
+        "berau",
+        "berauek",
+        "bere",
+        "berori",
+        "beroriek",
+        "beste",
+        "bezala",
+        "da",
+        "dago",
+        "dira",
+        "ditu",
+        "du",
+        "dute",
+        "edo",
+        "egin",
+        "ere",
+        "eta",
+        "eurak",
+        "ez",
+        "gainera",
+        "gu",
+        "gutxi",
+        "guzti",
+        "haiei",
+        "haiek",
+        "haietan",
+        "hainbeste",
+        "hala",
+        "han",
+        "handik",
+        "hango",
+        "hara",
+        "hari",
+        "hark",
+        "hartan",
+        "hau",
+        "hauei",
+        "hauek",
+        "hauetan",
+        "hemen",
+        "hemendik",
+        "hemengo",
+        "hi",
+        "hona",
+        "honek",
+        "honela",
+        "honetan",
+        "honi",
+        "hor",
+        "hori",
+        "horiei",
+        "horiek",
+        "horietan",
+        "horko",
+        "horra",
+        "horrek",
+        "horrela",
+        "horretan",
+        "horri",
+        "hortik",
+        "hura",
+        "izan",
+        "ni",
+        "noiz",
+        "nola",
+        "non",
+        "nondik",
+        "nongo",
+        "nor",
+        "nora",
+        "ze",
+        "zein",
+        "zen",
+        "zenbait",
+        "zenbat",
+        "zer",
+        "zergatik",
+        "ziren",
+        "zituen",
+        "zu",
+        "zuek",
+        "zuen",
+        "zuten",
+    ],
+    "fr": [
+        "a",
+        "abord",
+        "absolument",
+        "afin",
+        "ah",
+        "ai",
+        "aie",
+        "ailleurs",
+        "ainsi",
+        "ait",
+        "allaient",
+        "allo",
+        "allons",
+        "allô",
+        "alors",
+        "anterieur",
+        "anterieure",
+        "anterieures",
+        "apres",
+        "après",
+        "as",
+        "assez",
+        "attendu",
+        "au",
+        "aucun",
+        "aucune",
+        "aujourd",
+        "aujourd'hui",
+        "aupres",
+        "auquel",
+        "aura",
+        "auraient",
+        "aurait",
+        "auront",
+        "aussi",
+        "autre",
+        "autrefois",
+        "autrement",
+        "autres",
+        "autrui",
+        "aux",
+        "auxquelles",
+        "auxquels",
+        "avaient",
+        "avais",
+        "avait",
+        "avant",
+        "avec",
+        "avoir",
+        "avons",
+        "ayant",
+        "b",
+        "bah",
+        "bas",
+        "basee",
+        "bat",
+        "beau",
+        "beaucoup",
+        "bien",
+        "bigre",
+        "boum",
+        "bravo",
+        "brrr",
+        "c",
+        "car",
+        "ce",
+        "ceci",
+        "cela",
+        "celle",
+        "celle-ci",
+        "celle-là",
+        "celles",
+        "celles-ci",
+        "celles-là",
+        "celui",
+        "celui-ci",
+        "celui-là",
+        "cent",
+        "cependant",
+        "certain",
+        "certaine",
+        "certaines",
+        "certains",
+        "certes",
+        "ces",
+        "cet",
+        "cette",
+        "ceux",
+        "ceux-ci",
+        "ceux-là",
+        "chacun",
+        "chacune",
+        "chaque",
+        "cher",
+        "chers",
+        "chez",
+        "chiche",
+        "chut",
+        "chère",
+        "chères",
+        "ci",
+        "cinq",
+        "cinquantaine",
+        "cinquante",
+        "cinquantième",
+        "cinquième",
+        "clac",
+        "clic",
+        "combien",
+        "comme",
+        "comment",
+        "comparable",
+        "comparables",
+        "compris",
+        "concernant",
+        "contre",
+        "couic",
+        "crac",
+        "d",
+        "da",
+        "dans",
+        "de",
+        "debout",
+        "dedans",
+        "dehors",
+        "deja",
+        "delà",
+        "depuis",
+        "dernier",
+        "derniere",
+        "derriere",
+        "derrière",
+        "des",
+        "desormais",
+        "desquelles",
+        "desquels",
+        "dessous",
+        "dessus",
+        "deux",
+        "deuxième",
+        "deuxièmement",
+        "devant",
+        "devers",
+        "devra",
+        "different",
+        "differentes",
+        "differents",
+        "différent",
+        "différente",
+        "différentes",
+        "différents",
+        "dire",
+        "directe",
+        "directement",
+        "dit",
+        "dite",
+        "dits",
+        "divers",
+        "diverse",
+        "diverses",
+        "dix",
+        "dix-huit",
+        "dix-neuf",
+        "dix-sept",
+        "dixième",
+        "doit",
+        "doivent",
+        "donc",
+        "dont",
+        "douze",
+        "douzième",
+        "dring",
+        "du",
+        "duquel",
+        "durant",
+        "dès",
+        "désormais",
+        "e",
+        "effet",
+        "egale",
+        "egalement",
+        "egales",
+        "eh",
+        "elle",
+        "elle-même",
+        "elles",
+        "elles-mêmes",
+        "en",
+        "encore",
+        "enfin",
+        "entre",
+        "envers",
+        "environ",
+        "es",
+        "est",
+        "et",
+        "etant",
+        "etc",
+        "etre",
+        "eu",
+        "euh",
+        "eux",
+        "eux-mêmes",
+        "exactement",
+        "excepté",
+        "extenso",
+        "exterieur",
+        "f",
+        "fais",
+        "faisaient",
+        "faisant",
+        "fait",
+        "façon",
+        "feront",
+        "fi",
+        "flac",
+        "floc",
+        "font",
+        "g",
+        "gens",
+        "h",
+        "ha",
+        "hein",
+        "hem",
+        "hep",
+        "hi",
+        "ho",
+        "holà",
+        "hop",
+        "hormis",
+        "hors",
+        "hou",
+        "houp",
+        "hue",
+        "hui",
+        "huit",
+        "huitième",
+        "hum",
+        "hurrah",
+        "hé",
+        "hélas",
+        "i",
+        "il",
+        "ils",
+        "importe",
+        "j",
+        "je",
+        "jusqu",
+        "jusque",
+        "juste",
+        "k",
+        "l",
+        "la",
+        "laisser",
+        "laquelle",
+        "las",
+        "le",
+        "lequel",
+        "les",
+        "lesquelles",
+        "lesquels",
+        "leur",
+        "leurs",
+        "longtemps",
+        "lors",
+        "lorsque",
+        "lui",
+        "lui-meme",
+        "lui-même",
+        "là",
+        "lès",
+        "m",
+        "ma",
+        "maint",
+        "maintenant",
+        "mais",
+        "malgre",
+        "malgré",
+        "maximale",
+        "me",
+        "meme",
+        "memes",
+        "merci",
+        "mes",
+        "mien",
+        "mienne",
+        "miennes",
+        "miens",
+        "mille",
+        "mince",
+        "minimale",
+        "moi",
+        "moi-meme",
+        "moi-même",
+        "moindres",
+        "moins",
+        "mon",
+        "moyennant",
+        "multiple",
+        "multiples",
+        "même",
+        "mêmes",
+        "n",
+        "na",
+        "naturel",
+        "naturelle",
+        "naturelles",
+        "ne",
+        "neanmoins",
+        "necessaire",
+        "necessairement",
+        "neuf",
+        "neuvième",
+        "ni",
+        "nombreuses",
+        "nombreux",
+        "non",
+        "nos",
+        "notamment",
+        "notre",
+        "nous",
+        "nous-mêmes",
+        "nouveau",
+        "nul",
+        "néanmoins",
+        "nôtre",
+        "nôtres",
+        "o",
+        "oh",
+        "ohé",
+        "ollé",
+        "olé",
+        "on",
+        "ont",
+        "onze",
+        "onzième",
+        "ore",
+        "ou",
+        "ouf",
+        "ouias",
+        "oust",
+        "ouste",
+        "outre",
+        "ouvert",
+        "ouverte",
+        "ouverts",
+        "o|",
+        "où",
+        "p",
+        "paf",
+        "pan",
+        "par",
+        "parce",
+        "parfois",
+        "parle",
+        "parlent",
+        "parler",
+        "parmi",
+        "parseme",
+        "partant",
+        "particulier",
+        "particulière",
+        "particulièrement",
+        "pas",
+        "passé",
+        "pendant",
+        "pense",
+        "permet",
+        "personne",
+        "peu",
+        "peut",
+        "peuvent",
+        "peux",
+        "pff",
+        "pfft",
+        "pfut",
+        "pif",
+        "pire",
+        "plein",
+        "plouf",
+        "plus",
+        "plusieurs",
+        "plutôt",
+        "possessif",
+        "possessifs",
+        "possible",
+        "possibles",
+        "pouah",
+        "pour",
+        "pourquoi",
+        "pourrais",
+        "pourrait",
+        "pouvait",
+        "prealable",
+        "precisement",
+        "premier",
+        "première",
+        "premièrement",
+        "pres",
+        "probable",
+        "probante",
+        "procedant",
+        "proche",
+        "près",
+        "psitt",
+        "pu",
+        "puis",
+        "puisque",
+        "pur",
+        "pure",
+        "q",
+        "qu",
+        "quand",
+        "quant",
+        "quant-à-soi",
+        "quanta",
+        "quarante",
+        "quatorze",
+        "quatre",
+        "quatre-vingt",
+        "quatrième",
+        "quatrièmement",
+        "que",
+        "quel",
+        "quelconque",
+        "quelle",
+        "quelles",
+        "quelqu'un",
+        "quelque",
+        "quelques",
+        "quels",
+        "qui",
+        "quiconque",
+        "quinze",
+        "quoi",
+        "quoique",
+        "r",
+        "rare",
+        "rarement",
+        "rares",
+        "relative",
+        "relativement",
+        "remarquable",
+        "rend",
+        "rendre",
+        "restant",
+        "reste",
+        "restent",
+        "restrictif",
+        "retour",
+        "revoici",
+        "revoilà",
+        "rien",
+        "s",
+        "sa",
+        "sacrebleu",
+        "sait",
+        "sans",
+        "sapristi",
+        "sauf",
+        "se",
+        "sein",
+        "seize",
+        "selon",
+        "semblable",
+        "semblaient",
+        "semble",
+        "semblent",
+        "sent",
+        "sept",
+        "septième",
+        "sera",
+        "seraient",
+        "serait",
+        "seront",
+        "ses",
+        "seul",
+        "seule",
+        "seulement",
+        "si",
+        "sien",
+        "sienne",
+        "siennes",
+        "siens",
+        "sinon",
+        "six",
+        "sixième",
+        "soi",
+        "soi-même",
+        "soit",
+        "soixante",
+        "son",
+        "sont",
+        "sous",
+        "souvent",
+        "specifique",
+        "specifiques",
+        "speculatif",
+        "stop",
+        "strictement",
+        "subtiles",
+        "suffisant",
+        "suffisante",
+        "suffit",
+        "suis",
+        "suit",
+        "suivant",
+        "suivante",
+        "suivantes",
+        "suivants",
+        "suivre",
+        "superpose",
+        "sur",
+        "surtout",
+        "t",
+        "ta",
+        "tac",
+        "tant",
+        "tardive",
+        "te",
+        "tel",
+        "telle",
+        "tellement",
+        "telles",
+        "tels",
+        "tenant",
+        "tend",
+        "tenir",
+        "tente",
+        "tes",
+        "tic",
+        "tien",
+        "tienne",
+        "tiennes",
+        "tiens",
+        "toc",
+        "toi",
+        "toi-même",
+        "ton",
+        "touchant",
+        "toujours",
+        "tous",
+        "tout",
+        "toute",
+        "toutefois",
+        "toutes",
+        "treize",
+        "trente",
+        "tres",
+        "trois",
+        "troisième",
+        "troisièmement",
+        "trop",
+        "très",
+        "tsoin",
+        "tsouin",
+        "tu",
+        "té",
+        "u",
+        "un",
+        "une",
+        "unes",
+        "uniformement",
+        "unique",
+        "uniques",
+        "uns",
+        "v",
+        "va",
+        "vais",
+        "vas",
+        "vers",
+        "via",
+        "vif",
+        "vifs",
+        "vingt",
+        "vivat",
+        "vive",
+        "vives",
+        "vlan",
+        "voici",
+        "voilà",
+        "vont",
+        "vos",
+        "votre",
+        "vous",
+        "vous-mêmes",
+        "vu",
+        "vé",
+        "vôtre",
+        "vôtres",
+        "w",
+        "x",
+        "y",
+        "z",
+        "zut",
+        "à",
+        "â",
+        "ça",
+        "ès",
+        "étaient",
+        "étais",
+        "était",
+        "étant",
+        "été",
+        "être",
+        "ô",
+    ],
+    "hi": [
+        "अंदर",
+        "अत",
+        "अदि",
+        "अप",
+        "अपना",
+        "अपनि",
+        "अपनी",
+        "अपने",
+        "अभि",
+        "अभी",
+        "आदि",
+        "आप",
+        "इंहिं",
+        "इंहें",
+        "इंहों",
+        "इतयादि",
+        "इत्यादि",
+        "इन",
+        "इनका",
+        "इन्हीं",
+        "इन्हें",
+        "इन्हों",
+        "इस",
+        "इसका",
+        "इसकि",
+        "इसकी",
+        "इसके",
+        "इसमें",
+        "इसि",
+        "इसी",
+        "इसे",
+        "उंहिं",
+        "उंहें",
+        "उंहों",
+        "उन",
+        "उनका",
+        "उनकि",
+        "उनकी",
+        "उनके",
+        "उनको",
+        "उन्हीं",
+        "उन्हें",
+        "उन्हों",
+        "उस",
+        "उसके",
+        "उसि",
+        "उसी",
+        "उसे",
+        "एक",
+        "एवं",
+        "एस",
+        "एसे",
+        "ऐसे",
+        "ओर",
+        "और",
+        "कइ",
+        "कई",
+        "कर",
+        "करता",
+        "करते",
+        "करना",
+        "करने",
+        "करें",
+        "कहते",
+        "कहा",
+        "का",
+        "काफि",
+        "काफ़ी",
+        "कि",
+        "किंहें",
+        "किंहों",
+        "कितना",
+        "किन्हें",
+        "किन्हों",
+        "किया",
+        "किर",
+        "किस",
+        "किसि",
+        "किसी",
+        "किसे",
+        "की",
+        "कुछ",
+        "कुल",
+        "के",
+        "को",
+        "कोइ",
+        "कोई",
+        "कोन",
+        "कोनसा",
+        "कौन",
+        "कौनसा",
+        "गया",
+        "घर",
+        "जब",
+        "जहाँ",
+        "जहां",
+        "जा",
+        "जिंहें",
+        "जिंहों",
+        "जितना",
+        "जिधर",
+        "जिन",
+        "जिन्हें",
+        "जिन्हों",
+        "जिस",
+        "जिसे",
+        "जीधर",
+        "जेसा",
+        "जेसे",
+        "जैसा",
+        "जैसे",
+        "जो",
+        "तक",
+        "तब",
+        "तरह",
+        "तिंहें",
+        "तिंहों",
+        "तिन",
+        "तिन्हें",
+        "तिन्हों",
+        "तिस",
+        "तिसे",
+        "तो",
+        "था",
+        "थि",
+        "थी",
+        "थे",
+        "दबारा",
+        "दवारा",
+        "दिया",
+        "दुसरा",
+        "दुसरे",
+        "दूसरे",
+        "दो",
+        "द्वारा",
+        "न",
+        "नहिं",
+        "नहीं",
+        "ना",
+        "निचे",
+        "निहायत",
+        "नीचे",
+        "ने",
+        "पर",
+        "पहले",
+        "पुरा",
+        "पूरा",
+        "पे",
+        "फिर",
+        "बनि",
+        "बनी",
+        "बहि",
+        "बही",
+        "बहुत",
+        "बाद",
+        "बाला",
+        "बिलकुल",
+        "भि",
+        "भितर",
+        "भी",
+        "भीतर",
+        "मगर",
+        "मानो",
+        "मे",
+        "में",
+        "यदि",
+        "यह",
+        "यहाँ",
+        "यहां",
+        "यहि",
+        "यही",
+        "या",
+        "यिह",
+        "ये",
+        "रखें",
+        "रवासा",
+        "रहा",
+        "रहे",
+        "ऱ्वासा",
+        "लिए",
+        "लिये",
+        "लेकिन",
+        "व",
+        "वगेरह",
+        "वरग",
+        "वर्ग",
+        "वह",
+        "वहाँ",
+        "वहां",
+        "वहिं",
+        "वहीं",
+        "वाले",
+        "वुह",
+        "वे",
+        "वग़ैरह",
+        "संग",
+        "सकता",
+        "सकते",
+        "सबसे",
+        "सभि",
+        "सभी",
+        "साथ",
+        "साबुत",
+        "साभ",
+        "सारा",
+        "से",
+        "सो",
+        "हि",
+        "ही",
+        "हुअ",
+        "हुआ",
+        "हुइ",
+        "हुई",
+        "हुए",
+        "हे",
+        "हें",
+        "है",
+        "हैं",
+        "हो",
+        "होता",
+        "होति",
+        "होती",
+        "होते",
+        "होना",
+        "होने",
+    ],
+    "id": [
+        "ada",
+        "adalah",
+        "adanya",
+        "adapun",
+        "agak",
+        "agaknya",
+        "agar",
+        "akan",
+        "akankah",
+        "akhirnya",
+        "aku",
+        "akulah",
+        "amat",
+        "amatlah",
+        "anda",
+        "andalah",
+        "antar",
+        "antara",
+        "antaranya",
+        "apa",
+        "apaan",
+        "apabila",
+        "apakah",
+        "apalagi",
+        "apatah",
+        "atau",
+        "ataukah",
+        "ataupun",
+        "bagai",
+        "bagaikan",
+        "bagaimana",
+        "bagaimanakah",
+        "bagaimanapun",
+        "bagi",
+        "bahkan",
+        "bahwa",
+        "bahwasanya",
+        "banyak",
+        "beberapa",
+        "begini",
+        "beginian",
+        "beginikah",
+        "beginilah",
+        "begitu",
+        "begitukah",
+        "begitulah",
+        "begitupun",
+        "belum",
+        "belumlah",
+        "berapa",
+        "berapakah",
+        "berapalah",
+        "berapapun",
+        "bermacam",
+        "bersama",
+        "betulkah",
+        "biasa",
+        "biasanya",
+        "bila",
+        "bilakah",
+        "bisa",
+        "bisakah",
+        "boleh",
+        "bolehkah",
+        "bolehlah",
+        "buat",
+        "bukan",
+        "bukankah",
+        "bukanlah",
+        "bukannya",
+        "cuma",
+        "dahulu",
+        "dalam",
+        "dan",
+        "dapat",
+        "dari",
+        "daripada",
+        "dekat",
+        "demi",
+        "demikian",
+        "demikianlah",
+        "dengan",
+        "depan",
+        "di",
+        "dia",
+        "dialah",
+        "diantara",
+        "diantaranya",
+        "dikarenakan",
+        "dini",
+        "diri",
+        "dirinya",
+        "disini",
+        "disinilah",
+        "dong",
+        "dulu",
+        "enggak",
+        "enggaknya",
+        "entah",
+        "entahlah",
+        "hal",
+        "hampir",
+        "hanya",
+        "hanyalah",
+        "harus",
+        "haruslah",
+        "harusnya",
+        "hendak",
+        "hendaklah",
+        "hendaknya",
+        "hingga",
+        "ia",
+        "ialah",
+        "ibarat",
+        "ingin",
+        "inginkah",
+        "inginkan",
+        "ini",
+        "inikah",
+        "inilah",
+        "itu",
+        "itukah",
+        "itulah",
+        "jangan",
+        "jangankan",
+        "janganlah",
+        "jika",
+        "jikalau",
+        "juga",
+        "justru",
+        "kala",
+        "kalau",
+        "kalaulah",
+        "kalaupun",
+        "kalian",
+        "kami",
+        "kamilah",
+        "kamu",
+        "kamulah",
+        "kan",
+        "kapan",
+        "kapankah",
+        "kapanpun",
+        "karena",
+        "karenanya",
+        "ke",
+        "kecil",
+        "kemudian",
+        "kenapa",
+        "kepada",
+        "kepadanya",
+        "ketika",
+        "khususnya",
+        "kini",
+        "kinilah",
+        "kiranya",
+        "kita",
+        "kitalah",
+        "kok",
+        "lagi",
+        "lagian",
+        "lah",
+        "lain",
+        "lainnya",
+        "lalu",
+        "lama",
+        "lamanya",
+        "lebih",
+        "macam",
+        "maka",
+        "makanya",
+        "makin",
+        "malah",
+        "malahan",
+        "mampu",
+        "mampukah",
+        "mana",
+        "manakala",
+        "manalagi",
+        "masih",
+        "masihkah",
+        "masing",
+        "mau",
+        "maupun",
+        "melainkan",
+        "melalui",
+        "memang",
+        "mengapa",
+        "mereka",
+        "merekalah",
+        "merupakan",
+        "meski",
+        "meskipun",
+        "mungkin",
+        "mungkinkah",
+        "nah",
+        "namun",
+        "nanti",
+        "nantinya",
+        "nyaris",
+        "oleh",
+        "olehnya",
+        "pada",
+        "padahal",
+        "padanya",
+        "paling",
+        "pantas",
+        "para",
+        "pasti",
+        "pastilah",
+        "per",
+        "percuma",
+        "pernah",
+        "pula",
+        "pun",
+        "rupanya",
+        "saat",
+        "saatnya",
+        "saja",
+        "sajalah",
+        "saling",
+        "sama",
+        "sambil",
+        "sampai",
+        "sana",
+        "sangat",
+        "sangatlah",
+        "saya",
+        "sayalah",
+        "se",
+        "sebab",
+        "sebabnya",
+        "sebagai",
+        "sebagaimana",
+        "sebagainya",
+        "sebaliknya",
+        "sebanyak",
+        "sebegini",
+        "sebegitu",
+        "sebelum",
+        "sebelumnya",
+        "sebenarnya",
+        "seberapa",
+        "sebetulnya",
+        "sebisanya",
+        "sebuah",
+        "sedang",
+        "sedangkan",
+        "sedemikian",
+        "sedikit",
+        "sedikitnya",
+        "segala",
+        "segalanya",
+        "segera",
+        "seharusnya",
+        "sehingga",
+        "sejak",
+        "sejenak",
+        "sekali",
+        "sekalian",
+        "sekaligus",
+        "sekalipun",
+        "sekarang",
+        "seketika",
+        "sekiranya",
+        "sekitar",
+        "sekitarnya",
+        "sela",
+        "selagi",
+        "selain",
+        "selaku",
+        "selalu",
+        "selama",
+        "selamanya",
+        "seluruh",
+        "seluruhnya",
+        "semacam",
+        "semakin",
+        "semasih",
+        "semaunya",
+        "sementara",
+        "sempat",
+        "semua",
+        "semuanya",
+        "semula",
+        "sendiri",
+        "sendirinya",
+        "seolah",
+        "seorang",
+        "sepanjang",
+        "sepantasnya",
+        "sepantasnyalah",
+        "seperti",
+        "sepertinya",
+        "sering",
+        "seringnya",
+        "serta",
+        "serupa",
+        "sesaat",
+        "sesama",
+        "sesegera",
+        "sesekali",
+        "seseorang",
+        "sesuatu",
+        "sesuatunya",
+        "sesudah",
+        "sesudahnya",
+        "setelah",
+        "seterusnya",
+        "setiap",
+        "setidaknya",
+        "sewaktu",
+        "siapa",
+        "siapakah",
+        "siapapun",
+        "sini",
+        "sinilah",
+        "suatu",
+        "sudah",
+        "sudahkah",
+        "sudahlah",
+        "supaya",
+        "tadi",
+        "tadinya",
+        "tak",
+        "tanpa",
+        "tapi",
+        "telah",
+        "tentang",
+        "tentu",
+        "tentulah",
+        "tentunya",
+        "terdiri",
+        "terhadap",
+        "terhadapnya",
+        "terlalu",
+        "terlebih",
+        "tersebut",
+        "tersebutlah",
+        "tertentu",
+        "tetapi",
+        "tiap",
+        "tidak",
+        "tidakkah",
+        "tidaklah",
+        "toh",
+        "waduh",
+        "wah",
+        "wahai",
+        "walau",
+        "walaupun",
+        "wong",
+        "yaitu",
+        "yakni",
+        "yang",
+    ],
+    "mr": [
+        "अधिक",
+        "अनेक",
+        "अशी",
+        "असलयाचे",
+        "असलेल्या",
+        "असा",
+        "असून",
+        "असे",
+        "आज",
+        "आणि",
+        "आता",
+        "आपल्या",
+        "आला",
+        "आली",
+        "आले",
+        "आहे",
+        "आहेत",
+        "एक",
+        "एका",
+        "कमी",
+        "करणयात",
+        "करून",
+        "का",
+        "काम",
+        "काय",
+        "काही",
+        "किवा",
+        "की",
+        "केला",
+        "केली",
+        "केले",
+        "कोटी",
+        "गेल्या",
+        "घेऊन",
+        "जात",
+        "झाला",
+        "झाली",
+        "झाले",
+        "झालेल्या",
+        "टा",
+        "डॉ",
+        "तर",
+        "तरी",
+        "तसेच",
+        "ता",
+        "ती",
+        "तीन",
+        "ते",
+        "तो",
+        "त्या",
+        "त्याचा",
+        "त्याची",
+        "त्याच्या",
+        "त्याना",
+        "त्यानी",
+        "त्यामुळे",
+        "त्री",
+        "दिली",
+        "दोन",
+        "न",
+        "नाही",
+        "निर्ण्य",
+        "पण",
+        "पम",
+        "परयतन",
+        "पाटील",
+        "म",
+        "मात्र",
+        "माहिती",
+        "मी",
+        "मुबी",
+        "म्हणजे",
+        "म्हणाले",
+        "म्हणून",
+        "या",
+        "याचा",
+        "याची",
+        "याच्या",
+        "याना",
+        "यानी",
+        "येणार",
+        "येत",
+        "येथील",
+        "येथे",
+        "लाख",
+        "व",
+        "व्यकत",
+        "सर्व",
+        "सागित्ले",
+        "सुरू",
+        "हजार",
+        "हा",
+        "ही",
+        "हे",
+        "होणार",
+        "होत",
+        "होता",
+        "होती",
+        "होते",
+    ],
+    "pt": [
+        "a",
+        "acerca",
+        "adeus",
+        "agora",
+        "ainda",
+        "algmas",
+        "algo",
+        "algumas",
+        "alguns",
+        "ali",
+        "além",
+        "ambos",
+        "ano",
+        "anos",
+        "antes",
+        "ao",
+        "aos",
+        "apenas",
+        "apoio",
+        "apontar",
+        "após",
+        "aquela",
+        "aquelas",
+        "aquele",
+        "aqueles",
+        "aqui",
+        "aquilo",
+        "as",
+        "assim",
+        "através",
+        "atrás",
+        "até",
+        "aí",
+        "baixo",
+        "bastante",
+        "bem",
+        "bom",
+        "breve",
+        "cada",
+        "caminho",
+        "catorze",
+        "cedo",
+        "cento",
+        "certamente",
+        "certeza",
+        "cima",
+        "cinco",
+        "coisa",
+        "com",
+        "como",
+        "comprido",
+        "conhecido",
+        "conselho",
+        "contra",
+        "corrente",
+        "custa",
+        "cá",
+        "da",
+        "daquela",
+        "daquele",
+        "dar",
+        "das",
+        "de",
+        "debaixo",
+        "demais",
+        "dentro",
+        "depois",
+        "desde",
+        "desligado",
+        "dessa",
+        "desse",
+        "desta",
+        "deste",
+        "deve",
+        "devem",
+        "deverá",
+        "dez",
+        "dezanove",
+        "dezasseis",
+        "dezassete",
+        "dezoito",
+        "dia",
+        "diante",
+        "direita",
+        "diz",
+        "dizem",
+        "dizer",
+        "do",
+        "dois",
+        "dos",
+        "doze",
+        "duas",
+        "dá",
+        "dão",
+        "dúvida",
+        "e",
+        "ela",
+        "elas",
+        "ele",
+        "eles",
+        "em",
+        "embora",
+        "enquanto",
+        "entre",
+        "então",
+        "era",
+        "essa",
+        "essas",
+        "esse",
+        "esses",
+        "esta",
+        "estado",
+        "estar",
+        "estará",
+        "estas",
+        "estava",
+        "este",
+        "estes",
+        "esteve",
+        "estive",
+        "estivemos",
+        "estiveram",
+        "estiveste",
+        "estivestes",
+        "estou",
+        "está",
+        "estás",
+        "estão",
+        "eu",
+        "exemplo",
+        "falta",
+        "fará",
+        "favor",
+        "faz",
+        "fazeis",
+        "fazem",
+        "fazemos",
+        "fazer",
+        "fazes",
+        "fazia",
+        "faço",
+        "fez",
+        "fim",
+        "final",
+        "foi",
+        "fomos",
+        "for",
+        "fora",
+        "foram",
+        "forma",
+        "foste",
+        "fostes",
+        "fui",
+        "geral",
+        "grande",
+        "grandes",
+        "grupo",
+        "hoje",
+        "horas",
+        "há",
+        "iniciar",
+        "inicio",
+        "ir",
+        "irá",
+        "isso",
+        "ista",
+        "iste",
+        "isto",
+        "já",
+        "lado",
+        "ligado",
+        "local",
+        "logo",
+        "longe",
+        "lugar",
+        "lá",
+        "maior",
+        "maioria",
+        "maiorias",
+        "mais",
+        "mal",
+        "mas",
+        "me",
+        "meio",
+        "menor",
+        "menos",
+        "meses",
+        "mesmo",
+        "meu",
+        "meus",
+        "mil",
+        "minha",
+        "minhas",
+        "momento",
+        "muito",
+        "muitos",
+        "máximo",
+        "mês",
+        "na",
+        "nada",
+        "naquela",
+        "naquele",
+        "nas",
+        "nem",
+        "nenhuma",
+        "nessa",
+        "nesse",
+        "nesta",
+        "neste",
+        "no",
+        "noite",
+        "nome",
+        "nos",
+        "nossa",
+        "nossas",
+        "nosso",
+        "nossos",
+        "nova",
+        "nove",
+        "novo",
+        "novos",
+        "num",
+        "numa",
+        "nunca",
+        "não",
+        "nível",
+        "nós",
+        "número",
+        "o",
+        "obra",
+        "obrigada",
+        "obrigado",
+        "oitava",
+        "oitavo",
+        "oito",
+        "onde",
+        "ontem",
+        "onze",
+        "os",
+        "ou",
+        "outra",
+        "outras",
+        "outro",
+        "outros",
+        "para",
+        "parece",
+        "parte",
+        "partir",
+        "pegar",
+        "pela",
+        "pelas",
+        "pelo",
+        "pelos",
+        "perto",
+        "pessoas",
+        "pode",
+        "podem",
+        "poder",
+        "poderá",
+        "podia",
+        "ponto",
+        "pontos",
+        "por",
+        "porque",
+        "porquê",
+        "posição",
+        "possivelmente",
+        "posso",
+        "possível",
+        "pouca",
+        "pouco",
+        "povo",
+        "primeira",
+        "primeiro",
+        "promeiro",
+        "próprio",
+        "próximo",
+        "puderam",
+        "pôde",
+        "põe",
+        "põem",
+        "qual",
+        "qualquer",
+        "quando",
+        "quanto",
+        "quarta",
+        "quarto",
+        "quatro",
+        "que",
+        "quem",
+        "quer",
+        "quero",
+        "questão",
+        "quieto",
+        "quinta",
+        "quinto",
+        "quinze",
+        "quê",
+        "relação",
+        "sabe",
+        "saber",
+        "se",
+        "segunda",
+        "segundo",
+        "sei",
+        "seis",
+        "sem",
+        "sempre",
+        "ser",
+        "seria",
+        "sete",
+        "seu",
+        "seus",
+        "sexta",
+        "sexto",
+        "sim",
+        "sistema",
+        "sob",
+        "sobre",
+        "sois",
+        "somente",
+        "somos",
+        "sou",
+        "sua",
+        "suas",
+        "são",
+        "sétima",
+        "sétimo",
+        "tal",
+        "talvez",
+        "também",
+        "tanto",
+        "tarde",
+        "te",
+        "tem",
+        "temos",
+        "tempo",
+        "tendes",
+        "tenho",
+        "tens",
+        "tentar",
+        "tentaram",
+        "tente",
+        "tentei",
+        "ter",
+        "terceira",
+        "terceiro",
+        "teu",
+        "teus",
+        "teve",
+        "tipo",
+        "tive",
+        "tivemos",
+        "tiveram",
+        "tiveste",
+        "tivestes",
+        "toda",
+        "todas",
+        "todo",
+        "todos",
+        "trabalhar",
+        "trabalho",
+        "treze",
+        "três",
+        "tu",
+        "tua",
+        "tuas",
+        "tudo",
+        "tão",
+        "têm",
+        "um",
+        "uma",
+        "umas",
+        "uns",
+        "usa",
+        "usar",
+        "vai",
+        "vais",
+        "valor",
+        "veja",
+        "vem",
+        "vens",
+        "ver",
+        "verdade",
+        "verdadeiro",
+        "vez",
+        "vezes",
+        "viagem",
+        "vindo",
+        "vinte",
+        "você",
+        "vocês",
+        "vos",
+        "vossa",
+        "vossas",
+        "vosso",
+        "vossos",
+        "vários",
+        "vão",
+        "vêm",
+        "vós",
+        "zero",
+        "à",
+        "às",
+        "área",
+        "é",
+        "és",
+        "último",
+    ],
+    "so": [
+        "aad",
+        "albaabkii",
+        "atabo",
+        "ay",
+        "ayaa",
+        "ayee",
+        "ayuu",
+        "dhan",
+        "hadana",
+        "in",
+        "inuu",
+        "isku",
+        "jiray",
+        "jirtay",
+        "ka",
+        "kale",
+        "kasoo",
+        "ku",
+        "kuu",
+        "lakin",
+        "markii",
+        "oo",
+        "si",
+        "soo",
+        "uga",
+        "ugu",
+        "uu",
+        "waa",
+        "waxa",
+        "waxuu",
+    ],
+    "sw": [
+        "akasema",
+        "alikuwa",
+        "alisema",
+        "baada",
+        "basi",
+        "bila",
+        "cha",
+        "chini",
+        "hadi",
+        "hapo",
+        "hata",
+        "hivyo",
+        "hiyo",
+        "huku",
+        "huo",
+        "ili",
+        "ilikuwa",
+        "juu",
+        "kama",
+        "karibu",
+        "katika",
+        "kila",
+        "kima",
+        "kisha",
+        "kubwa",
+        "kutoka",
+        "kuwa",
+        "kwa",
+        "kwamba",
+        "kwenda",
+        "kwenye",
+        "la",
+        "lakini",
+        "mara",
+        "mdogo",
+        "mimi",
+        "mkubwa",
+        "mmoja",
+        "moja",
+        "muda",
+        "mwenye",
+        "na",
+        "naye",
+        "ndani",
+        "ng",
+        "ni",
+        "nini",
+        "nonkungu",
+        "pamoja",
+        "pia",
+        "sana",
+        "sasa",
+        "sauti",
+        "tafadhali",
+        "tena",
+        "tu",
+        "vile",
+        "wa",
+        "wakati",
+        "wake",
+        "walikuwa",
+        "wao",
+        "watu",
+        "wengine",
+        "wote",
+        "ya",
+        "yake",
+        "yangu",
+        "yao",
+        "yeye",
+        "yule",
+        "za",
+        "zaidi",
+        "zake",
+    ],
+    "ur": [
+        "آئی",
+        "آئے",
+        "آج",
+        "آخر",
+        "آخرکبر",
+        "آدهی",
+        "آًب",
+        "آٹھ",
+        "آیب",
+        "اة",
+        "اخبزت",
+        "اختتبم",
+        "ادھر",
+        "ارد",
+        "اردگرد",
+        "ارکبى",
+        "اش",
+        "اضتعوبل",
+        "اضتعوبلات",
+        "اضطرذ",
+        "اضکب",
+        "اضکی",
+        "اضکے",
+        "اطراف",
+        "اغیب",
+        "افراد",
+        "الگ",
+        "اور",
+        "اوًچب",
+        "اوًچبئی",
+        "اوًچی",
+        "اوًچے",
+        "اى",
+        "اً",
+        "اًذر",
+        "اًہیں",
+        "اٹھبًب",
+        "اپٌب",
+        "اپٌے",
+        "اچھب",
+        "اچھی",
+        "اچھے",
+        "اکثر",
+        "اکٹھب",
+        "اکٹھی",
+        "اکٹھے",
+        "اکیلا",
+        "اکیلی",
+        "اکیلے",
+        "اگرچہ",
+        "اہن",
+        "ایطے",
+        "ایک",
+        "ب",
+        "ت",
+        "تبزٍ",
+        "تت",
+        "تر",
+        "ترتیت",
+        "تریي",
+        "تعذاد",
+        "تن",
+        "تو",
+        "توبم",
+        "توہی",
+        "توہیں",
+        "تٌہب",
+        "تک",
+        "تھب",
+        "تھوڑا",
+        "تھوڑی",
+        "تھوڑے",
+        "تھی",
+        "تھے",
+        "تیي",
+        "ثب",
+        "ثبئیں",
+        "ثبترتیت",
+        "ثبری",
+        "ثبرے",
+        "ثبعث",
+        "ثبلا",
+        "ثبلترتیت",
+        "ثبہر",
+        "ثدبئے",
+        "ثرآں",
+        "ثراں",
+        "ثرش",
+        "ثعذ",
+        "ثغیر",
+        "ثلٌذ",
+        "ثلٌذوثبلا",
+        "ثلکہ",
+        "ثي",
+        "ثٌب",
+        "ثٌبرہب",
+        "ثٌبرہی",
+        "ثٌبرہے",
+        "ثٌبًب",
+        "ثٌذ",
+        "ثٌذکرو",
+        "ثٌذکرًب",
+        "ثٌذی",
+        "ثڑا",
+        "ثڑوں",
+        "ثڑی",
+        "ثڑے",
+        "ثھر",
+        "ثھرا",
+        "ثھراہوا",
+        "ثھرپور",
+        "ثھی",
+        "ثہت",
+        "ثہتر",
+        "ثہتری",
+        "ثہتریي",
+        "ثیچ",
+        "ج",
+        "خب",
+        "خبرہب",
+        "خبرہی",
+        "خبرہے",
+        "خبهوظ",
+        "خبًب",
+        "خبًتب",
+        "خبًتی",
+        "خبًتے",
+        "خبًٌب",
+        "خت",
+        "ختن",
+        "خجکہ",
+        "خص",
+        "خططرذ",
+        "خلذی",
+        "خو",
+        "خواى",
+        "خوًہی",
+        "خوکہ",
+        "خٌبة",
+        "خگہ",
+        "خگہوں",
+        "خگہیں",
+        "خیطب",
+        "خیطبکہ",
+        "در",
+        "درخبت",
+        "درخہ",
+        "درخے",
+        "درزقیقت",
+        "درضت",
+        "دش",
+        "دفعہ",
+        "دلچطپ",
+        "دلچطپی",
+        "دلچطپیبں",
+        "دو",
+        "دور",
+        "دوراى",
+        "دوضرا",
+        "دوضروں",
+        "دوضری",
+        "دوضرے",
+        "دوًوں",
+        "دکھبئیں",
+        "دکھبتب",
+        "دکھبتی",
+        "دکھبتے",
+        "دکھبو",
+        "دکھبًب",
+        "دکھبیب",
+        "دی",
+        "دیب",
+        "دیتب",
+        "دیتی",
+        "دیتے",
+        "دیر",
+        "دیٌب",
+        "دیکھو",
+        "دیکھٌب",
+        "دیکھی",
+        "دیکھیں",
+        "دے",
+        "ر",
+        "راضتوں",
+        "راضتہ",
+        "راضتے",
+        "رریعہ",
+        "رریعے",
+        "رکي",
+        "رکھ",
+        "رکھب",
+        "رکھتب",
+        "رکھتبہوں",
+        "رکھتی",
+        "رکھتے",
+        "رکھی",
+        "رکھے",
+        "رہب",
+        "رہی",
+        "رہے",
+        "ز",
+        "زبصل",
+        "زبضر",
+        "زبل",
+        "زبلات",
+        "زبلیہ",
+        "زصوں",
+        "زصہ",
+        "زصے",
+        "زقبئق",
+        "زقیتیں",
+        "زقیقت",
+        "زکن",
+        "زکویہ",
+        "زیبدٍ",
+        "صبف",
+        "صسیر",
+        "صفر",
+        "صورت",
+        "صورتسبل",
+        "صورتوں",
+        "صورتیں",
+        "ض",
+        "ضبت",
+        "ضبتھ",
+        "ضبدٍ",
+        "ضبرا",
+        "ضبرے",
+        "ضبل",
+        "ضبلوں",
+        "ضت",
+        "ضرور",
+        "ضرورت",
+        "ضروری",
+        "ضلطلہ",
+        "ضوچ",
+        "ضوچب",
+        "ضوچتب",
+        "ضوچتی",
+        "ضوچتے",
+        "ضوچو",
+        "ضوچٌب",
+        "ضوچی",
+        "ضوچیں",
+        "ضکب",
+        "ضکتب",
+        "ضکتی",
+        "ضکتے",
+        "ضکٌب",
+        "ضکی",
+        "ضکے",
+        "ضیذھب",
+        "ضیذھی",
+        "ضیذھے",
+        "ضیکٌڈ",
+        "ضے",
+        "طرف",
+        "طریق",
+        "طریقوں",
+        "طریقہ",
+        "طریقے",
+        "طور",
+        "طورپر",
+        "ظبہر",
+        "ع",
+        "عذد",
+        "عظین",
+        "علاقوں",
+        "علاقہ",
+        "علاقے",
+        "علاوٍ",
+        "عووهی",
+        "غبیذ",
+        "غخص",
+        "غذ",
+        "غروع",
+        "غروعبت",
+        "غے",
+        "فرد",
+        "فی",
+        "ق",
+        "قجل",
+        "قجیلہ",
+        "قطن",
+        "لئے",
+        "لا",
+        "لازهی",
+        "لو",
+        "لوجب",
+        "لوجی",
+        "لوجے",
+        "لوسبت",
+        "لوسہ",
+        "لوگ",
+        "لوگوں",
+        "لڑکپي",
+        "لگتب",
+        "لگتی",
+        "لگتے",
+        "لگٌب",
+        "لگی",
+        "لگیں",
+        "لگے",
+        "لی",
+        "لیب",
+        "لیٌب",
+        "لیں",
+        "لے",
+        "ه",
+        "هتعلق",
+        "هختلف",
+        "هسترم",
+        "هسترهہ",
+        "هسطوش",
+        "هسیذ",
+        "هطئلہ",
+        "هطئلے",
+        "هطبئل",
+        "هطتعول",
+        "هطلق",
+        "هعلوم",
+        "هػتول",
+        "هلا",
+        "هوکي",
+        "هوکٌبت",
+        "هوکٌہ",
+        "هٌبضت",
+        "هڑا",
+        "هڑًب",
+        "هڑے",
+        "هکول",
+        "هگر",
+        "هہرثبى",
+        "هیرا",
+        "هیری",
+        "هیرے",
+        "هیں",
+        "و",
+        "وار",
+        "والے",
+        "وٍ",
+        "ًئی",
+        "ًئے",
+        "ًب",
+        "ًبپطٌذ",
+        "ًبگسیر",
+        "ًطجت",
+        "ًقطہ",
+        "ًو",
+        "ًوخواى",
+        "ًکبلٌب",
+        "ًکتہ",
+        "ًہ",
+        "ًہیں",
+        "ًیب",
+        "ًے",
+        "ٓ آش",
+        "ٹھیک",
+        "پبئے",
+        "پبش",
+        "پبًب",
+        "پبًچ",
+        "پر",
+        "پراًب",
+        "پطٌذ",
+        "پل",
+        "پورا",
+        "پوچھب",
+        "پوچھتب",
+        "پوچھتی",
+        "پوچھتے",
+        "پوچھو",
+        "پوچھوں",
+        "پوچھٌب",
+        "پوچھیں",
+        "پچھلا",
+        "پھر",
+        "پہلا",
+        "پہلی",
+        "پہلےضی",
+        "پہلےضے",
+        "پہلےضےہی",
+        "پیع",
+        "چبر",
+        "چبہب",
+        "چبہٌب",
+        "چبہے",
+        "چلا",
+        "چلو",
+        "چلیں",
+        "چلے",
+        "چکب",
+        "چکی",
+        "چکیں",
+        "چکے",
+        "چھوٹب",
+        "چھوٹوں",
+        "چھوٹی",
+        "چھوٹے",
+        "چھہ",
+        "چیسیں",
+        "ڈھوًڈا",
+        "ڈھوًڈلیب",
+        "ڈھوًڈو",
+        "ڈھوًڈًب",
+        "ڈھوًڈی",
+        "ڈھوًڈیں",
+        "ک",
+        "کئی",
+        "کئے",
+        "کب",
+        "کبفی",
+        "کبم",
+        "کت",
+        "کجھی",
+        "کرا",
+        "کرتب",
+        "کرتبہوں",
+        "کرتی",
+        "کرتے",
+        "کرتےہو",
+        "کررہب",
+        "کررہی",
+        "کررہے",
+        "کرو",
+        "کرًب",
+        "کریں",
+        "کرے",
+        "کطی",
+        "کل",
+        "کن",
+        "کوئی",
+        "کوتر",
+        "کورا",
+        "کوروں",
+        "کورٍ",
+        "کورے",
+        "کوطي",
+        "کوى",
+        "کوًطب",
+        "کوًطی",
+        "کوًطے",
+        "کھولا",
+        "کھولو",
+        "کھولٌب",
+        "کھولی",
+        "کھولیں",
+        "کھولے",
+        "کہ",
+        "کہب",
+        "کہتب",
+        "کہتی",
+        "کہتے",
+        "کہو",
+        "کہوں",
+        "کہٌب",
+        "کہی",
+        "کہیں",
+        "کہے",
+        "کی",
+        "کیب",
+        "کیطب",
+        "کیطرف",
+        "کیطے",
+        "کیلئے",
+        "کیوًکہ",
+        "کیوں",
+        "کیے",
+        "کے",
+        "کےثعذ",
+        "کےرریعے",
+        "گئی",
+        "گئے",
+        "گب",
+        "گرد",
+        "گروٍ",
+        "گروپ",
+        "گروہوں",
+        "گٌتی",
+        "گی",
+        "گیب",
+        "گے",
+        "ہر",
+        "ہن",
+        "ہو",
+        "ہوئی",
+        "ہوئے",
+        "ہوا",
+        "ہوبرا",
+        "ہوبری",
+        "ہوبرے",
+        "ہوتب",
+        "ہوتی",
+        "ہوتے",
+        "ہورہب",
+        "ہورہی",
+        "ہورہے",
+        "ہوضکتب",
+        "ہوضکتی",
+        "ہوضکتے",
+        "ہوًب",
+        "ہوًی",
+        "ہوًے",
+        "ہوچکب",
+        "ہوچکی",
+        "ہوچکے",
+        "ہوگئی",
+        "ہوگئے",
+        "ہوگیب",
+        "ہوں",
+        "ہی",
+        "ہیں",
+        "ہے",
+        "ی",
+        "یقیٌی",
+        "یہ",
+        "یہبں",
+    ],
+    "vi": [
+        "a ha",
+        "a-lô",
+        "ai",
+        "ai ai",
+        "ai nấy",
+        "alô",
+        "amen",
+        "anh",
+        "bao giờ",
+        "bao lâu",
+        "bao nhiêu",
+        "bao nả",
+        "bay biến",
+        "biết",
+        "biết bao",
+        "biết bao nhiêu",
+        "biết chừng nào",
+        "biết mấy",
+        "biết đâu",
+        "biết đâu chừng",
+        "biết đâu đấy",
+        "bà",
+        "bài",
+        "bác",
+        "bây bẩy",
+        "bây chừ",
+        "bây giờ",
+        "bây nhiêu",
+        "bèn",
+        "béng",
+        "bông",
+        "bạn",
+        "bản",
+        "bất chợt",
+        "bất cứ",
+        "bất giác",
+        "bất kì",
+        "bất kể",
+        "bất kỳ",
+        "bất luận",
+        "bất nhược",
+        "bất quá",
+        "bất thình lình",
+        "bất tử",
+        "bất đồ",
+        "bấy",
+        "bấy chầy",
+        "bấy chừ",
+        "bấy giờ",
+        "bấy lâu",
+        "bấy lâu nay",
+        "bấy nay",
+        "bấy nhiêu",
+        "bập bà bập bõm",
+        "bập bõm",
+        "bắt đầu từ",
+        "bằng",
+        "bằng không",
+        "bằng nấy",
+        "bằng ấy",
+        "bển",
+        "bệt",
+        "bị",
+        "bỏ mẹ",
+        "bỗng",
+        "bỗng chốc",
+        "bỗng dưng",
+        "bỗng không",
+        "bỗng nhiên",
+        "bỗng đâu",
+        "bộ",
+        "bội phần",
+        "bớ",
+        "bởi",
+        "bởi chưng",
+        "bởi nhưng",
+        "bởi thế",
+        "bởi vì",
+        "bởi vậy",
+        "bức",
+        "cao",
+        "cha",
+        "cha chả",
+        "chao ôi",
+        "chiếc",
+        "cho",
+        "cho nên",
+        "cho tới",
+        "cho tới khi",
+        "cho đến",
+        "cho đến khi",
+        "choa",
+        "chu cha",
+        "chui cha",
+        "chung cục",
+        "chung qui",
+        "chung quy",
+        "chung quy lại",
+        "chuyện",
+        "chành chạnh",
+        "chí chết",
+        "chính",
+        "chính là",
+        "chính thị",
+        "chùn chùn",
+        "chùn chũn",
+        "chú",
+        "chú mày",
+        "chú mình",
+        "chúng mình",
+        "chúng ta",
+        "chúng tôi",
+        "chăn chắn",
+        "chăng",
+        "chưa",
+        "chầm chập",
+        "chậc",
+        "chắc",
+        "chắc hẳn",
+        "chẳng lẽ",
+        "chẳng những",
+        "chẳng nữa",
+        "chẳng phải",
+        "chết nỗi",
+        "chết thật",
+        "chết tiệt",
+        "chỉ",
+        "chỉn",
+        "chốc chốc",
+        "chớ",
+        "chớ chi",
+        "chợt",
+        "chủn",
+        "chứ",
+        "chứ lị",
+        "coi bộ",
+        "coi mòi",
+        "con",
+        "cu cậu",
+        "cuốn",
+        "cuộc",
+        "càng",
+        "các",
+        "cái",
+        "cây",
+        "còn",
+        "có",
+        "có chăng là",
+        "có dễ",
+        "có thể",
+        "có vẻ",
+        "cóc khô",
+        "cô",
+        "cô mình",
+        "công nhiên",
+        "cùng",
+        "cùng cực",
+        "cùng nhau",
+        "cùng với",
+        "căn",
+        "căn cắt",
+        "cũng",
+        "cũng như",
+        "cũng vậy",
+        "cũng vậy thôi",
+        "cơ",
+        "cơ chừng",
+        "cơ hồ",
+        "cơ mà",
+        "cơn",
+        "cả",
+        "cả thảy",
+        "cả thể",
+        "cảm ơn",
+        "cần",
+        "cật lực",
+        "cật sức",
+        "cậu",
+        "cổ lai",
+        "của",
+        "cứ",
+        "cứ việc",
+        "cực lực",
+        "do",
+        "do vì",
+        "do vậy",
+        "do đó",
+        "duy",
+        "dào",
+        "dì",
+        "dù cho",
+        "dù rằng",
+        "dưới",
+        "dạ",
+        "dần dà",
+        "dần dần",
+        "dầu sao",
+        "dẫu",
+        "dẫu sao",
+        "dễ sợ",
+        "dễ thường",
+        "dở chừng",
+        "dữ",
+        "em",
+        "giữa",
+        "gì",
+        "hay",
+        "hoàn toàn",
+        "hoặc",
+        "hơn",
+        "hầu hết",
+        "họ",
+        "hỏi",
+        "khi",
+        "khác",
+        "không",
+        "luôn",
+        "là",
+        "làm",
+        "lên",
+        "lúc",
+        "lại",
+        "lần",
+        "lớn",
+        "muốn",
+        "mà",
+        "mình",
+        "mỗi",
+        "một",
+        "một cách",
+        "mới",
+        "mợ",
+        "ngay",
+        "ngay cả",
+        "ngay khi",
+        "ngay lúc",
+        "ngay lập tức",
+        "ngay tức khắc",
+        "ngay từ",
+        "nghe chừng",
+        "nghe đâu",
+        "nghen",
+        "nghiễm nhiên",
+        "nghỉm",
+        "ngoài",
+        "ngoài ra",
+        "ngoải",
+        "ngày",
+        "ngày càng",
+        "ngày ngày",
+        "ngày xưa",
+        "ngày xửa",
+        "ngôi",
+        "ngõ hầu",
+        "ngăn ngắt",
+        "ngươi",
+        "người",
+        "ngọn",
+        "ngọt",
+        "ngộ nhỡ",
+        "nh",
+        "nhau",
+        "nhiên hậu",
+        "nhiều",
+        "nhiệt liệt",
+        "nhung nhăng",
+        "nhà",
+        "nhân dịp",
+        "nhân tiện",
+        "nhé",
+        "nhón nhén",
+        "như",
+        "như chơi",
+        "như không",
+        "như quả",
+        "như thể",
+        "như tuồng",
+        "như vậy",
+        "nhưng",
+        "nhưng mà",
+        "nhược bằng",
+        "nhất",
+        "nhất loạt",
+        "nhất luật",
+        "nhất mực",
+        "nhất nhất",
+        "nhất quyết",
+        "nhất sinh",
+        "nhất thiết",
+        "nhất tâm",
+        "nhất tề",
+        "nhất đán",
+        "nhất định",
+        "nhận",
+        "nhỉ",
+        "nhỡ ra",
+        "những",
+        "những ai",
+        "những như",
+        "nào",
+        "này",
+        "nên",
+        "nên chi",
+        "nó",
+        "nóc",
+        "nói",
+        "năm",
+        "nơi",
+        "nấy",
+        "nếu",
+        "nếu như",
+        "nền",
+        "nọ",
+        "nớ",
+        "nức nở",
+        "nữa",
+        "oai oái",
+        "oái",
+        "pho",
+        "phè",
+        "phóc",
+        "phót",
+        "phăn phắt",
+        "phương chi",
+        "phải",
+        "phải chi",
+        "phải chăng",
+        "phắt",
+        "phỉ phui",
+        "phỏng",
+        "phỏng như",
+        "phốc",
+        "phụt",
+        "phứt",
+        "qua",
+        "qua quít",
+        "qua quýt",
+        "quyết",
+        "quyết nhiên",
+        "quyển",
+        "quá",
+        "quá chừng",
+        "quá lắm",
+        "quá sá",
+        "quá thể",
+        "quá trời",
+        "quá xá",
+        "quá đỗi",
+        "quá độ",
+        "quá ư",
+        "quý hồ",
+        "quả",
+        "quả là",
+        "quả tang",
+        "quả thật",
+        "quả tình",
+        "quả vậy",
+        "quả đúng",
+        "ra",
+        "ra phết",
+        "ra sao",
+        "ra trò",
+        "ren rén",
+        "riu ríu",
+        "riêng",
+        "riệt",
+        "rày",
+        "ráo",
+        "ráo trọi",
+        "rén",
+        "rích",
+        "rón rén",
+        "rút cục",
+        "răng",
+        "rất",
+        "rằng",
+        "rằng là",
+        "rốt cuộc",
+        "rốt cục",
+        "rồi",
+        "rứa",
+        "sa sả",
+        "sao",
+        "sau",
+        "sau chót",
+        "sau cuối",
+        "sau cùng",
+        "sau đó",
+        "so",
+        "song le",
+        "suýt",
+        "sì",
+        "sạch",
+        "sất",
+        "sắp",
+        "sẽ",
+        "số",
+        "số là",
+        "sốt sột",
+        "sở dĩ",
+        "sự",
+        "tanh",
+        "tha hồ",
+        "than ôi",
+        "thanh",
+        "theo",
+        "thi thoảng",
+        "thoạt",
+        "thoạt nhiên",
+        "thoắt",
+        "thuần",
+        "thà",
+        "thà là",
+        "thà rằng",
+        "thành ra",
+        "thành thử",
+        "thái quá",
+        "tháng",
+        "thì",
+        "thì thôi",
+        "thình lình",
+        "thím",
+        "thôi",
+        "thúng thắng",
+        "thương ôi",
+        "thường",
+        "thảo hèn",
+        "thảo nào",
+        "thấy",
+        "thẩy",
+        "thậm",
+        "thậm chí",
+        "thật lực",
+        "thật ra",
+        "thật vậy",
+        "thế",
+        "thế là",
+        "thế mà",
+        "thế nào",
+        "thế nên",
+        "thế ra",
+        "thế thì",
+        "thế à",
+        "thếch",
+        "thỉnh thoảng",
+        "thỏm",
+        "thốc",
+        "thốc tháo",
+        "thốt",
+        "thốt nhiên",
+        "thộc",
+        "thời gian",
+        "thục mạng",
+        "thửa",
+        "thực ra",
+        "thực sự",
+        "thực vậy",
+        "tiếp theo",
+        "tiếp đó",
+        "tiện thể",
+        "toà",
+        "toé khói",
+        "toẹt",
+        "trong",
+        "trên",
+        "trước",
+        "trước kia",
+        "trước nay",
+        "trước tiên",
+        "trước đây",
+        "trước đó",
+        "trếu tráo",
+        "trển",
+        "trệt",
+        "trệu trạo",
+        "trỏng",
+        "trời đất ơi",
+        "trừ phi",
+        "tuy",
+        "tuy nhiên",
+        "tuy rằng",
+        "tuy thế",
+        "tuy vậy",
+        "tuyệt nhiên",
+        "tuần tự",
+        "tuốt luốt",
+        "tuốt tuồn tuột",
+        "tuốt tuột",
+        "tà tà",
+        "tênh",
+        "tít mù",
+        "tò te",
+        "tôi",
+        "tông tốc",
+        "tù tì",
+        "tăm tắp",
+        "tại",
+        "tại vì",
+        "tấm",
+        "tấn",
+        "tất cả",
+        "tất thảy",
+        "tất tần tật",
+        "tất tật",
+        "tắp",
+        "tắp lự",
+        "tọt",
+        "tỏ ra",
+        "tỏ vẻ",
+        "tốc tả",
+        "tối ư",
+        "tột",
+        "tớ",
+        "tới",
+        "tức thì",
+        "tức tốc",
+        "từ",
+        "từng",
+        "tự vì",
+        "tựu trung",
+        "veo",
+        "veo veo",
+        "việc",
+        "vung thiên địa",
+        "vung tàn tán",
+        "vung tán tàn",
+        "và",
+        "vào",
+        "vâng",
+        "vèo",
+        "vì",
+        "vì chưng",
+        "vì thế",
+        "vì vậy",
+        "ví bằng",
+        "ví dù",
+        "ví phỏng",
+        "ví thử",
+        "vô hình trung",
+        "vô kể",
+        "vô luận",
+        "vô vàn",
+        "văng tê",
+        "vạn nhất",
+        "vả chăng",
+        "vả lại",
+        "vẫn",
+        "vậy",
+        "vậy là",
+        "vậy thì",
+        "về",
+        "vị tất",
+        "vốn dĩ",
+        "với",
+        "với lại",
+        "vở",
+        "vụt",
+        "vừa",
+        "vừa mới",
+        "xa xả",
+        "xiết bao",
+        "xon xón",
+        "xoành xoạch",
+        "xoét",
+        "xoẳn",
+        "xoẹt",
+        "xuất kì bất ý",
+        "xuất kỳ bất ý",
+        "xuể",
+        "xuống",
+        "xăm xúi",
+        "xăm xăm",
+        "xăm xắm",
+        "xềnh xệch",
+        "xệp",
+        "à",
+        "à ơi",
+        "ào",
+        "á",
+        "á à",
+        "ái",
+        "ái chà",
+        "ái dà",
+        "áng",
+        "âu là",
+        "ô hay",
+        "ô hô",
+        "ô kê",
+        "ô kìa",
+        "ôi chao",
+        "ôi thôi",
+        "ông",
+        "úi",
+        "úi chà",
+        "úi dào",
+        "ý",
+        "ý chừng",
+        "ý da",
+        "đang",
+        "đi",
+        "điều",
+        "đành đạch",
+        "đáng lí",
+        "đáng lý",
+        "đáng lẽ",
+        "đánh đùng",
+        "đáo để",
+        "đây",
+        "đã",
+        "đó",
+        "được",
+        "đại loại",
+        "đại nhân",
+        "đại phàm",
+        "đại để",
+        "đến",
+        "đến nỗi",
+        "đều",
+        "để",
+        "ơ",
+        "ơ hay",
+        "ơ kìa",
+        "ơi",
+        "ư",
+        "ạ",
+        "ạ ơi",
+        "ấy",
+        "ầu ơ",
+        "ắt",
+        "ắt hẳn",
+        "ắt là",
+        "ối dào",
+        "ối giời",
+        "ối giời ơi",
+        "ồ",
+        "ổng",
+        "ớ",
+        "ờ",
+        "ở",
+        "ở trên",
+        "ủa",
+        "ứ hự",
+        "ứ ừ",
+        "ừ",
+        "ử",
+    ],
+    "yo": [
+        "a",
+        "an",
+        "bá",
+        "bí",
+        "bẹ̀rẹ̀",
+        "fún",
+        "fẹ́",
+        "gbogbo",
+        "inú",
+        "jù",
+        "jẹ",
+        "jẹ́",
+        "kan",
+        "kì",
+        "kí",
+        "kò",
+        "láti",
+        "lè",
+        "lọ",
+        "mi",
+        "mo",
+        "máa",
+        "mọ̀",
+        "ni",
+        "náà",
+        "ní",
+        "nígbà",
+        "nítorí",
+        "nǹkan",
+        "o",
+        "padà",
+        "pé",
+        "púpọ̀",
+        "pẹ̀lú",
+        "rẹ̀",
+        "sì",
+        "sí",
+        "sínú",
+        "ṣ",
+        "ti",
+        "tí",
+        "wà",
+        "wá",
+        "wọn",
+        "wọ́n",
+        "yìí",
+        "àti",
+        "àwọn",
+        "é",
+        "í",
+        "òun",
+        "ó",
+        "ń",
+        "ńlá",
+        "ṣe",
+        "ṣé",
+        "ṣùgbọ́n",
+        "ẹmọ́",
+        "ọjọ́",
+        "ọ̀pọ̀lọpọ̀",
+    ],
+    "zh": [
+        "、",
+        "。",
+        "〈",
+        "〉",
+        "《",
+        "》",
+        "一",
+        "一切",
+        "一则",
+        "一方面",
+        "一旦",
+        "一来",
+        "一样",
+        "一般",
+        "七",
+        "万一",
+        "三",
+        "上下",
+        "不仅",
+        "不但",
+        "不光",
+        "不单",
+        "不只",
+        "不如",
+        "不怕",
+        "不惟",
+        "不成",
+        "不拘",
+        "不比",
+        "不然",
+        "不特",
+        "不独",
+        "不管",
+        "不论",
+        "不过",
+        "不问",
+        "与",
+        "与其",
+        "与否",
+        "与此同时",
+        "且",
+        "两者",
+        "个",
+        "临",
+        "为",
+        "为了",
+        "为什么",
+        "为何",
+        "为着",
+        "乃",
+        "乃至",
+        "么",
+        "之",
+        "之一",
+        "之所以",
+        "之类",
+        "乌乎",
+        "乎",
+        "乘",
+        "九",
+        "也",
+        "也好",
+        "也罢",
+        "了",
+        "二",
+        "于",
+        "于是",
+        "于是乎",
+        "云云",
+        "五",
+        "人家",
+        "什么",
+        "什么样",
+        "从",
+        "从而",
+        "他",
+        "他人",
+        "他们",
+        "以",
+        "以便",
+        "以免",
+        "以及",
+        "以至",
+        "以至于",
+        "以致",
+        "们",
+        "任",
+        "任何",
+        "任凭",
+        "似的",
+        "但",
+        "但是",
+        "何",
+        "何况",
+        "何处",
+        "何时",
+        "作为",
+        "你",
+        "你们",
+        "使得",
+        "例如",
+        "依",
+        "依照",
+        "俺",
+        "俺们",
+        "倘",
+        "倘使",
+        "倘或",
+        "倘然",
+        "倘若",
+        "借",
+        "假使",
+        "假如",
+        "假若",
+        "像",
+        "八",
+        "六",
+        "兮",
+        "关于",
+        "其",
+        "其一",
+        "其中",
+        "其二",
+        "其他",
+        "其余",
+        "其它",
+        "其次",
+        "具体地说",
+        "具体说来",
+        "再者",
+        "再说",
+        "冒",
+        "冲",
+        "况且",
+        "几",
+        "几时",
+        "凭",
+        "凭借",
+        "则",
+        "别",
+        "别的",
+        "别说",
+        "到",
+        "前后",
+        "前者",
+        "加之",
+        "即",
+        "即令",
+        "即使",
+        "即便",
+        "即或",
+        "即若",
+        "又",
+        "及",
+        "及其",
+        "及至",
+        "反之",
+        "反过来",
+        "反过来说",
+        "另",
+        "另一方面",
+        "另外",
+        "只是",
+        "只有",
+        "只要",
+        "只限",
+        "叫",
+        "叮咚",
+        "可",
+        "可以",
+        "可是",
+        "可见",
+        "各",
+        "各个",
+        "各位",
+        "各种",
+        "各自",
+        "同",
+        "同时",
+        "向",
+        "向着",
+        "吓",
+        "吗",
+        "否则",
+        "吧",
+        "吧哒",
+        "吱",
+        "呀",
+        "呃",
+        "呕",
+        "呗",
+        "呜",
+        "呜呼",
+        "呢",
+        "呵",
+        "呸",
+        "呼哧",
+        "咋",
+        "和",
+        "咚",
+        "咦",
+        "咱",
+        "咱们",
+        "咳",
+        "哇",
+        "哈",
+        "哈哈",
+        "哉",
+        "哎",
+        "哎呀",
+        "哎哟",
+        "哗",
+        "哟",
+        "哦",
+        "哩",
+        "哪",
+        "哪个",
+        "哪些",
+        "哪儿",
+        "哪天",
+        "哪年",
+        "哪怕",
+        "哪样",
+        "哪边",
+        "哪里",
+        "哼",
+        "哼唷",
+        "唉",
+        "啊",
+        "啐",
+        "啥",
+        "啦",
+        "啪达",
+        "喂",
+        "喏",
+        "喔唷",
+        "嗡嗡",
+        "嗬",
+        "嗯",
+        "嗳",
+        "嘎",
+        "嘎登",
+        "嘘",
+        "嘛",
+        "嘻",
+        "嘿",
+        "四",
+        "因",
+        "因为",
+        "因此",
+        "因而",
+        "固然",
+        "在",
+        "在下",
+        "地",
+        "多",
+        "多少",
+        "她",
+        "她们",
+        "如",
+        "如上所述",
+        "如何",
+        "如其",
+        "如果",
+        "如此",
+        "如若",
+        "宁",
+        "宁可",
+        "宁愿",
+        "宁肯",
+        "它",
+        "它们",
+        "对",
+        "对于",
+        "将",
+        "尔后",
+        "尚且",
+        "就",
+        "就是",
+        "就是说",
+        "尽",
+        "尽管",
+        "岂但",
+        "己",
+        "并",
+        "并且",
+        "开外",
+        "开始",
+        "归",
+        "当",
+        "当着",
+        "彼",
+        "彼此",
+        "往",
+        "待",
+        "得",
+        "怎",
+        "怎么",
+        "怎么办",
+        "怎么样",
+        "怎样",
+        "总之",
+        "总的来看",
+        "总的来说",
+        "总的说来",
+        "总而言之",
+        "恰恰相反",
+        "您",
+        "慢说",
+        "我",
+        "我们",
+        "或",
+        "或是",
+        "或者",
+        "所",
+        "所以",
+        "打",
+        "把",
+        "抑或",
+        "拿",
+        "按",
+        "按照",
+        "换句话说",
+        "换言之",
+        "据",
+        "接着",
+        "故",
+        "故此",
+        "旁人",
+        "无宁",
+        "无论",
+        "既",
+        "既是",
+        "既然",
+        "时候",
+        "是",
+        "是的",
+        "替",
+        "有",
+        "有些",
+        "有关",
+        "有的",
+        "望",
+        "朝",
+        "朝着",
+        "本",
+        "本着",
+        "来",
+        "来着",
+        "极了",
+        "果然",
+        "果真",
+        "某",
+        "某个",
+        "某些",
+        "根据",
+        "正如",
+        "此",
+        "此外",
+        "此间",
+        "毋宁",
+        "每",
+        "每当",
+        "比",
+        "比如",
+        "比方",
+        "沿",
+        "沿着",
+        "漫说",
+        "焉",
+        "然则",
+        "然后",
+        "然而",
+        "照",
+        "照着",
+        "甚么",
+        "甚而",
+        "甚至",
+        "用",
+        "由",
+        "由于",
+        "由此可见",
+        "的",
+        "的话",
+        "相对而言",
+        "省得",
+        "着",
+        "着呢",
+        "矣",
+        "离",
+        "第",
+        "等",
+        "等等",
+        "管",
+        "紧接着",
+        "纵",
+        "纵令",
+        "纵使",
+        "纵然",
+        "经",
+        "经过",
+        "结果",
+        "给",
+        "继而",
+        "综上所述",
+        "罢了",
+        "者",
+        "而",
+        "而且",
+        "而况",
+        "而外",
+        "而已",
+        "而是",
+        "而言",
+        "能",
+        "腾",
+        "自",
+        "自个儿",
+        "自从",
+        "自各儿",
+        "自家",
+        "自己",
+        "自身",
+        "至",
+        "至于",
+        "若",
+        "若是",
+        "若非",
+        "莫若",
+        "虽",
+        "虽则",
+        "虽然",
+        "虽说",
+        "被",
+        "要",
+        "要不",
+        "要不是",
+        "要不然",
+        "要么",
+        "要是",
+        "让",
+        "论",
+        "设使",
+        "设若",
+        "该",
+        "诸位",
+        "谁",
+        "谁知",
+        "赶",
+        "起",
+        "起见",
+        "趁",
+        "趁着",
+        "越是",
+        "跟",
+        "较",
+        "较之",
+        "边",
+        "过",
+        "还是",
+        "还有",
+        "这",
+        "这个",
+        "这么",
+        "这么些",
+        "这么样",
+        "这么点儿",
+        "这些",
+        "这会儿",
+        "这儿",
+        "这就是说",
+        "这时",
+        "这样",
+        "这边",
+        "这里",
+        "进而",
+        "连",
+        "连同",
+        "通过",
+        "遵照",
+        "那",
+        "那个",
+        "那么",
+        "那么些",
+        "那么样",
+        "那些",
+        "那会儿",
+        "那儿",
+        "那时",
+        "那样",
+        "那边",
+        "那里",
+        "鄙人",
+        "鉴于",
+        "阿",
+        "除",
+        "除了",
+        "除此之外",
+        "除非",
+        "随",
+        "随着",
+        "零",
+        "非但",
+        "非徒",
+        "靠",
+        "顺",
+        "顺着",
+        "首先",
+        "︿",
+        "！",
+        "＃",
+        "＄",
+        "％",
+        "＆",
+        "（",
+        "）",
+        "＊",
+        "＋",
+        "，",
+        "０",
+        "１",
+        "２",
+        "３",
+        "４",
+        "５",
+        "６",
+        "７",
+        "８",
+        "９",
+        "：",
+        "；",
+        "＜",
+        "＞",
+        "？",
+        "＠",
+        "［",
+        "］",
+        "｛",
+        "｜",
+        "｝",
+        "～",
+        "￥",
+    ],
+}