diff --git a/app.py b/app.py index a498d3f5e9cfd97ecc0495facae59fd74a7a3bb7..7a5aa14fc72c35ad31be5a4180839685451928b3 100644 --- a/app.py +++ b/app.py @@ -13,6 +13,8 @@ import numpy as np import matplotlib.pyplot as plt +from filtering import Filtering + class Visualization: def __init__( @@ -390,6 +392,9 @@ class Visualization: ax.set_ylabel("frequency in the documents") st.pyplot(fig) + def check_personal_doc(self): + pass + def download_data(self): st.header("Download data") @@ -408,6 +413,7 @@ class Visualization: self.filtering_of_words() self.plot_distributions_filtering_parameters() #self.plot_zipf_law() + self.check_personal_doc() self.download_data() diff --git a/badwords.py b/badwords.py new file mode 100644 index 0000000000000000000000000000000000000000..64f1c200ef867bcaac1eee2645e0381a0fcee439 --- /dev/null +++ b/badwords.py @@ -0,0 +1,2682 @@ +# Merge +# https://github.com/zacanger/profane-words +# and +# https://github.com/thisandagain/washyourmouthoutwithsoap/blob/develop/data/build.json +# and +# https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words + + +english_badwords = [ + "abuse", + "anal", + "anilingus", + "anus", + "aroused", + "arse", + "arsehole", + "ass", + "asses", + "assfuck", + "asshat", + "asshole", + "assholes", + "autoerotic", + "bangbros", + "banging", + "bareback", + "bastard", + "bastards", + "bazongas", + "bbw", + "bdsm", + "biatch", + "bicurious", + "bigass", + "bigtits", + "bimbo", + "bimbos", + "bitch", + "bitches", + "bitching", + "blowjob", + "blowjobs", + "boche", + "boner", + "boners", + "boob", + "boobies", + "boobs", + "booty", + "brothel", + "buceta", + "bugger", + "buggered", + "buggery", + "bukkake", + "bule", + "buttcheeks", + "buttfuck", + "butthead", + "butthole", + "buttplug", + "cameltoe", + "camgirl", + "camwhore", + "chink", + "chinks", + "cialis", + "clit", + "clitoris", + "clits", + "clitty", + "clusterfuck", + "cock", + "cock-head", + "cockblock", + "cockfight", + "cockhead", + "cocks", + "cocksman", + "cocksucker", + "cocksucking", + "coital", + "coitus", + "coochie", + "cooly", + "coon", + "coons", + "copulate", + "cowgirl", + "crabs", + "creampie", + "cum", + "cumming", + "cums", + "cumshot", + "cumshots", + "cumslut", + "cunnilingus", + "cunny", + "cunt", + "cunts", + "cybersex", + "darkey", + "darkie", + "darkies", + "darky", + "deepthroat", + "deepthroating", + "dick", + "dickhole", + "dicks", + "dildo", + "dildos", + "dogging", + "doggy-style", + "doggystyle", + "dominatrix", + "dommes", + "dong", + "dp", + "dupa", + "dyke", + "dykes", + "ecchi", + "ejaculate", + "ejaculated", + "ejaculates", + "ejaculating", + "ejaculation", + "ejaculations", + "enema", + "erect", + "erection", + "ero", + "erotic", + "erotism", + "escort", + "fag", + "fagging", + "faggot", + "fagot", + "fagots", + "fags", + "felch", + "fellate", + "fellatio", + "femdom", + "fetish", + "figging", + "fingerbang", + "fingering", + "fisted", + "fister", + "fisting", + "floozy", + "fondle", + "footfetish", + "footjob", + "foreskin", + "fornicate", + "foursome", + "fuck", + "fuckable", + "fuckbook", + "fuckboy", + "fuckbuddy", + "fucked", + "fucker", + "fuckers", + "fuckfest", + "fuckhole", + "fuckin", + "fucking", + "fucks", + "fuk", + "fukin", + "fuking", + "g-spot", + "gangbang", + "gangbanged", + "gangbanger", + "gangbangs", + "genital", + "genitals", + "gigolo", + "glans", + "gonad", + "gonads", + "gook", + "gringo", + "gringos", + "grope", + "gspot", + "guido", + "handjob", + "haole", + "hapa", + "hardcore", + "hardon", + "harem", + "hentai", + "hindoo", + "hoe", + "hoes", + "honky", + "hooker", + "hookers", + "hooter", + "hooters", + "hori", + "horndog", + "horney", + "horniest", + "horny", + "humped", + "humper", + "humping", + "hussy", + "hymen", + "ikey", + "incest", + "injun", + "intercourse", + "interracial", + "jack-off", + "jackoff", + "jailbait", + "jerk-off", + "jerkoff", + "jiggy", + "jism", + "jizz", + "jizzed", + "kaffir", + "kafir", + "kike", + "kikes", + "kinkster", + "kinky", + "kkk", + "klan", + "kraut", + "labia", + "lapdance", + "libido", + "licker", + "licking", + "limey", + "lingerie", + "livesex", + "lolita", + "lovemaking", + "lust", + "lusting", + "masochist", + "masterbate", + "masterbating", + "masterbation", + "masturbate", + "masturbating", + "masturbation", + "milf", + "minge", + "missionary", + "molest", + "molestation", + "molester", + "munging", + "muschi", + "nads", + "naked", + "necked", + "necro", + "negress", + "negro", + "negroes", + "negroid", + "negros", + "nig", + "nigar", + "nigga", + "niggas", + "niggaz", + "nigger", + "niggers", + "nigra", + "nipple", + "nipples", + "nookie", + "nooky", + "nooner", + "nude", + "nudie", + "nudity", + "nymph", + "nympho", + "nymphomania", + "orgasim", + "orgasm", + "orgasms", + "orgies", + "orgy", + "orifice", + "p0rn", + "paedophile", + "pantie", + "panties", + "panty", + "pastie", + "pecker", + "pedo", + "pedophile", + "pedophilia", + "pedophiliac", + "peeper", + "peepshow", + "pegging", + "penetrate", + "penetration", + "penile", + "penis", + "penises", + "penus", + "perv", + "phallic", + "phonesex", + "pickaninnies", + "pimp", + "playboy", + "playgirl", + "poontang", + "porn", + "porno", + "pornography", + "pornos", + "pr0n", + "premature", + "preteen", + "pron", + "prostitute", + "pube", + "pubes", + "pubic", + "pubis", + "punani", + "pussies", + "pussy", + "pussys", + "pusy", + "puta", + "puto", + "queef", + "quickie", + "quicky", + "quim", + "randy", + "rape", + "raped", + "raper", + "raping", + "rapist", + "rectum", + "redneck", + "rednecks", + "redskin", + "redskins", + "rimjob", + "rimming", + "russki", + "s&m", + "sadism", + "sadist", + "sambo", + "santorum", + "schlong", + "scissoring", + "semen", + "sex", + "sexed", + "sexi", + "sexing", + "sexo", + "sexpot", + "sextoy", + "sexual", + "sexually", + "sexx", + "sexxx", + "sexxxy", + "sexxy", + "sexy", + "sh!t", + "sh1t", + "shagging", + "shemale", + "sissy", + "skank", + "skanks", + "slapper", + "slut", + "sluts", + "slutting", + "slutty", + "smut", + "smutty", + "sodomise", + "sodomite", + "sodomize", + "sodomy", + "spank", + "sperm", + "spic", + "spick", + "splooge", + "spooge", + "squaw", + "squirting", + "steamy", + "stiffy", + "strapon", + "suck", + "sucked", + "sucker", + "sucking", + "sucks", + "swallow", + "swallower", + "swinger", + "teabagging", + "testical", + "testicle", + "testicles", + "testis", + "threesome", + "threeway", + "titfuck", + "titjob", + "tits", + "tittie", + "titties", + "titty", + "tittyfuck", + "tity", + "toots", + "topless", + "trannie", + "tranny", + "tribadism", + "twat", + "twats", + "undies", + "undressing", + "upskirt", + "vag", + "vagina", + "vaginal", + "viagra", + "vibrator", + "virgin", + "vixen", + "voyeur", + "vulva", + "wank", + "wanker", + "wanking", + "wazoo", + "wedgie", + "wench", + "wetback", + "whore", + "whored", + "whorehouse", + "whores", + "whoring", + "wigger", + "willie", + "willies", + "willy", + "wog", + "wop", + "x-rated", + "xxx", + "xxxxxx", + "yaoi", + "yid", + "zoophile", + "zoophilia", +] + +badwords = { + "ar": english_badwords + + [ + "احتلام", + "اغتصاب", + "بز", + "بزاز", + "بظر", + "بيضان", + "تمص", + "ثدي", + "جماع", + "حلمة", + "خنثي", + "خول", + "زب", + "سحاق", + "سحاقية", + "سكس", + "شاذ", + "شرج", + "شرموطة", + "شهوة", + "طيز", + "عاهرة", + "عرص", + "فرج", + "قحبة", + "قضيب", + "كس", + "لبوة", + "لحس", + "لعق", + "لواط", + "لوطي", + "مبادل", + "متناك", + "متناكة", + "مص", + "مفلقسة", + "نيك", + ], + "ca": english_badwords + + [ + "avortament", + "anal", + "anus", + "cul", + "ass-fucker", + "asss", + "asshole", + "assholes", + "bolera", + "boles", + "bastardo", + "bellend", + "bestial", + "bestialitat", + "puta", + "femelles", + "picant", + "sagnant", + "mamada", + "bollok", + "boob", + "pits", + "buceta", + "bum", + "culata", + "catifa muncher", + "picar", + "cipa", + "clitoris", + "polla", + "galletejador", + "gallines", + "coon", + "merda", + "cum", + "correguda", + "cunillingus", + "boig", + "maleït", + "consolador", + "consoladors", + "dink", + "canalla", + "duche", + "dique", + "ejaculació", + "ejaculat", + "ejacula", + "ejaculant", + "fag", + "fagging", + "fagot", + "fagots", + "fanny", + "felching", + "fel.lació", + "brida", + "follar", + "follat", + "escuradents", + "follant", + "folles", + "fucks", + "empacadora de llaminadures", + "déu maldit", + "deu meu", + "infern", + "hore", + "córrer", + "retrocés", + "kock", + "llavis", + "lujuria", + "lució", + "masoquista", + "masturbarse", + "puta mare", + "nazi", + "nigger", + "negres", + "orgasim", + "orgasme", + "orgasmes", + "pecker", + "penis", + "piss", + "mossegat", + "pisser", + "pisses", + "pissing", + "treure de polleguera", + "caca", + "porno", + "pornografia", + "picades", + "pube", + "coques", + "gatet", + "violació", + "violador", + "recte", + "retard", + "rimming", + "sàdic", + "cargolar", + "escrot", + "semen", + "sexe", + "shag", + "borratxos", + "transsexual", + "mossegar", + "shitted", + "skank", + "smegma", + "smut", + "arrebat", + "fill de puta", + "spac", + "spunk", + "testicle", + "tit", + "tetas", + "titt", + "turd", + "vagina", + "viagra", + "vulva", + "wang", + "wank", + "x classificat", + "xxx", + ], + "en": english_badwords, + "es": english_badwords + + [ + "Asesinato", + "Bollera", + "Cabrón", + "Caca", + "Chupada", + "Chupapollas", + "Chupetón", + "Concha de tu madre", + "Coprofagía", + "Coño", + "Culo", + "Drogas", + "Esperma", + "Fiesta de salchichas", + "Follador", + "Follar", + "Gilipichis", + "Gilipollas", + "Hacer una paja", + "Haciendo el amor", + "Heroína", + "Hija de puta", + "Hijaputa", + "Hijo de puta", + "Hijoputa", + "Idiota", + "Imbécil", + "Jilipollas", + "Kapullo", + "Lameculos", + "Maciza", + "Macizorra", + "Mamada", + "Marica", + "Mariconazo", + "Maricón", + "Mierda", + "Nazi", + "Orina", + "Pedo", + "Pendejo", + "Pervertido", + "Pezón", + "Pinche", + "Pis", + "Prostituta", + "Puta", + "Racista", + "Ramera", + "Semen", + "Sexo", + "Sexo oral", + "Soplagaitas", + "Soplapollas", + "Sádico", + "Tetas grandes", + "Travesti", + "Trio", + "Tía buena", + "Verga", + "Vulva", + "aborto", + "agallas", + "anal", + "ano", + "arrebatar", + "asno", + "atornillar", + "bastardo", + "bestial", + "bestialidad", + "bolas", + "bollok", + "bolsa de pelota", + "brida", + "buceta", + "cabron", + "cagadas", + "cagado", + "cagando", + "campana", + "carajo", + "chupar la polla", + "cipa", + "clítoris", + "concha", + "consolador", + "consoladores", + "corrida", + "coño", + "coños", + "culo", + "culos", + "cunillingus", + "córneo", + "de mierda", + "dique", + "duche", + "enojado", + "escroto", + "espacio", + "estúpido", + "extremo", + "eyacula", + "eyaculación", + "eyaculado", + "eyacular", + "fagging", + "felación", + "felching", + "folla", + "follada", + "follador de culo", + "folladores", + "follar", + "fudge packer", + "gallos", + "grieta", + "hacerse una paja", + "hijo de puta", + "hore", + "infierno", + "kock", + "labios vaginales", + "los pechos", + "lujuria", + "madre folladora", + "maldita sea", + "maldito", + "maldito sea", + "mamada", + "mapache", + "maricones", + "maricón", + "martillo", + "masoquista", + "masturbarse", + "mear", + "mierda", + "molesto", + "muncher alfombra", + "nazi", + "negro", + "niggers", + "orgasimo", + "orgasmo", + "orgasmos", + "orinando", + "pelusa", + "pene", + "perra", + "perras", + "perro follador", + "pinchazo", + "pinchazos", + "pisser", + "polla", + "porno", + "pornografía", + "pube", + "puta", + "putas", + "pájaro carpintero", + "quejas", + "recto", + "retardar", + "rimming", + "sangriento", + "semen", + "sexo", + "skank", + "smegma", + "sádico", + "testículo", + "teta", + "tetas", + "tirón", + "tizón", + "tonto", + "transexual", + "vagina", + "vete a la mierda", + "viagra", + "violación", + "violador", + "vulva", + "wang", + "x clasificado", + "xxx", + "zurullo", + ], + "eu": english_badwords + + [ + "abortu", + "anal", + "ipurdi", + "kabroi", + "puta", + "clitoris", + "cunillingus", + "madarikatu", + "zakil", + "hazia isuri", + "arraio", + "izorratu", + "infernu", + "emagaldu", + "lizunkeri", + "lizun", + "masokista", + "masturbatu", + "nazi", + "beltz", + "orgasmo", + "pixa", + "porno", + "pornografia", + "alu", + "bortxaketa", + "bortxatzaile", + "sadista", + "ipurzulo", + "hazi", + "semen", + "sexu", + "kaka", + "putaseme", + "barrabil", + "titi", + "bagina", + "viagra", + ], + "fr": english_badwords + + [ + "MALPT", + "anal", + "anus", + "arracher", + "avortement", + "baise", + "baiser", + "baiseur de chien", + "baiseurs", + "baisée", + "bander", + "bellend", + "bestial", + "bestialité", + "bigornette", + "bite", + "bitte", + "bloblos", + "bollok", + "boob", + "bordel", + "bourré", + "bourrée", + "bout", + "brackmard", + "branlage", + "branler", + "branlette", + "branleur", + "branleuse", + "bride", + "brouter le cresson", + "buceta", + "caca", + "chatte", + "chattes", + "chiasse", + "chienne", + "chiennes", + "chier", + "chiottes", + "chié", + "cipa", + "clito", + "clitoris", + "clochard", + "cochonneries", + "con", + "connard", + "connards", + "connasse", + "conne", + "convoitise", + "coq", + "coqs", + "corné", + "couilles", + "cramouille", + "cran", + "cul", + "culs", + "cunillingus", + "damné", + "des balles", + "digue", + "duché", + "déconne", + "déconner", + "emballeur de fudge", + "emmerdant", + "emmerder", + "emmerdeur", + "emmerdeuse", + "enculer", + "enculeur", + "enculeurs", + "enculé", + "enculée", + "enfer", + "enfoiré", + "enfoirée", + "espacer", + "fagging", + "fagot", + "fagots", + "faire chier", + "fellation", + "fente", + "fille de pute", + "fils de pute", + "folle", + "foutre", + "fuckings", + "gerbe", + "gerber", + "godemiché", + "godes", + "gouine", + "grande folle", + "grogniasse", + "gueule", + "hore", + "jouir", + "kock", + "la putain de ta mère", + "les lèvres", + "les seins", + "luxure", + "masochiste", + "masturber", + "merde", + "merdeuse", + "merdeux", + "merdique", + "meuf", + "mère enculée", + "ménage à trois", + "mésange", + "nazi", + "negro", + "nique ta mère", + "nique ta race", + "nègre", + "nègres", + "orgasim", + "orgasme", + "orgasmes", + "palucher", + "penchant", + "pipe", + "pipi", + "piquer", + "piqûres", + "pisse", + "pisser", + "porno", + "pornographie", + "pouffiasse", + "pousse-crotte", + "pube", + "putain", + "putain de", + "pute", + "pédale", + "pédé", + "pénis", + "péter", + "queue", + "quéquette", + "ramoner", + "rectum", + "retard", + "rimming", + "râpé", + "sac de billes", + "sac à foutre", + "sac à merde", + "sadique", + "salaud", + "salope", + "salopes", + "sanglant", + "scrotum", + "se branler", + "seins", + "sexe", + "skank", + "smegma", + "sperme", + "suce", + "suceuse", + "tanche", + "tapette", + "tapis muncher", + "testicule", + "teuch", + "titt", + "transexuelle", + "tremper", + "tringler", + "trique", + "troncher", + "trou du cul", + "turlute", + "vagin", + "viagra", + "violeur", + "vulve", + "wang", + "x évalué", + "xxx", + "zigounette", + "zizi", + "zut", + "éjaculant", + "éjaculation", + "éjacule", + "éjaculer", + "éjaculé", + "étron", + ], + "hi": english_badwords + + [ + "aand", + "aandu", + "balatkar", + "balatkari", + "behen chod", + "beti chod", + "bhadva", + "bhadve", + "bhandve", + "bhangi", + "bhootni ke", + "bhosad", + "bhosadi ke", + "bitching", + "blowjob", + "bollok", + "boobe", + "buceta", + "chakke", + "chinaal", + "chinki", + "chod", + "chodu", + "chodu bhagat", + "chooche", + "choochi", + "choope", + "choot", + "choot ke baal", + "chootia", + "chootiya", + "chuche", + "chuchi", + "chudaap", + "chudai khanaa", + "chudam chudai", + "chude", + "chut", + "chut ka chuha", + "chut ka churan", + "chut ka mail", + "chut ke baal", + "chut ke dhakkan", + "chut maarli", + "chutad", + "chutadd", + "chutan", + "chutia", + "chutiya", + "cipa", + "cunillingus", + "dink", + "duche", + "ejaculated", + "ejaculates", + "ejaculating", + "fagging", + "fagots", + "felching", + "fuckers", + "fuckings", + "fucks", + "gaand", + "gaandfat", + "gaandmasti", + "gaandufad", + "gandfattu", + "gandu", + "gashti", + "gasti", + "ghassa", + "ghasti", + "gucchi", + "gucchu", + "harami", + "haramzade", + "hawas", + "hawas ke pujari", + "hijda", + "hijra", + "jhant", + "jhant chaatu", + "jhant ka keeda", + "jhant ke baal", + "jhant ke pissu", + "jhantu", + "kamine", + "kaminey", + "kanjar", + "kutta", + "kutta kamina", + "kutte ki aulad", + "kutte ki jat", + "kuttiya", + "loda", + "lodu", + "lund", + "lund choos", + "lund ka bakkal", + "lund khajoor", + "lundtopi", + "lundure", + "lusting", + "maa ki chut", + "maal", + "madar chod", + "madarchod", + "madhavchod", + "masochist", + "mooh mein le", + "mutth", + "mutthal", + "najayaz", + "najayaz aulaad", + "najayaz paidaish", + "orgasim", + "paki", + "pataka", + "patakha", + "pisser", + "pisses", + "pissing", + "pube", + "pussies", + "raand", + "randaap", + "randi", + "randi rona", + "rimming", + "saala", + "saala kutta", + "saali kutti", + "saali randi", + "shagging", + "shite", + "shitted", + "shitting", + "shitty", + "skank", + "sluts", + "spac", + "suar", + "suar ke lund", + "suar ki aulad", + "tatte", + "tatti", + "teri maa ka bhosada", + "teri maa ka boba chusu", + "teri maa ki behenchod ", + "teri maa ki chut", + "tharak", + "tharki", + "titt", + "tu chuda", + "turd", + "wank", + "xxx", + "अंडकोश की थैली", + "अंडा", + "अरे नहीं", + "अश्लील", + "उल्लू", + "एक्स रेटेड", + "ओगाज़्म", + "कमबख्त", + "काम करना", + "कामोद्दीपक चित्र", + "कालीन का चूरा", + "किन्नर", + "कुतिया", + "कुत्ते-कमीने", + "कून", + "कॉक", + "गड़बड़", + "गधा कमीने", + "गधे", + "गर्भपात", + "गुदा", + "गेंद का थैला", + "गेंदों", + "गोली चलाने की आवाज़", + "घटिया इंसान", + "चाकलेट का रंग", + "चिंक", + "चुभन", + "चूची", + "चूतड़", + "चोंच", + "छीनना", + "जी में आये करो", + "झटका बंद", + "ठगना पैकर", + "डिल्डो", + "दुष्ट", + "दूर जाने का अभद्र संकेत देना", + "धत् तेरे की", + "नरक", + "नाजी", + "निकला हुआ किनारा", + "नितंब", + "पंगा लेना", + "पिछाड़ी", + "पीड़न कामुक", + "पेशाब", + "पॉर्न", + "फटना", + "फूहड़", + "बकवास", + "बट", + "बलात्कार", + "बहुत मदहोश", + "बांध", + "बिल्ली", + "बेल अंत", + "बेवकूफों", + "बोल पड़ना", + "भगवान-शापित", + "भगशेफ", + "मल", + "मलाशय", + "माँ कमीने", + "मुखमैथुन", + "मुर्गा", + "मुर्गा के", + "मुर्गा चूसने वाला", + "मूर्ख", + "मैल", + "योनि", + "योनी", + "यौन-संबंध", + "रक्तरंजित", + "लानत है", + "लिंग", + "लुटेरा", + "लेबिया", + "वहशी", + "वहशीता", + "वियाग्रा", + "वीर्य", + "वेश्या", + "वैंग", + "वो साले", + "शिफ़्ट को", + "शिश्नमल", + "संभोग सुख", + "सह", + "सह शॉट", + "साहस", + "सिगरेट", + "सींग का बना हुआ", + "स्तन", + "स्तनों", + "हवस", + "हस्तमैथुन", + "होमोसेक्सुअल", + "होर", + ], + "id": english_badwords + + [ + "abortus", + "anal", + "dubur", + "pantat", + "bajingan", + "keledai", + "keparat", + "tas bola", + "bola", + "bellend", + "kejam", + "kebinatangan", + "menggerutu", + "pelacur", + "berdarah", + "blowjob", + "bollok", + "dada", + "payudara", + "buceta", + "gelandangan", + "pengunyah karpet", + "celah", + "cipa", + "kelentit", + "kokang", + "pengisap ayam", + "ayam", + "coon", + "sampah", + "air mani", + "cumshot", + "cunillingus", + "vagina", + "mengutuk", + "kontol", + "dildo", + "dink", + "anjing-keparat", + "duche", + "tanggul", + "berejakulasi", + "ejakulasi", + "homo", + "fagging", + "kayu bakar", + "penggemar", + "felching", + "fellatio", + "flens", + "brengsek", + "kacau", + "sialan", + "persetan", + "pengepakan fudge", + "terkutuk", + "ya tuhan", + "neraka", + "hore", + "terangsang", + "kock", + "labia", + "nafsu", + "bernafsu", + "masokis", + "masturbasi", + "keparat ibu", + "nazi", + "orang negro", + "negro", + "orgasim", + "orgasme", + "cotok", + "penis", + "kencing", + "kesal", + "pisser", + "bikin", + "buritan", + "porno", + "pornografi", + "tusukan", + "menusuk", + "pube", + "pussies", + "memperkosa", + "pemerkosa", + "memperlambat", + "rimming", + "sadis", + "meniduri", + "skrotum", + "seks", + "bercinta", + "waria", + "kotoran", + "shite", + "kengerian", + "dikirim", + "buang hajat", + "menyebalkan", + "smegma", + "jelaga", + "merebut", + "dasar bajingan", + "ruang", + "keberanian", + "buah pelir", + "titt", + "viagra", + "vulva", + "wang", + "terima kasih", + "x diberi peringkat", + "xxx", + ], + "kn": english_badwords + + [ + "ಗರ್ಭಪಾತ", + "ಗುದ", + "ಗುದದ್ವಾರ", + "ಕತ್ತೆ", + "ಆಶ್-ಫಕರ್", + "ಅಸ್ಹೋಲ್", + "ಅಸೋಲೆಸ್", + "ಬಾಲ್ಬಾಗ್", + "ಚೆಂಡುಗಳು", + "ಬಾಸ್ಟರ್ಡ್", + "ಬೆಲೆಂಡ್", + "ಮೃದ್ವಂಗಿ", + "ಪ್ರಾಣಿಜನ್ಯತೆ", + "ಬಿಚ್", + "ಬಿಟ್ಚಿಸ್", + "ಬೆಚಿಂಗ್", + "ರಕ್ತಸಿಕ್ತ", + "ಬ್ಲೋಜಾಬ್", + "ಬೊಲ್ಲೊಕ್", + "ಕುರುಚಲು ಗಿಡ", + "ಬೂಬಿಗಳು", + "ಸ್ತನಗಳನ್ನು", + "ಬುಕೆಟಾ", + "ತಿಕ", + "ಬಟ್", + "ಕಾರ್ಪೆಟ್ ಮಂಚರ್", + "ಚಿಂಕ್", + "ಸಿಪಾ", + "ಚಂದ್ರನಾಡಿ", + "ಕೋಳಿ", + "ಕೋಳಿ ಸಕ್ಕರ್", + "ಕಾಕ್ಸ್", + "ಕೂನ್", + "ಅಮೇಧ್ಯ", + "ಕಮ್", + "ಕಮ್ಶಾಟ್", + "ಕುನಿಲ್ಲಸ್", + "ಕಂಟ್", + "ಡ್ಯಾಮ್", + "ಡಿಕ್", + "ದ್ವಿಧ್ರುವಿ", + "dildos", + "ಡಿಂಕ್", + "ನಾಯಿ-ಫಕರ್", + "ಡಚೆ", + "ಡೈಕ್", + "ಹೊರಹೊಮ್ಮಿಸು", + "ಸ್ಫೂರ್ತಿ", + "ಎಜಾಕ್ಯುಲೇಟ್ಸ್", + "ಇಜಲಲೇಟಿಂಗ್", + "ಉದ್ಗಾರ", + "ತಮಾಷೆ", + "ಮಂದಗತಿ", + "ಮಬ್ಬು", + "fagots", + "ಫ್ಯಾನಿ", + "ಹೊಡೆತ", + "ಪತನ", + "ಚಾಚುಪಟ್ಟಿ", + "ಫಕ್", + "ನಾಶವಾಗಿದ್ದನು", + "ಫಕರ್", + "fuckers", + "ಫಕಿಂಗ್", + "ಫಕಿಂಗ್ಸ್", + "ಇಷ್ಟಪಡುತ್ತಾನೆ", + "ಮಿಠಾಯಿ ಪ್ಯಾಕರ್", + "ದೇವರನ್ನು ಹಾನಿಗೊಳಗಾಯಿತು", + "ಗಾಡ್ಡಮ್", + "ನರಕ", + "ಹೋರ್", + "ಮೊನಚಾದ", + "ಜರ್ಕ್-ಆಫ್", + "ಕೋಕ್", + "ಯೋನಿಯ", + "ಕಾಮ", + "ಕಾಮುಕ", + "ಮಾಸೋಚಿಸ್ಟ್", + "ಹಸ್ತಮೈಥುನ ಮಾಡು", + "ತಾಯಿ ಫಕರ್", + "ನಾಜಿ", + "ನಿಗರ್", + "ನಿಗ್ಗರ್ಗಳು", + "ಒರಾಸಿಮ್", + "ಪರಾಕಾಷ್ಠೆ", + "ಪರಾಕಾಷ್ಠೆಗಳನ್ನು", + "ಪೆಕರ್", + "ಶಿಶ್ನ", + "ಮೂತ್ರ ವಿಸರ್ಜಿಸು", + "ನಿರುತ್ಸಾಹಗೊಂಡಿದೆ", + "ಪಿಸರ್", + "ಮೂತ್ರಪಿಂಡಗಳು", + "pissing", + "ಪಿಸ್ಸಾಫ್", + "ಪೂಪ್", + "ಅಶ್ಲೀಲತೆ", + "ಅಶ್ಲೀಲ", + "ಚುಚ್ಚು", + "ಪ್ರಿಕ್ಸ್", + "ಪಬ್", + "ಪುಸಿಗಳು", + "ಪುಸಿ", + "ಅತ್ಯಾಚಾರ", + "ಅತ್ಯಾಚಾರಿ", + "ಗುದನಾಳದ", + "ರಿಟಾರ್ಡ್", + "ಹಚ್ಚುವುದು", + "ದುಃಖಗಾರ", + "ತಿರುಗಿಸುವುದು", + "ಸ್ಕ್ರೋಟಮ್", + "ವೀರ್ಯ", + "ಲೈಂಗಿಕತೆ", + "ಶಾಗ್", + "ಶಾಗ್ಗಿಂಗ್", + "ಶೆಮೇಲ್", + "ಶಿಟ್", + "ಷೈಟ್", + "ಶಿಟ್ಸ್", + "shitted", + "ಅಲುಗಾಡುವಿಕೆ", + "ಅಸಹ್ಯ", + "ಸ್ಕಾಂಕ್", + "ಸೂಳೆ", + "ಸ್ಲಟ್ಗಳು", + "ಸ್ಮೆಗ್ಮಾ", + "ಕೊಳೆತ", + "ಸ್ನ್ಯಾಚ್", + "ಮಗ-ಆಫ್-ಬಿಚ್", + "spac", + "ಉಬ್ಬು", + "ವೃಷಣ", + "ಟಿಟ್", + "ಚೇಕಡಿ ಹಕ್ಕಿಗಳು", + "turd", + "ಯೋನಿ", + "ವಯಾಗ್ರ", + "ವಾಂಗ್", + "ಮುಷ್ಕರ", + "x ರೇಟೆಡ್", + "xxx", + ], + "ml": english_badwords + + [ + "ഗർഭഛിദ്രം", + "വിശപ്പ്", + "മലദ്വാരം", + "കഴുത", + "അസി ഫക്കർ", + "കഴുതകളെ", + "ആസ്ഹോൾ", + "അശ്ളീലങ്ങൾ", + "ബോൾബാഗ്", + "പന്തുകൾ", + "തന്തയില്ലാത്തവൻ", + "ബെല്ലെൻഡ്", + "മൃഗീയമായ", + "മൃഗീയത", + "ബിച്ച്", + "ബിച്ചുകൾ", + "ബിപിഡിംഗ്", + "രക്തരൂക്ഷിതമായ", + "ആശ്വാസം", + "ബലോക്ക്", + "ബോബ്", + "പൂക്കൾ", + "സ്തനങ്ങൾ", + "ബ്യൂട്ടാ", + "ബം", + "മയക്കുമരുന്ന്", + "പരവതാനി മാൻച്ചർ", + "ചുംബ്", + "സിപാ", + "ക്ലോറിസിസ്", + "കോക്ക്", + "കോക്ക് സക്കർ", + "കോക്സ്", + "കോൺ", + "ക്രാപ്പ്", + "ശുക്ലം", + "പുരുഷാരം", + "സി", + "മുഷിഞ്ഞ", + "കഷ്ടം", + "ഡിക്ക്", + "ഡിൽഡോ", + "dildos", + "ഡൈൻ", + "നായ-ഫക്കർ", + "ഡച്ച്", + "ഡൈകെ", + "ശമിപ്പിക്കുക", + "മോഷ്ടിച്ചു", + "വികാരങ്ങൾ", + "വിരസത", + "മടി", + "ക്ഷീണിപ്പിക്കുക", + "fagot", + "വഞ്ചന", + "ഫാനി", + "വേദന", + "flange", + "ഊമ്പി", + "സംഭോഗം ചെയ്യുക", + "ഫക്കർ", + "നർമ്മം", + "ഫഡ്ജ് പാക്കർ", + "ദൈവം-കൊള്ളിത", + "ഗോഡ്ഡം", + "നരകം", + "വയ്ക്കുക", + "വൃത്തികെട്ട", + "ജെർക് ഓഫ്", + "കിക്ക്", + "ലാബിയ", + "മോഹം", + "മോഹഭംഗം", + "മാസോച്ചിസ്റ്റ്", + "സ്വയംഭോഗം ചെയ്യുക", + "അമ്മ ഫക്കർ", + "നാസി", + "നിഗർ", + "മയക്കുമരുന്നുകൾ", + "രതിമൂർച്ഛ", + "പെക്കർ", + "ലിംഗം", + "മൂത്രമൊഴിക്കുക", + "കുഴഞ്ഞുവീഴുന്നു", + "പിസ്സർ", + "പിസ്സകൾ", + "pissing", + "പിസ്സോഫ്", + "poop", + "അശ്ലീലം", + "അശ്ലീലത", + "പ്രാവി", + "വിസർജ്യങ്ങൾ", + "പ്യൂബ്", + "pussies", + "pussy", + "ബലാൽസംഗം", + "ബലാത്സംഗം", + "മലാശയം", + "തുടരുക", + "റിമ്മിംഗ്", + "സചിസ്റ്റ്", + "വഞ്ചി", + "പുല്ല്", + "ബീജം", + "ശവം", + "ഷാഗിംഗ്", + "അവൾ", + "ഷീറ്റ്", + "ഷെയ്റ്റ്", + "shits", + "തിന്നിട്ടില്ല", + "ഷോർട്ട്", + "ഷൈറ്റി", + "സ്കാൻ", + "മന്ദഹസരം", + "സ്നെഗമാ", + "പുഞ്ചിരി", + "പിടിക്കുക", + "വെറുക്കപ്പെട്ടയാൾ", + "സ്പെയ്ക്", + "തുളച്ച്", + "വൃഷണം", + "പേ", + "ടിത്ത്", + "കുഴപ്പമില്ല", + "യോനി", + "വരാഗ്ര", + "വാൽവ", + "വാങ്", + "വാൻ", + "വേശ്യ", + "x റേറ്റുചെയ്തു", + "xxx", + ], + "mr": english_badwords + + [ + "गर्भपात", + "गुदा", + "गाढव", + "गांडुळ", + "asses", + "asshole", + "assholes", + "ballbag", + "चेंडू", + "बॅस्टर्ड", + "बेलेंड", + "बेस्टियल", + "प्राण्यांबरोबर", + "कुत्री", + "बिट्स", + "खूनी", + "blowjob", + "बोलोक", + "बोब", + "स्तन", + "बसीटा", + "बम", + "बट", + "कार्पेट मुन्चर", + "चिंक", + "सिपा", + "क्लिटोरिस", + "मुर्ख", + "मांसाहारी", + "कॉक्स", + "कॉनन", + "बकवास", + "सह", + "cumshot", + "कनिलिंगस", + "कांट", + "धिक्कार", + "डिक", + "dildo", + "डिल्डो", + "डंक", + "duche", + "डाईक", + "उद्गार", + "उत्साही", + "ejaculates", + "उत्सुकता", + "स्खलन", + "फॅग", + "फॅगिंग", + "फॅगॉट", + "फॅगॉट्स", + "फॅनी", + "फेलिंग", + "फॅलेटीओ", + "निकला", + "fucked", + "गुप्तचर", + "fuckers", + "fucking", + "fuckings", + "fucks", + "फडगे पॅकर", + "देव-शापित", + "देव", + "नरक", + "होरे", + "शिंग", + "झटका बंद", + "कॉक", + "लॅबिया", + "वासना", + "मासोचिस्ट", + "हस्तमैथुन करा", + "आई माकड", + "नाझी", + "निगर", + "निगार", + "ऑर्गॅसिम", + "संभोग", + "orgasms", + "चापटी", + "पुरुषाचे जननेंद्रिय", + "पेशी", + "pissed", + "पिसर", + "pisses", + "पिसिंग", + "पिसोफ", + "घाट", + "अश्लील", + "पोर्नोग्राफी", + "मुरुम", + "प्रिक्स", + "प्यूब", + "pussies", + "मांजर", + "बलात्कार", + "गुदाशय", + "मंद", + "rimming", + "दुःखी", + "screwing", + "स्क्रोटम", + "वीर्य", + "लिंग", + "शेग", + "shagging", + "शेमले", + "विचित्र", + "shite", + "shits", + "shitted", + "shitting", + "shitty", + "घाणेरडा", + "फट", + "sluts", + "सुगंध", + "स्मट", + "छेडछाड", + "मुलगा-एक-कुत्री", + "spac", + "तिरस्कार", + "परीक्षक", + "शीर्षक", + "टिट", + "टर्ड", + "योनी", + "वियाग्रा", + "वल्वा", + "वांग", + "विंक", + "वेश्या", + "एक्स रेट केले", + "xxx", + ], + "pt": english_badwords + + [ + "aborto", + "amador", + "anal", + "aparafusar", + "aranha", + "ariano", + "arrebatar", + "ass-filho da puta", + "asses", + "balalao", + "bastardo", + "bate uma", + "bellend", + "bestial", + "bestialidade", + "bicha", + "bichano", + "bichanos", + "bichas", + "biscate", + "bissexual", + "boceta", + "bolas", + "bollok", + "boob", + "boquete", + "bosta", + "braulio de borracha", + "buceta", + "bumbum", + "bunda", + "burro", + "cabrao", + "cacete", + "cadela", + "cadelas", + "cagando", + "cagar", + "calçado", + "camisinha", + "caralho", + "cerveja", + "chochota", + "chupar", + "cipa", + "clitoris", + "clitóris", + "cobiçoso", + "cocaína", + "cocô", + "coito", + "colhoes", + "com tesão", + "comedor de tapetes", + "comer", + "cona", + "consolo", + "coon", + "coragem", + "corno", + "cu", + "cunillingus", + "dar o rabo", + "desgraçado", + "dildo", + "dildos", + "dink", + "dog-filho da puta", + "droga", + "duche", + "dum raio", + "ejacula", + "ejaculado", + "ejacular", + "ejaculação", + "empacotador de fudge", + "escroto", + "esporra", + "estuprador", + "estupro", + "fagging", + "fanny", + "fecal", + "felação", + "felching", + "fenda", + "filho da puta", + "filhos da puta", + "foda", + "foda-se", + "fode", + "foder", + "fodido", + "frango assado", + "galo", + "galos", + "gozada", + "gozar", + "grelho", + "heroína", + "homem gay", + "homoerótico", + "homosexual", + "hore", + "idiota", + "idiotas", + "inferno", + "kock", + "lolita", + "luxúria", + "lábios", + "lésbica", + "maldito", + "mama", + "masoquista", + "masturbar", + "merda", + "merdas", + "mesa", + "mijando", + "mijar", + "nazista", + "negro", + "niggers", + "não me chateies", + "orgasim", + "orgasmo", + "orgasmos", + "otário", + "paneleiro", + "passar um cheque", + "pau", + "peidar", + "peitos", + "peituda", + "pica", + "picadas", + "pinto", + "pisser", + "porcaria", + "porno", + "pornografia", + "pornô", + "porra", + "prostituta", + "pube", + "punheta", + "puta", + "puta que pariu", + "puta que te pariu", + "putaria", + "puto", + "pênis", + "queca", + "retardar", + "reto", + "rimming", + "sacanagem", + "saco", + "saco de bola", + "sangrento", + "sapatona", + "sexo", + "shite", + "skank", + "smegma", + "spac", + "sujeira", + "sádico", + "sêmen", + "testículo", + "tetas", + "titt", + "torneira", + "transando", + "transar", + "transsexual", + "trepada", + "vadia", + "vadias", + "vagabunda", + "vagabundo", + "vagina", + "vai tomar no cu", + "vai-te foder", + "veado", + "viagra", + "vibrador", + "vulva", + "wang", + "x avaliado", + "xana", + "xixi", + "xochota", + "xxx", + "ânus", + ], + "te": english_badwords + + [ + "గర్భస్రావం", + "అంగ", + "పాయువు", + "గాడిద", + "గాడిద-fucker", + "asses", + "assholes", + "బాల్బ్యాగ్", + "బంతుల్లో", + "బాస్టర్డ్", + "బెల్లెండ్", + "మృగ", + "బెస్టియాలిటీ", + "బిచ్", + "bitches", + "బిట్చింగ్", + "బ్లడీ", + "blowjob", + "బోల్లక", + "బూబ్", + "వక్షోజాలను", + "ఛాతీ", + "buceta", + "బం", + "బట్", + "కార్పెట్ ముంచర్", + "చింక్", + "cipa", + "స్త్రీగుహ్యాంకురము", + "ఆత్మవిశ్వాసం", + "కాక్-సక్కర్", + "కాక్స్", + "కూన్", + "చెత్త", + "కం", + "cumshot", + "క్యునిల్లింగస్", + "కంట్", + "తిట్టు", + "డిక్", + "లైంగిక సంతృప్తి కోసం స్త్రీలు ఉపయోగించే పురుషాంగము వంటి పరికరము", + "డిల్డోస్", + "dink", + "కుక్క-fucker", + "డూష్", + "డైక్", + "స్ఖలించు", + "ఎజాక్యులేటెడ్", + "ఎజాక్యులేట్స్", + "ఎరాక్యులేటింగ్", + "స్ఖలనం", + "నవుకరు", + "ఫాగ్గింగ్", + "ఫాగాట్", + "ఫగాట్స్", + "fanny", + "ఫెల్చింగ్", + "కుడుచుట", + "అచ్చు", + "ఫక్", + "ఇబ్బంది పెట్టాడు", + "fucker", + "ఫకర్స్", + "ఫకింగ్", + "ఫకింగ్స్", + "ఫక్స్", + "ఫడ్జ్ ప్యాకర్", + "దేవతలా మంచిది", + "గాడ్డామ్", + "నరకం", + "హోర్", + "horny", + "జెర్క్-ఆఫ్", + "కాక్", + "పెదవి", + "కామం", + "మనసు పడ్డట్లు చిత్రించారు", + "masochist", + "హస్తప్రయోగం", + "తల్లి ఫెకర్", + "నాజీ", + "నిగ్గర్", + "నిగ్గర్స్", + "ఆర్గాసిమ్", + "స్కలనం", + "orgasms", + "pecker", + "పురుషాంగం", + "విసర్జన", + "pissed", + "పిస్సర్", + "పిస్సీస్", + "పిస్సింగ్", + "పిస్సాఫ్", + "poop", + "శృంగార", + "పోర్నో", + "అశ్లీల", + "బుడతడు", + "ప్రిక్స్", + "ప్యూబ్", + "pussies", + "పుస్సీ", + "రేప్", + "ఉన్నప్పటికీ బలాత్కారం", + "పురీషనాళం", + "రిటార్డ్", + "రిమ్మింగ్", + "పీడన కాముకత", + "screwing", + "స్క్రోటమ్", + "వీర్యం", + "సెక్స్", + "బొచ్చు", + "షగ్గింగ్", + "షీమేల్", + "ఒంటి", + "షైట్", + "షిట్స్", + "షిట్టెడ్", + "షిట్టింగ్", + "shitty", + "స్కాన్క్", + "నీతి", + "స్లట్స్", + "శిశ్న", + "స్మట్", + "స్నాచ్", + "ఒక బిచ్ కుమారుడు ఆఫ్", + "spac", + "స్పంక్", + "వృషణాలు", + "తునక", + "టిట్స్", + "టిట్", + "turd", + "యోని", + "వయాగ్రా", + "జననాంగం", + "వాంగ్", + "వ్యాంక్", + "వేశ్య", + "x రేట్", + "xxx", + ], + "vi": english_badwords + + [ + "sự phá thai", + "hậu môn", + "mông", + "đồ ngu", + "lừa", + "lỗ đít", + "túi bóng", + "những quả bóng", + "đồ khốn", + "tuyệt vời", + "mục sư", + "lòng tốt", + "chó cái", + "dính máu", + "công việc thổi", + "bollok", + "boob", + "ngực", + "buceta", + "ăn mày", + "thảm muncher", + "sứt mẻ", + "cipa", + "âm vật", + "gà", + "gà hút", + "gà trống", + "coon", + "tào lao", + "kiêm", + "cum", + "cunillingus", + "lồn", + "chỉ trích", + "tinh ranh", + "dương vật giả", + "dink", + "chó-chó", + "duche", + "đê", + "xuất tinh", + "fag", + "đóng băng", + "fagot", + "đồ ăn vặt", + "người hâm mộ", + "nỉ", + "thất bại", + "mặt bích", + "chết tiệt", + "quái", + "đụ", + "ôm", + "đóng gói fudge", + "địa ngục", + "có", + "sừng", + "giật", + "kock", + "môi âm", + "ham muốn", + "khổ dâm", + "thủ dâm", + "mẹ kiếp", + "nazi", + "người da đen", + "người mách nước", + "cực khoái", + "người mổ", + "dương vật", + "đi tiểu", + "bực mình", + "đái", + "phân", + "khiêu dâm", + "nội dung khiêu dâm", + "châm", + "chích", + "pube", + "pussies", + "âm hộ", + "hiếp dâm", + "trực tràng", + "chậm phát triển", + "xé", + "người tàn bạo", + "vặn vít", + "bìu", + "tinh dịch", + "tình dục", + "lông", + "xáo trộn", + "đồng tính", + "cứt", + "shite", + "ván trượt", + "đĩ", + "quần lót", + "smegma", + "xì trum", + "con trai", + "spac", + "spunk", + "tinh hoàn", + "ăn miếng trả miếng", + "titt", + "cỏ", + "âm đạo", + "viagra", + "âm môn", + "wang", + "đã ngủ", + "con điếm", + "x đánh giá", + "xxx", + ], + "zh": english_badwords + + [ + "13.", + "13点", + "㞗", + "三级片", + "下三烂", + "下贱", + "个老子的", + "九游", + "乳", + "乳交", + "乳头", + "乳房", + "乳波臀浪", + "交配", + "仆街", + "仆街", + "他奶奶", + "他奶奶的", + "他奶娘的", + "他妈", + "他妈ㄉ王八蛋", + "他妈地", + "他妈的", + "他娘", + "他马的", + "你个傻比", + "你他马的", + "你全家", + "你奶奶的", + "你她马的", + "你妈", + "你妈的", + "你娘", + "你娘卡好", + "你娘咧", + "你它妈的", + "你它马的", + "你是鸡", + "你是鸭", + "你老味", + "你老母", + "你老闆", + "你马的", + "做爱", + "傻比", + "傻逼", + "册那", + "冚家拎", + "冚家鏟", + "军妓", + "几八", + "几叭", + "几巴", + "几芭", + "刚度", + "刚瘪三", + "包皮", + "十三点", + "卖B", + "卖比", + "卖淫", + "卵", + "卵子", + "双峰微颤", + "口交", + "口肯", + "叫床", + "吃屎", + "后庭", + "吹箫", + "咸家伶", + "咸家鏟", + "塞你公", + "塞你娘", + "塞你母", + "塞你爸", + "塞你老师", + "塞你老母", + "处女", + "外阴", + "大卵子", + "大卵泡", + "大鸡巴", + "奶", + "奶奶的熊", + "奶子", + "奸", + "奸你", + "她妈地", + "她妈的", + "她马的", + "妈B", + "妈个B", + "妈个比", + "妈个老比", + "妈妈的", + "妈比", + "妈的", + "妈的B", + "妈逼", + "妓", + "妓女", + "妓院", + "妳她妈的", + "妳妈的", + "妳娘的", + "妳老母的", + "妳马的", + "姘头", + "姣西", + "姦", + "娘个比", + "娘的", + "婊子", + "婊子养的", + "嫖娼", + "嫖客", + "它妈地", + "它妈的", + "密洞", + "射你", + "射精", + "小乳头", + "小卵子", + "小卵泡", + "小瘪三", + "小肉粒", + "小骚比", + "小骚货", + "小鸡巴", + "小鸡鸡", + "尻", + "屁眼", + "屁股", + "屄", + "屌", + "屎忽", + "巨乳", + "干x娘", + "干七八", + "干你", + "干你妈", + "干你娘", + "干你老母", + "干你良", + "干妳妈", + "干妳娘", + "干妳老母", + "干妳马", + "干您娘", + "干机掰", + "干死CS", + "干死GM", + "干死你", + "干死客服", + "幹", + "强奸", + "强奸你", + "性", + "性交", + "性器", + "性无能", + "性爱", + "情色", + "想上你", + "懆您妈", + "懆您娘", + "懒8", + "懒八", + "懒叫", + "懒教", + "成人", + "我操你祖宗十八代", + "扒光", + "打炮", + "打飞机", + "抽插", + "招妓", + "插你", + "插死你", + "撒尿", + "撚", + "操你", + "操你全家", + "操你奶奶", + "操你妈", + "操你娘", + "操你祖宗", + "操你老妈", + "操你老母", + "操妳", + "操妳全家", + "操妳妈", + "操妳娘", + "操妳祖宗", + "操机掰", + "操比", + "操逼", + "放荡", + "日他娘", + "日你", + "日你妈", + "日你老娘", + "日你老母", + "日批", + "月经", + "机八", + "机巴", + "机机歪歪", + "杂种", + "柒", + "浪叫", + "淫", + "淫乱", + "淫妇", + "淫棍", + "淫水", + "淫秽", + "淫荡", + "淫西", + "湿透的内裤", + "激情", + "灨你娘", + "烂货", + "烂逼", + "爛", + "狗屁", + "狗日", + "狗狼养的", + "玉杵", + "王八蛋", + "瓜娃子", + "瓜婆娘", + "瓜批", + "瘪三", + "白烂", + "白痴", + "白癡", + "硬膠", + "祖宗", + "私服", + "笨實", + "笨蛋", + "粉腸", + "精子", + "老二", + "老味", + "老母", + "老瘪三", + "老骚比", + "老骚货", + "肉壁", + "肉棍子", + "肉棒", + "肉缝", + "肏", + "肛交", + "肥西", + "色情", + "花柳", + "荡妇", + "賤", + "贝肉", + "贱B", + "贱人", + "贱货", + "贼你妈", + "赛你老母", + "赛妳阿母", + "赣您娘", + "躝癱", + "轮奸", + "迷药", + "逼", + "逼样", + "野鸡", + "閪", + "阳具", + "阳萎", + "阴唇", + "阴户", + "阴核", + "阴毛", + "阴茎", + "阴道", + "阴部", + "陰莖", + "雞巴", + "靠北", + "靠母", + "靠爸", + "靠背", + "靠腰", + "驶你公", + "驶你娘", + "驶你母", + "驶你爸", + "驶你老师", + "驶你老母", + "骚比", + "骚货", + "骚逼", + "鬼公", + "鳩", + "鸡8", + "鸡八", + "鸡叭", + "鸡吧", + "鸡奸", + "鸡巴", + "鸡芭", + "鸡鸡", + "龟儿子", + "龟头", + ], +} diff --git a/en.arpa.bin b/en.arpa.bin new file mode 100644 index 0000000000000000000000000000000000000000..41880f89fbf5b3d8e64fb2ab5d3e70753ca3c1ed --- /dev/null +++ b/en.arpa.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e90c9b25af01dcaa2667ed45d012d891269760fc6eccfe8dbbd161eb20e01d7d +size 4403509656 diff --git a/en.sp.model b/en.sp.model new file mode 100644 index 0000000000000000000000000000000000000000..937daf7e94e4808d7babd5739bb0d048474a9c5e --- /dev/null +++ b/en.sp.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:262c0b0bd4ebc592e439453bc7e006d0ed12d1914e206a1fb8c7fba091f52c4d +size 1389058 diff --git a/filtering.py b/filtering.py new file mode 100644 index 0000000000000000000000000000000000000000..c75664e1ca452d10d2c42b7a0db5a4f2455b83b9 --- /dev/null +++ b/filtering.py @@ -0,0 +1,879 @@ +import re + +import numpy as np + +import fasttext + +import sentencepiece +import kenlm + +import pathlib + +from languages_id import langs_id +from parameters_filtering import parameters_filtering +from normalization import normalization +from stopwords import stopwords +from badwords import badwords + + +class LoadParameters: + @staticmethod + def load_parameters(lang_dataset_id): + if lang_dataset_id in parameters_filtering: + param = parameters_filtering[lang_dataset_id] + else: + param = parameters_filtering["default"] + return param + + @staticmethod + def load_stopwords(lang_dataset_id): + stopwords_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "stopwords_id" + ].iloc[0] + if stopwords_lang_id: + stopwords_lang = set(stopwords[stopwords_lang_id]) + else: + stopwords_lang = None + return stopwords_lang + + @staticmethod + def load_badwords(lang_dataset_id): + badwords_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "badwords_id" + ].iloc[0] + if badwords_lang_id: + badwords_lang = set(badwords[badwords_lang_id]) + else: + badwords_lang = None + return badwords_lang + + @staticmethod + def load_model_lang_id(lang_dataset_id, path_fasttext_model): + fasttext_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "fasttext_id" + ].iloc[0] + if fasttext_lang_id: + model_lang_id = fasttext.load_model(path_fasttext_model) + else: + model_lang_id = None + return model_lang_id + + @staticmethod + def load_sentencepiece_model(lang_dataset_id, path_sentencepiece_model): + sentencepiece_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "sentencepiece_id" + ].iloc[0] + if sentencepiece_lang_id: + sentencepiece_model = sentencepiece.SentencePieceProcessor() + sentencepiece_model.load(path_sentencepiece_model) + else: + sentencepiece_model = None + return sentencepiece_model + + @staticmethod + def load_kenlm_model(lang_dataset_id, path_kenlm_model): + kenlm_lang_id = langs_id.loc[ + langs_id["dataset_id"] == lang_dataset_id, "kenlm_id" + ].iloc[0] + if kenlm_lang_id: + kenlm_model = kenlm.Model(path_kenlm_model) + else: + kenlm_model = None + return kenlm_model + + +class ModifyingDocuments: + @staticmethod + def remove_empty_el_from_list(list_): + return [el for el in list_ if el] + + @staticmethod + def remove_non_printing_characters(document, non_printing_characters_re): + return non_printing_characters_re.sub("", document) + + @staticmethod + def uniform_whitespace( + document, + whitespace=[ + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + "", + "„", + ], + ): + """There are different whitespace characters.""" + whitespace = set(whitespace) + document = "".join( + [char if char not in whitespace else " " for char in document] + ) + return document + + @staticmethod + def replace_digits_with_zeros(document, digits_re): + return digits_re.sub("0", document) + + @staticmethod + def replace_unicode_punctuation(document, unicode_punctuation): + return "".join(unicode_punctuation.get(c, c) for c in document) + + @staticmethod + def normalization( + document, + remove_non_printing_characters, + strip, + lower_case, + uniform_whitespace, + replace_digits_with_zeros, + replace_unicode_punctuation, + non_printing_characters_re=normalization["non_printing_characters_re"], + digits_re=normalization["digits_re"], + unicode_punctuation=normalization["unicode_punctuation"], + ): + if remove_non_printing_characters: + document = ModifyingDocuments.remove_non_printing_characters( + document, non_printing_characters_re + ) + if strip: + document = document.strip() + if not document: + return document + if lower_case: + document = document.lower() + if uniform_whitespace: + document = ModifyingDocuments.uniform_whitespace(document) + if replace_digits_with_zeros: + document = ModifyingDocuments.replace_digits_with_zeros(document, digits_re) + if replace_unicode_punctuation: + document = ModifyingDocuments.replace_unicode_punctuation( + document, unicode_punctuation + ) + return document + + @staticmethod + def tokenization(document, sentencepiece_model, join_on_whitespace): + document_tokenized = sentencepiece_model.encode_as_pieces(document) + if join_on_whitespace: + document_tokenized = " ".join(document_tokenized) + return document_tokenized + + @staticmethod + def split_on_whitespace( + document, + new_line=False, + tab=False, + ): + """This method also removes concatenated spaces.""" + sep = [" "] + new_line * ["\n"] + tab * ["\t"] + sep = "|".join(sep) + split_document = re.split(sep, document) + split_document = ModifyingDocuments.remove_empty_el_from_list(split_document) + return split_document + + @staticmethod + def strip(document, strip_characters): + """Way faster than document.strip(strip_characters) + since strip_characters is now a set instead of a str, + and it contains a lot of elements (all the emojis).""" + if not document: + return document + beg_ind = 0 + end_ind = len(document) + for i in range(len(document)): + if document[i] in strip_characters: + beg_ind += 1 + else: + break + for i in range(1, len(document) + 1): + if document[-i] in strip_characters: + end_ind -= 1 + else: + break + document_stripped = document[beg_ind:end_ind] + return document_stripped + + @staticmethod + def get_words_from_document( + document, sentencepiece_model_tok, lower_case, strip_characters + ): + """Get words from a document. Non reversible since the document + is split on multiple characters, words are stripped of + special characters and characters are converted to lower case. + Useful to compute ratios, like the stopwords ratio.""" + if sentencepiece_model_tok: + document_normalized = ModifyingDocuments.normalization( + document=document, + remove_non_printing_characters=True, + strip=True, + lower_case=True, + uniform_whitespace=True, + replace_digits_with_zeros=True, + replace_unicode_punctuation=True, + ) + words = ModifyingDocuments.tokenization( + document_normalized, sentencepiece_model_tok, join_on_whitespace=False + ) + else: + words = ModifyingDocuments.split_on_whitespace( + document, new_line=True, tab=True + ) + if lower_case: + words = [word.lower() for word in words] + if strip_characters: + words = [ModifyingDocuments.strip(word, strip_characters) for word in words] + words = ModifyingDocuments.remove_empty_el_from_list(words) + return words + + @staticmethod + def words_augmentation(words, group_size, join_char): + """Augment words, especially for Chinese (without a space between words) + and Vietnamese (with a space between syllables).""" + augmentation = [ + join_char.join(words[i : i + group_size]) + for i in range(len(words) - group_size + 1) + ] + return augmentation + + @staticmethod + def split_on_newline_tab_whitespace(document): + """First split on "\n", then on "\t", then on " ".""" + sentences = document.split("\n") + sentences = [sentence.split("\t") for sentence in sentences] + sentences = [ + [ + ModifyingDocuments.split_on_whitespace(subsentence) + for subsentence in sentence + ] + for sentence in sentences + ] + return sentences + + @staticmethod + def merge_on_whitespace_tab_newline(sentences): + """Invert the method split_on_newline_tab_whitespace. + Removes concatenated separators.""" + sentences = [ + [" ".join(subsentence) for subsentence in sentence if subsentence] + for sentence in sentences + ] + sentences = ["\t".join(sentence) for sentence in sentences if sentence] + if not sentences: + return "" + document = "\n".join(sentences) + return document + + @staticmethod + def should_keep_word_with_incorrect_substrings( + word, strip_characters, incorrect_word_substrings + ): + word = ModifyingDocuments.strip(word, strip_characters) + should_keep = all( + [(i_substr not in word) for i_substr in incorrect_word_substrings] + ) + return should_keep + + @staticmethod + def remove_words_with_incorrect_substrings( + document, + strip_characters, + incorrect_word_substrings, + ): + sentences = ModifyingDocuments.split_on_newline_tab_whitespace(document) + sentences = [ + [ + [ + word + for word in subsentence + if ModifyingDocuments.should_keep_word_with_incorrect_substrings( + word, strip_characters, incorrect_word_substrings + ) + ] + for subsentence in sentence + ] + for sentence in sentences + ] + document = ModifyingDocuments.merge_on_whitespace_tab_newline(sentences) + return document + + @staticmethod + def should_keep_long_word(word, strip_characters, length_word_max_cutoff): + """If the word is too long but it contains only one + special character, it might be a concatenation of one word, + a punctuation, and another word, with no space between them. + In this case, we give the word a pass.""" + if len(word) <= length_word_max_cutoff: + return True + word = ModifyingDocuments.strip(word, strip_characters) + if not word: # The word consisted only of strip characters + return False + if len(word) <= length_word_max_cutoff: + return True + return False + + def remove_long_words( + document, + strip_characters, + length_word_max_cutoff, + ): + sentences = ModifyingDocuments.split_on_newline_tab_whitespace(document) + sentences = [ + [ + [ + word + for word in subsentence + if ModifyingDocuments.should_keep_long_word( + word, + strip_characters, + length_word_max_cutoff, + ) + ] + for subsentence in sentence + ] + for sentence in sentences + ] + document = ModifyingDocuments.merge_on_whitespace_tab_newline(sentences) + return document + + @staticmethod + def modifying_documents( + document, + cond_uniform_whitespace, + cond_replace_unicode_punctuation, + cond_remove_words_with_incorrect_substrings, + strip_characters, + incorrect_word_substrings, + cond_remove_long_words, + length_word_max_cutoff, + ): + document = ModifyingDocuments.normalization( + document=document, + remove_non_printing_characters=False, + strip=True, + lower_case=False, + uniform_whitespace=cond_uniform_whitespace, + replace_digits_with_zeros=False, + replace_unicode_punctuation=cond_replace_unicode_punctuation, + ) + if cond_remove_words_with_incorrect_substrings: + document = ModifyingDocuments.remove_words_with_incorrect_substrings( + document, + strip_characters, + incorrect_word_substrings, + ) + if cond_remove_long_words: + document = ModifyingDocuments.remove_long_words( + document, + strip_characters, + length_word_max_cutoff, + ) + return document + + +class FunctionDatasetModifyingDocuments: + def __init__(self, lang_dataset_id): + self.lang_dataset_id = lang_dataset_id + self.param = LoadParameters.load_parameters(lang_dataset_id) + + def __call__(self, example): + example["text"] = ModifyingDocuments.modifying_documents( + document=example["text"], + cond_uniform_whitespace=self.param["cond_uniform_whitespace"], + cond_replace_unicode_punctuation=self.param[ + "cond_replace_unicode_punctuation" + ], + cond_remove_words_with_incorrect_substrings=self.param[ + "cond_remove_words_with_incorrect_substrings" + ], + strip_characters=self.param["strip_characters"], + incorrect_word_substrings=self.param["incorrect_word_substrings"], + cond_remove_long_words=self.param["cond_remove_long_words"], + length_word_max_cutoff=self.param["length_word_max_cutoff"], + ) + return example + + def __reduce__(self): + return (self.__class__, (self.lang_dataset_id,)) + + +class Filtering: + @staticmethod + def check_number_words( + document, + sentencepiece_model_tok, + strip_characters, + number_words_min_cutoff, + number_words_max_cutoff, + ): + words = ModifyingDocuments.get_words_from_document( + document, + sentencepiece_model_tok, + lower_case=False, + strip_characters=strip_characters, + ) + cond = (len(words) >= number_words_min_cutoff) and ( + len(words) <= number_words_max_cutoff + ) + return cond + + @staticmethod + def compute_repetitions_ratio(document, repetitions_length): + def get_freq_ngrams(document, n): + ngrams = [document[i : i + n] for i in range(len(document) - n + 1)] + freq_ngrams = {} + for ngram in ngrams: + freq_ngrams[ngram] = freq_ngrams.get(ngram, 0) + 1 + return freq_ngrams + + freq_ngrams = get_freq_ngrams(document, repetitions_length) + if len(freq_ngrams) == 0: + return 0 + freq_ngrams = list(freq_ngrams.values()) + freq_ngrams = sorted(freq_ngrams, reverse=True) + num_rep_ngrams = int(np.sqrt(len(freq_ngrams))) + repetitions_ratio = sum(freq_ngrams[:num_rep_ngrams]) / sum(freq_ngrams) + return repetitions_ratio + + @staticmethod + def check_repetitions_removal( + document, + repetitions_length, + repetitions_max_cutoff, + ): + repetitions_ratio = Filtering.compute_repetitions_ratio( + document, repetitions_length + ) + cond = repetitions_ratio <= repetitions_max_cutoff + return cond + + @staticmethod + def compute_special_characters_ratio(document, special_characters): + special_characters_ratio = len( + [char for char in document if char in special_characters] + ) / len(document) + return special_characters_ratio + + @staticmethod + def check_special_characters( + document, + special_characters, + special_characters_max_cutoff, + ): + special_characters_ratio = Filtering.compute_special_characters_ratio( + document, special_characters + ) + cond = special_characters_ratio <= special_characters_max_cutoff + return cond + + @staticmethod + def compute_stopwords_ratio( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + stopwords, + ): + words = ModifyingDocuments.get_words_from_document( + document, + sentencepiece_model_tok, + lower_case=True, + strip_characters=strip_characters, + ) + if not words: + return 0 + augmentation = [] + if cond_words_augmentation: + augmentation = [ + ModifyingDocuments.words_augmentation( + words, group_size, words_augmentation_join_char + ) + for group_size in words_augmentation_group_sizes + ] + augmentation = [word for augm in augmentation for word in augm] + stopwords_ratio = len( + [word for word in words + augmentation if word in stopwords] + ) / len(words) + if stopwords_ratio > 1.0: + stopwords_ratio = 1.0 + return stopwords_ratio + + @staticmethod + def check_stopwords( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + stopwords, + stopwords_min_cutoff, + ): + cond = True + if stopwords: + stopwords_ratio = Filtering.compute_stopwords_ratio( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + stopwords, + ) + cond = stopwords_ratio >= stopwords_min_cutoff + return cond + + @staticmethod + def compute_badwords_ratio( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + badwords, + ): + words = ModifyingDocuments.get_words_from_document( + document, + sentencepiece_model_tok, + lower_case=True, + strip_characters=strip_characters, + ) + if not words: + return 0 + augmentation = [] + if cond_words_augmentation: + augmentation = [ + ModifyingDocuments.words_augmentation( + words, group_size, words_augmentation_join_char + ) + for group_size in words_augmentation_group_sizes + ] + augmentation = [word for augm in augmentation for word in augm] + badwords_ratio = len( + [word for word in words + augmentation if word in badwords] + ) / len(words) + if badwords_ratio > 1.0: + badwords_ratio = 1.0 + for word in augmentation: + if word in badwords: + print(word) + return badwords_ratio + + @staticmethod + def check_badwords( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + badwords, + badwords_max_cutoff, + ): + cond = True + if badwords: + badwords_ratio = Filtering.compute_badwords_ratio( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + badwords, + ) + cond = badwords_ratio <= badwords_max_cutoff + return cond + + @staticmethod + def compute_lang_id_pred_score(document, model_lang_id): + document = document.lower().replace("\n", " ") + pred = model_lang_id.predict(document) + lang_pred_fasttext_id = pred[0][0].replace("__label__", "") + score_pred = pred[1][0] + lang_pred_dataset_id = langs_id.loc[ + langs_id["fasttext_id"] == lang_pred_fasttext_id, "dataset_id" + ] + if len(lang_pred_dataset_id) > 0: + lang_pred_dataset_id = lang_pred_dataset_id.iloc[0] + else: + lang_pred_dataset_id = "unknown" + return lang_pred_dataset_id, score_pred + + @staticmethod + def check_lang_id( + document, + lang_dataset_id, + model_lang_id, + lang_id_min_cutoff, + ): + cond = True + if model_lang_id: + lang_pred_dataset_id, score_pred = Filtering.compute_lang_id_pred_score( + document, model_lang_id + ) + cond = (lang_pred_dataset_id == lang_dataset_id) and ( + score_pred >= lang_id_min_cutoff + ) + return cond + + @staticmethod + def compute_perplexity_score(document, sentencepiece_model, kenlm_model): + document = ModifyingDocuments.normalization( + document=document, + remove_non_printing_characters=True, + strip=True, + lower_case=True, + uniform_whitespace=True, + replace_digits_with_zeros=True, + replace_unicode_punctuation=True, + ) + document = ModifyingDocuments.tokenization( + document, sentencepiece_model, join_on_whitespace=True + ) + doc_log_score, doc_length = 0, 0 + for line in document.split("\n"): + log_score = kenlm_model.score(line) + length = len(line.split()) + 1 + doc_log_score += log_score + doc_length += length + pp_score = 10.0 ** (-doc_log_score / doc_length) + pp_score = round(pp_score, 1) + return pp_score + + @staticmethod + def check_perplexity( + document, + sentencepiece_model, + kenlm_model, + perplexity_max_cutoff, + ): + cond = True + if kenlm_model: + score = Filtering.compute_perplexity_score( + document, sentencepiece_model, kenlm_model + ) + cond = score <= perplexity_max_cutoff + return cond + + @staticmethod + def filtering( + document, + cond_check_number_words, + sentencepiece_model_tok, + strip_characters, + number_words_min_cutoff, + number_words_max_cutoff, + cond_check_repetitions_removal, + repetitions_length, + repetitions_max_cutoff, + cond_check_special_characters, + special_characters, + special_characters_max_cutoff, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + cond_check_stopwords, + stopwords, + stopwords_min_cutoff, + cond_check_badwords, + badwords, + badwords_max_cutoff, + cond_check_lang_id, + lang_dataset_id, + model_lang_id, + lang_id_min_cutoff, + cond_check_perplexity, + sentencepiece_model, + kenlm_model, + perplexity_max_cutoff, + ): + if cond_check_number_words: + if not Filtering.check_number_words( + document, + sentencepiece_model_tok, + strip_characters, + number_words_min_cutoff, + number_words_max_cutoff, + ): + return False + if cond_check_repetitions_removal: + if not Filtering.check_repetitions_removal( + document, + repetitions_length, + repetitions_max_cutoff, + ): + return False + if cond_check_special_characters: + if not Filtering.check_special_characters( + document, + special_characters, + special_characters_max_cutoff, + ): + return False + if cond_check_stopwords: + if not Filtering.check_stopwords( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + stopwords, + stopwords_min_cutoff, + ): + return False + if cond_check_badwords: + if not Filtering.check_badwords( + document, + sentencepiece_model_tok, + strip_characters, + cond_words_augmentation, + words_augmentation_group_sizes, + words_augmentation_join_char, + badwords, + badwords_max_cutoff, + ): + return False + if cond_check_lang_id: + if not Filtering.check_lang_id( + document, + lang_dataset_id, + model_lang_id, + lang_id_min_cutoff, + ): + return False + if cond_check_perplexity: + if not Filtering.check_perplexity( + document, + sentencepiece_model, + kenlm_model, + perplexity_max_cutoff, + ): + return False + return True + + +class FunctionDatasetFiltering: + def __init__( + self, + lang_dataset_id, + path_fasttext_model, + path_sentencepiece_model, + path_kenlm_model, + ): + self.lang_dataset_id = lang_dataset_id + self.path_fasttext_model = path_fasttext_model + self.path_sentencepiece_model = path_sentencepiece_model + self.path_kenlm_model = path_kenlm_model + + self.param = LoadParameters.load_parameters(lang_dataset_id) + self.stopwords = LoadParameters.load_stopwords(lang_dataset_id) + self.badwords = LoadParameters.load_badwords(lang_dataset_id) + self.model_lang_id = LoadParameters.load_model_lang_id( + lang_dataset_id, path_fasttext_model + ) + self.sentencepiece_model = LoadParameters.load_sentencepiece_model( + lang_dataset_id, path_sentencepiece_model + ) + self.sentencepiece_model_tok = ( + self.sentencepiece_model if self.param["tokenization"] else None + ) + self.kenlm_model = LoadParameters.load_kenlm_model( + lang_dataset_id, path_kenlm_model + ) + + def __call__(self, example): + keep_example = Filtering.filtering( + document=example["text"], + cond_check_number_words=self.param["cond_check_number_words"], + sentencepiece_model_tok=self.sentencepiece_model_tok, + strip_characters=self.param["strip_characters"], + number_words_min_cutoff=self.param["number_words_min_cutoff"], + number_words_max_cutoff=self.param["number_words_max_cutoff"], + cond_check_repetitions_removal=self.param["check_repetitions_removal"], + repetitions_length=self.param["repetitions_length"], + repetitions_max_cutoff=self.param["repetitions_max_cutoff"], + cond_check_special_characters=self.param["cond_check_special_characters"], + special_characters=self.param["special_characters"], + special_characters_max_cutoff=self.param["special_characters_max_cutoff"], + cond_words_augmentation=self.param["cond_words_augmentation"], + words_augmentation_group_sizes=self.param["words_augmentation_group_sizes"], + words_augmentation_join_char=self.param["words_augmentation_join_char"], + cond_check_stopwords=self.param["cond_check_stopwords"], + stopwords=self.stopwords, + stopwords_min_cutoff=self.param["stopwords_min_cutoff"], + cond_check_badwords=self.param["cond_check_badwords"], + badwords=self.badwords, + badwords_max_cutoff=self.param["badwords_max_cutoff"], + cond_check_lang_id=self.param["cond_check_lang_id"], + lang_dataset_id=self.lang_dataset_id, + model_lang_id=self.model_lang_id, + lang_id_min_cutoff=self.param["lang_id_min_cutoff"], + cond_check_perplexity=self.param["cond_check_perplexity"], + sentencepiece_model=self.sentencepiece_model, + kenlm_model=self.kenlm_model, + perplexity_max_cutoff=self.param["perplexity_max_cutoff"], + ) + return keep_example + + def __reduce__(self): + return ( + self.__class__, + ( + self.lang_dataset_id, + self.path_fasttext_model, + self.path_sentencepiece_model, + self.path_kenlm_model, + ), + ) + + +class DatasetFiltering: + def __init__( + self, + dataset, + lang_dataset_id, + path_fasttext_model, + path_sentencepiece_model, + path_kenlm_model, + num_proc, + path_dir_save_dataset, + ): + self.ds = dataset + self.lang_dataset_id = lang_dataset_id + self.path_fasttext_model = path_fasttext_model + self.path_sentencepiece_model = path_sentencepiece_model + self.path_kenlm_model = path_kenlm_model + self.num_proc = num_proc + self.path_dir_save_dataset = path_dir_save_dataset + + def modifying_documents(self): + dataset_modifying_documents = FunctionDatasetModifyingDocuments( + self.lang_dataset_id + ) + self.ds = self.ds.map(dataset_modifying_documents, num_proc=self.num_proc) + + def filtering(self): + func_dataset_filtering = FunctionDatasetFiltering( + self.lang_dataset_id, + self.path_fasttext_model, + self.path_sentencepiece_model, + self.path_kenlm_model, + ) + self.ds = self.ds.filter(func_dataset_filtering, num_proc=self.num_proc) + + def save_dataset(self): + pathlib.Path(self.path_dir_save_dataset).mkdir(parents=True, exist_ok=True) + path_dir_save_dataset = pathlib.PurePath( + self.path_dir_save_dataset, self.lang_dataset_id + ) + pathlib.Path(path_dir_save_dataset).mkdir(parents=True, exist_ok=True) + self.ds.save_to_disk(path_dir_save_dataset) diff --git a/languages_id.py b/languages_id.py new file mode 100644 index 0000000000000000000000000000000000000000..5b7747e93374668bccdd282b736cee321735b9f5 --- /dev/null +++ b/languages_id.py @@ -0,0 +1,231 @@ +import pandas as pd + + +langs_id = [ + { + "lang": "Afrikaans", + "dataset_id": "af", + "stopwords_id": "af", + "badwords_id": None, + "fasttext_id": "af", + "sentencepiece_id": "af", + "kenlm_id": "af", + }, + { + "lang": "Arabic", + "dataset_id": "ar", + "stopwords_id": "ar", + "badwords_id": "ar", + "fasttext_id": "ar", + "sentencepiece_id": "ar", + "kenlm_id": "ar", + }, + { + "lang": "Egyptian Arabic", + "dataset_id": "arz", + "stopwords_id": None, + "badwords_id": None, + "fasttext_id": "arz", + "sentencepiece_id": None, + "kenlm_id": None, + }, + { + "lang": "Assamese", + "dataset_id": "as", + "stopwords_id": None, + "badwords_id": None, + "fasttext_id": "as", + "sentencepiece_id": None, + "kenlm_id": None, + }, + { + "lang": "Bengali", + "dataset_id": "bn", + "stopwords_id": "bn", + "badwords_id": None, + "fasttext_id": "bn", + "sentencepiece_id": "bn", + "kenlm_id": "bn", + }, + { + "lang": "Catalan", + "dataset_id": "ca", + "stopwords_id": "ca", + "badwords_id": "ca", + "fasttext_id": "ca", + "sentencepiece_id": "ca", + "kenlm_id": "ca", + }, + { + "lang": "English", + "dataset_id": "en", + "stopwords_id": "en", + "badwords_id": "en", + "fasttext_id": "en", + "sentencepiece_id": "en", + "kenlm_id": "en", + }, + { + "lang": "Spanish", + "dataset_id": "es", + "stopwords_id": "es", + "badwords_id": "es", + "fasttext_id": "es", + "sentencepiece_id": "es", + "kenlm_id": "es", + }, + { + "lang": "Basque", + "dataset_id": "eu", + "stopwords_id": "eu", + "badwords_id": "eu", + "fasttext_id": "eu", + "sentencepiece_id": None, + "kenlm_id": None, + }, + { + "lang": "French", + "dataset_id": "fr", + "stopwords_id": "fr", + "badwords_id": "fr", + "fasttext_id": "fr", + "sentencepiece_id": "fr", + "kenlm_id": "fr", + }, + { + "lang": "Gujarati", + "dataset_id": "gu", + "stopwords_id": None, + "badwords_id": None, + "fasttext_id": "gu", + "sentencepiece_id": "gu", + "kenlm_id": "gu", + }, + { + "lang": "Hindi", + "dataset_id": "hi", + "stopwords_id": "hi", + "badwords_id": "hi", + "fasttext_id": "hi", + "sentencepiece_id": "hi", + "kenlm_id": "hi", + }, + { + "lang": "Indonesian", + "dataset_id": "id", + "stopwords_id": "id", + "badwords_id": "id", + "fasttext_id": "id", + "sentencepiece_id": "id", + "kenlm_id": "id", + }, + { + "lang": "Kannada", + "dataset_id": "kn", + "stopwords_id": None, + "badwords_id": "kn", + "fasttext_id": "kn", + "sentencepiece_id": "kn", + "kenlm_id": "kn", + }, + { + "lang": "Malayalam", + "dataset_id": "ml", + "stopwords_id": None, + "badwords_id": "ml", + "fasttext_id": "ml", + "sentencepiece_id": "ml", + "kenlm_id": "ml", + }, + { + "lang": "Marathi", + "dataset_id": "mr", + "stopwords_id": "mr", + "badwords_id": "mr", + "fasttext_id": "mr", + "sentencepiece_id": "mr", + "kenlm_id": "mr", + }, + { + "lang": "Portuguese", + "dataset_id": "pt", + "stopwords_id": "pt", + "badwords_id": "pt", + "fasttext_id": "pt", + "sentencepiece_id": "pt", + "kenlm_id": "pt", + }, + { + "lang": "Somali", + "dataset_id": "so", + "stopwords_id": "so", + "badwords_id": None, + "fasttext_id": "so", + "sentencepiece_id": None, + "kenlm_id": None, + }, + { + "lang": "Swahili", + "dataset_id": "sw", + "stopwords_id": "sw", + "badwords_id": None, + "fasttext_id": "sw", + "sentencepiece_id": None, + "kenlm_id": None, + }, + { + "lang": "Tamil", + "dataset_id": "ta", + "stopwords_id": None, + "badwords_id": None, + "fasttext_id": "ta", + "sentencepiece_id": None, + "kenlm_id": None, + }, + { + "lang": "Telugu", + "dataset_id": "te", + "stopwords_id": None, + "badwords_id": "te", + "fasttext_id": "te", + "sentencepiece_id": None, + "kenlm_id": None, + }, + { + "lang": "Urdu", + "dataset_id": "ur", + "stopwords_id": "ur", + "badwords_id": None, + "fasttext_id": "ur", + "sentencepiece_id": None, + "kenlm_id": None, + }, + { + "lang": "Vietnamese", + "dataset_id": "vi", + "stopwords_id": "vi", + "badwords_id": "vi", + "fasttext_id": "vi", + "sentencepiece_id": None, + "kenlm_id": None, + }, + { + "lang": "Yoruba", + "dataset_id": "yo", + "stopwords_id": "yo", + "badwords_id": None, + "fasttext_id": "yo", + "sentencepiece_id": None, + "kenlm_id": None, + }, + { + "lang": "Chinese", + "dataset_id": "zh", + "stopwords_id": "zh", + "badwords_id": "zh", + "fasttext_id": "zh", + "sentencepiece_id": "zh", + "kenlm_id": "zh", + }, +] +langs_id = pd.DataFrame(langs_id) diff --git a/lid.176.bin b/lid.176.bin new file mode 100644 index 0000000000000000000000000000000000000000..f8707035ea3cc86ac248a4e31fa6368cd845476a --- /dev/null +++ b/lid.176.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e +size 131266198 diff --git a/normalization.py b/normalization.py new file mode 100644 index 0000000000000000000000000000000000000000..652e810fb5019c5177f6fd0abf9635f322f23927 --- /dev/null +++ b/normalization.py @@ -0,0 +1,52 @@ +import re +from typing import Dict + + +non_printing_characters_re = re.compile( + f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]" +) + +digits_re: re.Pattern = re.compile(r"\d") + +unicode_punctuation: Dict[str, str] = { + ",": ",", + "。": ".", + "、": ",", + "„": '"', + "”": '"', + "“": '"', + "«": '"', + "»": '"', + "1": '"', + "」": '"', + "「": '"', + "《": '"', + "》": '"', + "´": "'", + "∶": ":", + ":": ":", + "?": "?", + "!": "!", + "(": "(", + ")": ")", + ";": ";", + "–": "-", + "—": " - ", + ".": ". ", + "~": "~", + "’": "'", + "…": "...", + "━": "-", + "〈": "<", + "〉": ">", + "【": "[", + "】": "]", + "%": "%", + "►": "-", +} + +normalization = { + "non_printing_characters_re": non_printing_characters_re, + "digits_re": digits_re, + "unicode_punctuation": unicode_punctuation, +} diff --git a/requirements.txt b/packages.txt similarity index 100% rename from requirements.txt rename to packages.txt diff --git a/parameters_filtering.py b/parameters_filtering.py new file mode 100644 index 0000000000000000000000000000000000000000..0ac5f1adaae1bf1ffeb4639186551a0181cb4410 --- /dev/null +++ b/parameters_filtering.py @@ -0,0 +1,852 @@ +import string +import emoji + + +main_special_characters = string.punctuation + string.digits + string.whitespace +other_special_characters = ( + "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═" + "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖" + "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚" + "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬x?▷Г♫∟™ª₪®「—" + "❖」﴾》" +) +emoji = list(emoji.UNICODE_EMOJI["en"].keys()) + +special_characters_default = set(main_special_characters + other_special_characters) +special_characters_default.update(emoji) + + +parameters_filtering_default = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": False, + "length_word_max_cutoff": 50, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.4, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": False, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.70, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_af = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.6, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_ar = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.45, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 1000000, +} + +parameters_filtering_arz = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.5, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_as = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_bn = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.275, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.05, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 575000, +} + +parameters_filtering_ca = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.35, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 1750000, +} + +parameters_filtering_en = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": True, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 20, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.4, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.3, + "cond_check_badwords": True, + "badwords_max_cutoff": 0.045, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.80, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 2500, +} + +parameters_filtering_es = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.2, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 2500000, +} + +parameters_filtering_eu = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 35, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_fr = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.35, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.15, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_gu = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 250000, +} + +parameters_filtering_hi = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 25, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.35, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 600000, +} + +parameters_filtering_id = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.25, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 2500000, +} + +parameters_filtering_kn = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 50, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 400000, +} + +parameters_filtering_ml = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 50, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.2, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 1600000, +} + +parameters_filtering_mr = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 425000, +} + +parameters_filtering_pt = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0.15, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": True, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_so = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": False, + "length_word_max_cutoff": 1000, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": False, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_sw = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.275, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_ta = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 50, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_te = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 35, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.25, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_ur = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.4, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_vi = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.35, + "cond_words_augmentation": True, + "words_augmentation_group_sizes": [2, 3], + "words_augmentation_join_char": " ", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_yo = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": True, + "length_word_max_cutoff": 30, + "cond_check_number_words": True, + "tokenization": False, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.3, + "cond_words_augmentation": False, + "words_augmentation_group_sizes": [], + "words_augmentation_join_char": "", + "cond_check_stopwords": True, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering_zh = { + "cond_uniform_whitespace": True, + "cond_replace_unicode_punctuation": False, + "cond_remove_words_with_incorrect_substrings": False, + "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], + "cond_remove_long_words": False, + "length_word_max_cutoff": 1000, + "cond_check_number_words": True, + "tokenization": True, + "strip_characters": special_characters_default, + "number_words_min_cutoff": 1, + "number_words_max_cutoff": 100000, + "check_repetitions_removal": True, + "repetitions_length": 10, + "repetitions_max_cutoff": 0.106, + "cond_check_special_characters": True, + "special_characters": special_characters_default, + "special_characters_max_cutoff": 0.4, + "cond_words_augmentation": True, + "words_augmentation_group_sizes": [2, 3], + "words_augmentation_join_char": "", + "cond_check_stopwords": False, + "stopwords_min_cutoff": 0, + "cond_check_badwords": False, + "badwords_max_cutoff": 0.2, + "cond_check_lang_id": True, + "lang_id_min_cutoff": 0.75, + "cond_check_perplexity": False, + "perplexity_max_cutoff": 3000000, +} + +parameters_filtering = { + "default": parameters_filtering_default, + "af": parameters_filtering_af, + "ar": parameters_filtering_ar, + "arz": parameters_filtering_arz, + "as": parameters_filtering_as, + "bn": parameters_filtering_bn, + "ca": parameters_filtering_ca, + "en": parameters_filtering_en, + "es": parameters_filtering_es, + "eu": parameters_filtering_eu, + "fr": parameters_filtering_fr, + "gu": parameters_filtering_gu, + "hi": parameters_filtering_hi, + "id": parameters_filtering_id, + "kn": parameters_filtering_kn, + "ml": parameters_filtering_ml, + "mr": parameters_filtering_mr, + "pt": parameters_filtering_pt, + "so": parameters_filtering_so, + "sw": parameters_filtering_sw, + "ta": parameters_filtering_ta, + "te": parameters_filtering_te, + "ur": parameters_filtering_ur, + "vi": parameters_filtering_vi, + "yo": parameters_filtering_yo, + "zh": parameters_filtering_zh, +} diff --git a/stopwords.py b/stopwords.py new file mode 100644 index 0000000000000000000000000000000000000000..e75bbd4fcf3860add6204b0c413703f37074d9b7 --- /dev/null +++ b/stopwords.py @@ -0,0 +1,5395 @@ +# From https://github.com/6/stopwords-json +# From https://github.com/stopwords-iso/stopwords-iso for Urdu and Vietnamese + + +stopwords = { + "af": [ + "'n", + "aan", + "af", + "al", + "as", + "baie", + "by", + "daar", + "dag", + "dat", + "die", + "dit", + "een", + "ek", + "en", + "gaan", + "gesê", + "haar", + "het", + "hom", + "hulle", + "hy", + "in", + "is", + "jou", + "jy", + "kan", + "kom", + "ma", + "maar", + "met", + "my", + "na", + "nie", + "om", + "ons", + "op", + "saam", + "sal", + "se", + "sien", + "so", + "sy", + "te", + "toe", + "uit", + "van", + "vir", + "was", + "wat", + "ʼn", + ], + "ar": [ + "،", + "أ", + "ا", + "اثر", + "اجل", + "احد", + "اخرى", + "اذا", + "اربعة", + "اطار", + "اعادة", + "اعلنت", + "اف", + "اكثر", + "اكد", + "الا", + "الاخيرة", + "الان", + "الاول", + "الاولى", + "التى", + "التي", + "الثاني", + "الثانية", + "الذاتي", + "الذى", + "الذي", + "الذين", + "السابق", + "الف", + "الماضي", + "المقبل", + "الوقت", + "الى", + "اليوم", + "اما", + "امام", + "امس", + "ان", + "انه", + "انها", + "او", + "اول", + "اي", + "ايار", + "ايام", + "ايضا", + "ب", + "باسم", + "بان", + "برس", + "بسبب", + "بشكل", + "بعد", + "بعض", + "بن", + "به", + "بها", + "بين", + "تم", + "ثلاثة", + "ثم", + "جميع", + "حاليا", + "حتى", + "حوالى", + "حول", + "حيث", + "حين", + "خلال", + "دون", + "ذلك", + "زيارة", + "سنة", + "سنوات", + "شخصا", + "صباح", + "صفر", + "ضد", + "ضمن", + "عام", + "عاما", + "عدة", + "عدد", + "عدم", + "عشر", + "عشرة", + "على", + "عليه", + "عليها", + "عن", + "عند", + "عندما", + "غدا", + "غير", + "ـ", + "ف", + "فان", + "فى", + "في", + "فيه", + "فيها", + "قال", + "قبل", + "قد", + "قوة", + "كان", + "كانت", + "كل", + "كلم", + "كما", + "لا", + "لدى", + "لقاء", + "لكن", + "للامم", + "لم", + "لن", + "له", + "لها", + "لوكالة", + "ما", + "مايو", + "مساء", + "مع", + "مقابل", + "مليار", + "مليون", + "من", + "منذ", + "منها", + "نحو", + "نفسه", + "نهاية", + "هذا", + "هذه", + "هناك", + "هو", + "هي", + "و", + "و6", + "واحد", + "واضاف", + "واضافت", + "واكد", + "وان", + "واوضح", + "وفي", + "وقال", + "وقالت", + "وقد", + "وقف", + "وكان", + "وكانت", + "ولا", + "ولم", + "ومن", + "وهو", + "وهي", + "يكون", + "يمكن", + "يوم", + ], + "bn": [ + "অনেক", + "অন্য", + "অবশ্য", + "আগে", + "আছে", + "আজ", + "আবার", + "আমরা", + "আমাদের", + "আর", + "ই", + "উত্তর", + "উপর", + "উপরে", + "এ", + "এই", + "এক্", + "এখন", + "এত", + "এব", + "এমন", + "এমনি", + "এর", + "এস", + "এসে", + "ও", + "ওই", + "কমনে", + "করা", + "করে", + "কাছে", + "কাজ", + "কাজে", + "কারণ", + "কি", + "কিছু", + "কে", + "কেউ", + "কেখা", + "কেন", + "কোটি", + "কোনো", + "কয়েক", + "খুব", + "গিয়ে", + "গেল", + "চার", + "চালু", + "চেষ্টা", + "ছিল", + "জানা", + "জ্নজন", + "টি", + "তখন", + "তবে", + "তা", + "তাই", + "তো", + "থাকা", + "থেকে", + "দিন", + "দু", + "দুই", + "দেওয়া", + "ধামার", + "নতুন", + "না", + "নাগাদ", + "নিয়ে", + "নেওয়া", + "নয়", + "পর", + "পরে", + "পাচ", + "পি", + "পেয়্র্", + "প্রতি", + "প্রথম", + "প্রযন্ত", + "প্রাথমিক", + "প্রায়", + "বক্তব্য", + "বন", + "বলা", + "বলে", + "বলেন", + "বহু", + "বা", + "বি", + "বিভিন্ন", + "বেশ", + "বেশি", + "মতো", + "মধ্যে", + "মনে", + "যখন", + "যদি", + "যা", + "যাওয়া", + "যে", + "র", + "রকম", + "লক্ষ", + "শুধু", + "শুরু", + "সঙ্গে", + "সব", + "সহ", + "সাধারণ", + "সামনে", + "সি", + "সে", + "সেই", + "হতে", + "হাজার", + "হয়", + ], + "ca": [ + "a", + "abans", + "ací", + "ah", + "així", + "això", + "al", + "aleshores", + "algun", + "alguna", + "algunes", + "alguns", + "alhora", + "allà", + "allí", + "allò", + "als", + "altra", + "altre", + "altres", + "amb", + "ambdues", + "ambdós", + "apa", + "aquell", + "aquella", + "aquelles", + "aquells", + "aquest", + "aquesta", + "aquestes", + "aquests", + "aquí", + "baix", + "cada", + "cadascuna", + "cadascunes", + "cadascuns", + "cadascú", + "com", + "contra", + "d'un", + "d'una", + "d'unes", + "d'uns", + "dalt", + "de", + "del", + "dels", + "des", + "després", + "dins", + "dintre", + "donat", + "doncs", + "durant", + "e", + "eh", + "el", + "els", + "em", + "en", + "encara", + "ens", + "entre", + "eren", + "es", + "esta", + "estaven", + "esteu", + "està", + "estàvem", + "estàveu", + "et", + "etc", + "ets", + "fins", + "fora", + "gairebé", + "ha", + "han", + "has", + "havia", + "he", + "hem", + "heu", + "hi", + "ho", + "i", + "igual", + "iguals", + "ja", + "l'hi", + "la", + "les", + "li", + "li'n", + "llavors", + "m'he", + "ma", + "mal", + "malgrat", + "mateix", + "mateixa", + "mateixes", + "mateixos", + "me", + "mentre", + "meu", + "meus", + "meva", + "meves", + "molt", + "molta", + "moltes", + "molts", + "mon", + "mons", + "més", + "n'he", + "n'hi", + "ne", + "ni", + "no", + "nogensmenys", + "només", + "nosaltres", + "nostra", + "nostre", + "nostres", + "o", + "oh", + "oi", + "on", + "pas", + "pel", + "pels", + "per", + "perquè", + "però", + "poc", + "poca", + "pocs", + "poques", + "potser", + "propi", + "qual", + "quals", + "quan", + "quant", + "que", + "quelcom", + "qui", + "quin", + "quina", + "quines", + "quins", + "què", + "s'ha", + "s'han", + "sa", + "semblant", + "semblants", + "ses", + "seu", + "seus", + "seva", + "seves", + "si", + "sobre", + "sobretot", + "solament", + "sols", + "son", + "sons", + "sota", + "sou", + "sóc", + "són", + "t'ha", + "t'han", + "t'he", + "ta", + "tal", + "també", + "tampoc", + "tan", + "tant", + "tanta", + "tantes", + "teu", + "teus", + "teva", + "teves", + "ton", + "tons", + "tot", + "tota", + "totes", + "tots", + "un", + "una", + "unes", + "uns", + "us", + "va", + "vaig", + "vam", + "van", + "vas", + "veu", + "vosaltres", + "vostra", + "vostre", + "vostres", + "érem", + "éreu", + "és", + ], + "en": [ + "a", + "a's", + "able", + "about", + "above", + "according", + "accordingly", + "across", + "actually", + "after", + "afterwards", + "again", + "against", + "ain't", + "all", + "allow", + "allows", + "almost", + "alone", + "along", + "already", + "also", + "although", + "always", + "am", + "among", + "amongst", + "an", + "and", + "another", + "any", + "anybody", + "anyhow", + "anyone", + "anything", + "anyway", + "anyways", + "anywhere", + "apart", + "appear", + "appreciate", + "appropriate", + "are", + "aren't", + "around", + "as", + "aside", + "ask", + "asking", + "associated", + "at", + "available", + "away", + "awfully", + "b", + "be", + "became", + "because", + "become", + "becomes", + "becoming", + "been", + "before", + "beforehand", + "behind", + "being", + "believe", + "below", + "beside", + "besides", + "best", + "better", + "between", + "beyond", + "both", + "brief", + "but", + "by", + "c", + "c'mon", + "c's", + "came", + "can", + "can't", + "cannot", + "cant", + "cause", + "causes", + "certain", + "certainly", + "changes", + "clearly", + "co", + "com", + "come", + "comes", + "concerning", + "consequently", + "consider", + "considering", + "contain", + "containing", + "contains", + "corresponding", + "could", + "couldn't", + "course", + "currently", + "d", + "definitely", + "described", + "despite", + "did", + "didn't", + "different", + "do", + "does", + "doesn't", + "doing", + "don't", + "done", + "down", + "downwards", + "during", + "e", + "each", + "edu", + "eg", + "eight", + "either", + "else", + "elsewhere", + "enough", + "entirely", + "especially", + "et", + "etc", + "even", + "ever", + "every", + "everybody", + "everyone", + "everything", + "everywhere", + "ex", + "exactly", + "example", + "except", + "f", + "far", + "few", + "fifth", + "first", + "five", + "followed", + "following", + "follows", + "for", + "former", + "formerly", + "forth", + "four", + "from", + "further", + "furthermore", + "g", + "get", + "gets", + "getting", + "given", + "gives", + "go", + "goes", + "going", + "gone", + "got", + "gotten", + "greetings", + "h", + "had", + "hadn't", + "happens", + "hardly", + "has", + "hasn't", + "have", + "haven't", + "having", + "he", + "he's", + "hello", + "help", + "hence", + "her", + "here", + "here's", + "hereafter", + "hereby", + "herein", + "hereupon", + "hers", + "herself", + "hi", + "him", + "himself", + "his", + "hither", + "hopefully", + "how", + "howbeit", + "however", + "i", + "i'd", + "i'll", + "i'm", + "i've", + "ie", + "if", + "ignored", + "immediate", + "in", + "inasmuch", + "inc", + "indeed", + "indicate", + "indicated", + "indicates", + "inner", + "insofar", + "instead", + "into", + "inward", + "is", + "isn't", + "it", + "it'd", + "it'll", + "it's", + "its", + "itself", + "j", + "just", + "k", + "keep", + "keeps", + "kept", + "know", + "known", + "knows", + "l", + "last", + "lately", + "later", + "latter", + "latterly", + "least", + "less", + "lest", + "let", + "let's", + "like", + "liked", + "likely", + "little", + "look", + "looking", + "looks", + "ltd", + "m", + "mainly", + "many", + "may", + "maybe", + "me", + "mean", + "meanwhile", + "merely", + "might", + "more", + "moreover", + "most", + "mostly", + "much", + "must", + "my", + "myself", + "n", + "name", + "namely", + "nd", + "near", + "nearly", + "necessary", + "need", + "needs", + "neither", + "never", + "nevertheless", + "new", + "next", + "nine", + "no", + "nobody", + "non", + "none", + "noone", + "nor", + "normally", + "not", + "nothing", + "novel", + "now", + "nowhere", + "o", + "obviously", + "of", + "off", + "often", + "oh", + "ok", + "okay", + "old", + "on", + "once", + "one", + "ones", + "only", + "onto", + "or", + "other", + "others", + "otherwise", + "ought", + "our", + "ours", + "ourselves", + "out", + "outside", + "over", + "overall", + "own", + "p", + "particular", + "particularly", + "per", + "perhaps", + "placed", + "please", + "plus", + "possible", + "presumably", + "probably", + "provides", + "q", + "que", + "quite", + "qv", + "r", + "rather", + "rd", + "re", + "really", + "reasonably", + "regarding", + "regardless", + "regards", + "relatively", + "respectively", + "right", + "s", + "said", + "same", + "saw", + "say", + "saying", + "says", + "second", + "secondly", + "see", + "seeing", + "seem", + "seemed", + "seeming", + "seems", + "seen", + "self", + "selves", + "sensible", + "sent", + "serious", + "seriously", + "seven", + "several", + "shall", + "she", + "should", + "shouldn't", + "since", + "six", + "so", + "some", + "somebody", + "somehow", + "someone", + "something", + "sometime", + "sometimes", + "somewhat", + "somewhere", + "soon", + "sorry", + "specified", + "specify", + "specifying", + "still", + "sub", + "such", + "sup", + "sure", + "t", + "t's", + "take", + "taken", + "tell", + "tends", + "th", + "than", + "thank", + "thanks", + "thanx", + "that", + "that's", + "thats", + "the", + "their", + "theirs", + "them", + "themselves", + "then", + "thence", + "there", + "there's", + "thereafter", + "thereby", + "therefore", + "therein", + "theres", + "thereupon", + "these", + "they", + "they'd", + "they'll", + "they're", + "they've", + "think", + "third", + "this", + "thorough", + "thoroughly", + "those", + "though", + "three", + "through", + "throughout", + "thru", + "thus", + "to", + "together", + "too", + "took", + "toward", + "towards", + "tried", + "tries", + "truly", + "try", + "trying", + "twice", + "two", + "u", + "un", + "under", + "unfortunately", + "unless", + "unlikely", + "until", + "unto", + "up", + "upon", + "us", + "use", + "used", + "useful", + "uses", + "using", + "usually", + "uucp", + "v", + "value", + "various", + "very", + "via", + "viz", + "vs", + "w", + "want", + "wants", + "was", + "wasn't", + "way", + "we", + "we'd", + "we'll", + "we're", + "we've", + "welcome", + "well", + "went", + "were", + "weren't", + "what", + "what's", + "whatever", + "when", + "whence", + "whenever", + "where", + "where's", + "whereafter", + "whereas", + "whereby", + "wherein", + "whereupon", + "wherever", + "whether", + "which", + "while", + "whither", + "who", + "who's", + "whoever", + "whole", + "whom", + "whose", + "why", + "will", + "willing", + "wish", + "with", + "within", + "without", + "won't", + "wonder", + "would", + "wouldn't", + "x", + "y", + "yes", + "yet", + "you", + "you'd", + "you'll", + "you're", + "you've", + "your", + "yours", + "yourself", + "yourselves", + "z", + "zero", + ], + "es": [ + "a", + "actualmente", + "acuerdo", + "adelante", + "ademas", + "además", + "adrede", + "afirmó", + "agregó", + "ahi", + "ahora", + "ahí", + "al", + "algo", + "alguna", + "algunas", + "alguno", + "algunos", + "algún", + "alli", + "allí", + "alrededor", + "ambos", + "ampleamos", + "antano", + "antaño", + "ante", + "anterior", + "antes", + "apenas", + "aproximadamente", + "aquel", + "aquella", + "aquellas", + "aquello", + "aquellos", + "aqui", + "aquél", + "aquélla", + "aquéllas", + "aquéllos", + "aquí", + "arriba", + "arribaabajo", + "aseguró", + "asi", + "así", + "atras", + "aun", + "aunque", + "ayer", + "añadió", + "aún", + "b", + "bajo", + "bastante", + "bien", + "breve", + "buen", + "buena", + "buenas", + "bueno", + "buenos", + "c", + "cada", + "casi", + "cerca", + "cierta", + "ciertas", + "cierto", + "ciertos", + "cinco", + "claro", + "comentó", + "como", + "con", + "conmigo", + "conocer", + "conseguimos", + "conseguir", + "considera", + "consideró", + "consigo", + "consigue", + "consiguen", + "consigues", + "contigo", + "contra", + "cosas", + "creo", + "cual", + "cuales", + "cualquier", + "cuando", + "cuanta", + "cuantas", + "cuanto", + "cuantos", + "cuatro", + "cuenta", + "cuál", + "cuáles", + "cuándo", + "cuánta", + "cuántas", + "cuánto", + "cuántos", + "cómo", + "d", + "da", + "dado", + "dan", + "dar", + "de", + "debajo", + "debe", + "deben", + "debido", + "decir", + "dejó", + "del", + "delante", + "demasiado", + "demás", + "dentro", + "deprisa", + "desde", + "despacio", + "despues", + "después", + "detras", + "detrás", + "dia", + "dias", + "dice", + "dicen", + "dicho", + "dieron", + "diferente", + "diferentes", + "dijeron", + "dijo", + "dio", + "donde", + "dos", + "durante", + "día", + "días", + "dónde", + "e", + "ejemplo", + "el", + "ella", + "ellas", + "ello", + "ellos", + "embargo", + "empleais", + "emplean", + "emplear", + "empleas", + "empleo", + "en", + "encima", + "encuentra", + "enfrente", + "enseguida", + "entonces", + "entre", + "era", + "eramos", + "eran", + "eras", + "eres", + "es", + "esa", + "esas", + "ese", + "eso", + "esos", + "esta", + "estaba", + "estaban", + "estado", + "estados", + "estais", + "estamos", + "estan", + "estar", + "estará", + "estas", + "este", + "esto", + "estos", + "estoy", + "estuvo", + "está", + "están", + "ex", + "excepto", + "existe", + "existen", + "explicó", + "expresó", + "f", + "fin", + "final", + "fue", + "fuera", + "fueron", + "fui", + "fuimos", + "g", + "general", + "gran", + "grandes", + "gueno", + "h", + "ha", + "haber", + "habia", + "habla", + "hablan", + "habrá", + "había", + "habían", + "hace", + "haceis", + "hacemos", + "hacen", + "hacer", + "hacerlo", + "haces", + "hacia", + "haciendo", + "hago", + "han", + "hasta", + "hay", + "haya", + "he", + "hecho", + "hemos", + "hicieron", + "hizo", + "horas", + "hoy", + "hubo", + "i", + "igual", + "incluso", + "indicó", + "informo", + "informó", + "intenta", + "intentais", + "intentamos", + "intentan", + "intentar", + "intentas", + "intento", + "ir", + "j", + "junto", + "k", + "l", + "la", + "lado", + "largo", + "las", + "le", + "lejos", + "les", + "llegó", + "lleva", + "llevar", + "lo", + "los", + "luego", + "lugar", + "m", + "mal", + "manera", + "manifestó", + "mas", + "mayor", + "me", + "mediante", + "medio", + "mejor", + "mencionó", + "menos", + "menudo", + "mi", + "mia", + "mias", + "mientras", + "mio", + "mios", + "mis", + "misma", + "mismas", + "mismo", + "mismos", + "modo", + "momento", + "mucha", + "muchas", + "mucho", + "muchos", + "muy", + "más", + "mí", + "mía", + "mías", + "mío", + "míos", + "n", + "nada", + "nadie", + "ni", + "ninguna", + "ningunas", + "ninguno", + "ningunos", + "ningún", + "no", + "nos", + "nosotras", + "nosotros", + "nuestra", + "nuestras", + "nuestro", + "nuestros", + "nueva", + "nuevas", + "nuevo", + "nuevos", + "nunca", + "o", + "ocho", + "os", + "otra", + "otras", + "otro", + "otros", + "p", + "pais", + "para", + "parece", + "parte", + "partir", + "pasada", + "pasado", + "paìs", + "peor", + "pero", + "pesar", + "poca", + "pocas", + "poco", + "pocos", + "podeis", + "podemos", + "poder", + "podria", + "podriais", + "podriamos", + "podrian", + "podrias", + "podrá", + "podrán", + "podría", + "podrían", + "poner", + "por", + "porque", + "posible", + "primer", + "primera", + "primero", + "primeros", + "principalmente", + "pronto", + "propia", + "propias", + "propio", + "propios", + "proximo", + "próximo", + "próximos", + "pudo", + "pueda", + "puede", + "pueden", + "puedo", + "pues", + "q", + "qeu", + "que", + "quedó", + "queremos", + "quien", + "quienes", + "quiere", + "quiza", + "quizas", + "quizá", + "quizás", + "quién", + "quiénes", + "qué", + "r", + "raras", + "realizado", + "realizar", + "realizó", + "repente", + "respecto", + "s", + "sabe", + "sabeis", + "sabemos", + "saben", + "saber", + "sabes", + "salvo", + "se", + "sea", + "sean", + "segun", + "segunda", + "segundo", + "según", + "seis", + "ser", + "sera", + "será", + "serán", + "sería", + "señaló", + "si", + "sido", + "siempre", + "siendo", + "siete", + "sigue", + "siguiente", + "sin", + "sino", + "sobre", + "sois", + "sola", + "solamente", + "solas", + "solo", + "solos", + "somos", + "son", + "soy", + "soyos", + "su", + "supuesto", + "sus", + "suya", + "suyas", + "suyo", + "sé", + "sí", + "sólo", + "t", + "tal", + "tambien", + "también", + "tampoco", + "tan", + "tanto", + "tarde", + "te", + "temprano", + "tendrá", + "tendrán", + "teneis", + "tenemos", + "tener", + "tenga", + "tengo", + "tenido", + "tenía", + "tercera", + "ti", + "tiempo", + "tiene", + "tienen", + "toda", + "todas", + "todavia", + "todavía", + "todo", + "todos", + "total", + "trabaja", + "trabajais", + "trabajamos", + "trabajan", + "trabajar", + "trabajas", + "trabajo", + "tras", + "trata", + "través", + "tres", + "tu", + "tus", + "tuvo", + "tuya", + "tuyas", + "tuyo", + "tuyos", + "tú", + "u", + "ultimo", + "un", + "una", + "unas", + "uno", + "unos", + "usa", + "usais", + "usamos", + "usan", + "usar", + "usas", + "uso", + "usted", + "ustedes", + "v", + "va", + "vais", + "valor", + "vamos", + "van", + "varias", + "varios", + "vaya", + "veces", + "ver", + "verdad", + "verdadera", + "verdadero", + "vez", + "vosotras", + "vosotros", + "voy", + "vuestra", + "vuestras", + "vuestro", + "vuestros", + "w", + "x", + "y", + "ya", + "yo", + "z", + "él", + "ésa", + "ésas", + "ése", + "ésos", + "ésta", + "éstas", + "éste", + "éstos", + "última", + "últimas", + "último", + "últimos", + ], + "eu": [ + "al", + "anitz", + "arabera", + "asko", + "baina", + "bat", + "batean", + "batek", + "bati", + "batzuei", + "batzuek", + "batzuetan", + "batzuk", + "bera", + "beraiek", + "berau", + "berauek", + "bere", + "berori", + "beroriek", + "beste", + "bezala", + "da", + "dago", + "dira", + "ditu", + "du", + "dute", + "edo", + "egin", + "ere", + "eta", + "eurak", + "ez", + "gainera", + "gu", + "gutxi", + "guzti", + "haiei", + "haiek", + "haietan", + "hainbeste", + "hala", + "han", + "handik", + "hango", + "hara", + "hari", + "hark", + "hartan", + "hau", + "hauei", + "hauek", + "hauetan", + "hemen", + "hemendik", + "hemengo", + "hi", + "hona", + "honek", + "honela", + "honetan", + "honi", + "hor", + "hori", + "horiei", + "horiek", + "horietan", + "horko", + "horra", + "horrek", + "horrela", + "horretan", + "horri", + "hortik", + "hura", + "izan", + "ni", + "noiz", + "nola", + "non", + "nondik", + "nongo", + "nor", + "nora", + "ze", + "zein", + "zen", + "zenbait", + "zenbat", + "zer", + "zergatik", + "ziren", + "zituen", + "zu", + "zuek", + "zuen", + "zuten", + ], + "fr": [ + "a", + "abord", + "absolument", + "afin", + "ah", + "ai", + "aie", + "ailleurs", + "ainsi", + "ait", + "allaient", + "allo", + "allons", + "allô", + "alors", + "anterieur", + "anterieure", + "anterieures", + "apres", + "après", + "as", + "assez", + "attendu", + "au", + "aucun", + "aucune", + "aujourd", + "aujourd'hui", + "aupres", + "auquel", + "aura", + "auraient", + "aurait", + "auront", + "aussi", + "autre", + "autrefois", + "autrement", + "autres", + "autrui", + "aux", + "auxquelles", + "auxquels", + "avaient", + "avais", + "avait", + "avant", + "avec", + "avoir", + "avons", + "ayant", + "b", + "bah", + "bas", + "basee", + "bat", + "beau", + "beaucoup", + "bien", + "bigre", + "boum", + "bravo", + "brrr", + "c", + "car", + "ce", + "ceci", + "cela", + "celle", + "celle-ci", + "celle-là", + "celles", + "celles-ci", + "celles-là", + "celui", + "celui-ci", + "celui-là", + "cent", + "cependant", + "certain", + "certaine", + "certaines", + "certains", + "certes", + "ces", + "cet", + "cette", + "ceux", + "ceux-ci", + "ceux-là", + "chacun", + "chacune", + "chaque", + "cher", + "chers", + "chez", + "chiche", + "chut", + "chère", + "chères", + "ci", + "cinq", + "cinquantaine", + "cinquante", + "cinquantième", + "cinquième", + "clac", + "clic", + "combien", + "comme", + "comment", + "comparable", + "comparables", + "compris", + "concernant", + "contre", + "couic", + "crac", + "d", + "da", + "dans", + "de", + "debout", + "dedans", + "dehors", + "deja", + "delà", + "depuis", + "dernier", + "derniere", + "derriere", + "derrière", + "des", + "desormais", + "desquelles", + "desquels", + "dessous", + "dessus", + "deux", + "deuxième", + "deuxièmement", + "devant", + "devers", + "devra", + "different", + "differentes", + "differents", + "différent", + "différente", + "différentes", + "différents", + "dire", + "directe", + "directement", + "dit", + "dite", + "dits", + "divers", + "diverse", + "diverses", + "dix", + "dix-huit", + "dix-neuf", + "dix-sept", + "dixième", + "doit", + "doivent", + "donc", + "dont", + "douze", + "douzième", + "dring", + "du", + "duquel", + "durant", + "dès", + "désormais", + "e", + "effet", + "egale", + "egalement", + "egales", + "eh", + "elle", + "elle-même", + "elles", + "elles-mêmes", + "en", + "encore", + "enfin", + "entre", + "envers", + "environ", + "es", + "est", + "et", + "etant", + "etc", + "etre", + "eu", + "euh", + "eux", + "eux-mêmes", + "exactement", + "excepté", + "extenso", + "exterieur", + "f", + "fais", + "faisaient", + "faisant", + "fait", + "façon", + "feront", + "fi", + "flac", + "floc", + "font", + "g", + "gens", + "h", + "ha", + "hein", + "hem", + "hep", + "hi", + "ho", + "holà", + "hop", + "hormis", + "hors", + "hou", + "houp", + "hue", + "hui", + "huit", + "huitième", + "hum", + "hurrah", + "hé", + "hélas", + "i", + "il", + "ils", + "importe", + "j", + "je", + "jusqu", + "jusque", + "juste", + "k", + "l", + "la", + "laisser", + "laquelle", + "las", + "le", + "lequel", + "les", + "lesquelles", + "lesquels", + "leur", + "leurs", + "longtemps", + "lors", + "lorsque", + "lui", + "lui-meme", + "lui-même", + "là", + "lès", + "m", + "ma", + "maint", + "maintenant", + "mais", + "malgre", + "malgré", + "maximale", + "me", + "meme", + "memes", + "merci", + "mes", + "mien", + "mienne", + "miennes", + "miens", + "mille", + "mince", + "minimale", + "moi", + "moi-meme", + "moi-même", + "moindres", + "moins", + "mon", + "moyennant", + "multiple", + "multiples", + "même", + "mêmes", + "n", + "na", + "naturel", + "naturelle", + "naturelles", + "ne", + "neanmoins", + "necessaire", + "necessairement", + "neuf", + "neuvième", + "ni", + "nombreuses", + "nombreux", + "non", + "nos", + "notamment", + "notre", + "nous", + "nous-mêmes", + "nouveau", + "nul", + "néanmoins", + "nôtre", + "nôtres", + "o", + "oh", + "ohé", + "ollé", + "olé", + "on", + "ont", + "onze", + "onzième", + "ore", + "ou", + "ouf", + "ouias", + "oust", + "ouste", + "outre", + "ouvert", + "ouverte", + "ouverts", + "o|", + "où", + "p", + "paf", + "pan", + "par", + "parce", + "parfois", + "parle", + "parlent", + "parler", + "parmi", + "parseme", + "partant", + "particulier", + "particulière", + "particulièrement", + "pas", + "passé", + "pendant", + "pense", + "permet", + "personne", + "peu", + "peut", + "peuvent", + "peux", + "pff", + "pfft", + "pfut", + "pif", + "pire", + "plein", + "plouf", + "plus", + "plusieurs", + "plutôt", + "possessif", + "possessifs", + "possible", + "possibles", + "pouah", + "pour", + "pourquoi", + "pourrais", + "pourrait", + "pouvait", + "prealable", + "precisement", + "premier", + "première", + "premièrement", + "pres", + "probable", + "probante", + "procedant", + "proche", + "près", + "psitt", + "pu", + "puis", + "puisque", + "pur", + "pure", + "q", + "qu", + "quand", + "quant", + "quant-à-soi", + "quanta", + "quarante", + "quatorze", + "quatre", + "quatre-vingt", + "quatrième", + "quatrièmement", + "que", + "quel", + "quelconque", + "quelle", + "quelles", + "quelqu'un", + "quelque", + "quelques", + "quels", + "qui", + "quiconque", + "quinze", + "quoi", + "quoique", + "r", + "rare", + "rarement", + "rares", + "relative", + "relativement", + "remarquable", + "rend", + "rendre", + "restant", + "reste", + "restent", + "restrictif", + "retour", + "revoici", + "revoilà", + "rien", + "s", + "sa", + "sacrebleu", + "sait", + "sans", + "sapristi", + "sauf", + "se", + "sein", + "seize", + "selon", + "semblable", + "semblaient", + "semble", + "semblent", + "sent", + "sept", + "septième", + "sera", + "seraient", + "serait", + "seront", + "ses", + "seul", + "seule", + "seulement", + "si", + "sien", + "sienne", + "siennes", + "siens", + "sinon", + "six", + "sixième", + "soi", + "soi-même", + "soit", + "soixante", + "son", + "sont", + "sous", + "souvent", + "specifique", + "specifiques", + "speculatif", + "stop", + "strictement", + "subtiles", + "suffisant", + "suffisante", + "suffit", + "suis", + "suit", + "suivant", + "suivante", + "suivantes", + "suivants", + "suivre", + "superpose", + "sur", + "surtout", + "t", + "ta", + "tac", + "tant", + "tardive", + "te", + "tel", + "telle", + "tellement", + "telles", + "tels", + "tenant", + "tend", + "tenir", + "tente", + "tes", + "tic", + "tien", + "tienne", + "tiennes", + "tiens", + "toc", + "toi", + "toi-même", + "ton", + "touchant", + "toujours", + "tous", + "tout", + "toute", + "toutefois", + "toutes", + "treize", + "trente", + "tres", + "trois", + "troisième", + "troisièmement", + "trop", + "très", + "tsoin", + "tsouin", + "tu", + "té", + "u", + "un", + "une", + "unes", + "uniformement", + "unique", + "uniques", + "uns", + "v", + "va", + "vais", + "vas", + "vers", + "via", + "vif", + "vifs", + "vingt", + "vivat", + "vive", + "vives", + "vlan", + "voici", + "voilà", + "vont", + "vos", + "votre", + "vous", + "vous-mêmes", + "vu", + "vé", + "vôtre", + "vôtres", + "w", + "x", + "y", + "z", + "zut", + "à", + "â", + "ça", + "ès", + "étaient", + "étais", + "était", + "étant", + "été", + "être", + "ô", + ], + "hi": [ + "अंदर", + "अत", + "अदि", + "अप", + "अपना", + "अपनि", + "अपनी", + "अपने", + "अभि", + "अभी", + "आदि", + "आप", + "इंहिं", + "इंहें", + "इंहों", + "इतयादि", + "इत्यादि", + "इन", + "इनका", + "इन्हीं", + "इन्हें", + "इन्हों", + "इस", + "इसका", + "इसकि", + "इसकी", + "इसके", + "इसमें", + "इसि", + "इसी", + "इसे", + "उंहिं", + "उंहें", + "उंहों", + "उन", + "उनका", + "उनकि", + "उनकी", + "उनके", + "उनको", + "उन्हीं", + "उन्हें", + "उन्हों", + "उस", + "उसके", + "उसि", + "उसी", + "उसे", + "एक", + "एवं", + "एस", + "एसे", + "ऐसे", + "ओर", + "और", + "कइ", + "कई", + "कर", + "करता", + "करते", + "करना", + "करने", + "करें", + "कहते", + "कहा", + "का", + "काफि", + "काफ़ी", + "कि", + "किंहें", + "किंहों", + "कितना", + "किन्हें", + "किन्हों", + "किया", + "किर", + "किस", + "किसि", + "किसी", + "किसे", + "की", + "कुछ", + "कुल", + "के", + "को", + "कोइ", + "कोई", + "कोन", + "कोनसा", + "कौन", + "कौनसा", + "गया", + "घर", + "जब", + "जहाँ", + "जहां", + "जा", + "जिंहें", + "जिंहों", + "जितना", + "जिधर", + "जिन", + "जिन्हें", + "जिन्हों", + "जिस", + "जिसे", + "जीधर", + "जेसा", + "जेसे", + "जैसा", + "जैसे", + "जो", + "तक", + "तब", + "तरह", + "तिंहें", + "तिंहों", + "तिन", + "तिन्हें", + "तिन्हों", + "तिस", + "तिसे", + "तो", + "था", + "थि", + "थी", + "थे", + "दबारा", + "दवारा", + "दिया", + "दुसरा", + "दुसरे", + "दूसरे", + "दो", + "द्वारा", + "न", + "नहिं", + "नहीं", + "ना", + "निचे", + "निहायत", + "नीचे", + "ने", + "पर", + "पहले", + "पुरा", + "पूरा", + "पे", + "फिर", + "बनि", + "बनी", + "बहि", + "बही", + "बहुत", + "बाद", + "बाला", + "बिलकुल", + "भि", + "भितर", + "भी", + "भीतर", + "मगर", + "मानो", + "मे", + "में", + "यदि", + "यह", + "यहाँ", + "यहां", + "यहि", + "यही", + "या", + "यिह", + "ये", + "रखें", + "रवासा", + "रहा", + "रहे", + "ऱ्वासा", + "लिए", + "लिये", + "लेकिन", + "व", + "वगेरह", + "वरग", + "वर्ग", + "वह", + "वहाँ", + "वहां", + "वहिं", + "वहीं", + "वाले", + "वुह", + "वे", + "वग़ैरह", + "संग", + "सकता", + "सकते", + "सबसे", + "सभि", + "सभी", + "साथ", + "साबुत", + "साभ", + "सारा", + "से", + "सो", + "हि", + "ही", + "हुअ", + "हुआ", + "हुइ", + "हुई", + "हुए", + "हे", + "हें", + "है", + "हैं", + "हो", + "होता", + "होति", + "होती", + "होते", + "होना", + "होने", + ], + "id": [ + "ada", + "adalah", + "adanya", + "adapun", + "agak", + "agaknya", + "agar", + "akan", + "akankah", + "akhirnya", + "aku", + "akulah", + "amat", + "amatlah", + "anda", + "andalah", + "antar", + "antara", + "antaranya", + "apa", + "apaan", + "apabila", + "apakah", + "apalagi", + "apatah", + "atau", + "ataukah", + "ataupun", + "bagai", + "bagaikan", + "bagaimana", + "bagaimanakah", + "bagaimanapun", + "bagi", + "bahkan", + "bahwa", + "bahwasanya", + "banyak", + "beberapa", + "begini", + "beginian", + "beginikah", + "beginilah", + "begitu", + "begitukah", + "begitulah", + "begitupun", + "belum", + "belumlah", + "berapa", + "berapakah", + "berapalah", + "berapapun", + "bermacam", + "bersama", + "betulkah", + "biasa", + "biasanya", + "bila", + "bilakah", + "bisa", + "bisakah", + "boleh", + "bolehkah", + "bolehlah", + "buat", + "bukan", + "bukankah", + "bukanlah", + "bukannya", + "cuma", + "dahulu", + "dalam", + "dan", + "dapat", + "dari", + "daripada", + "dekat", + "demi", + "demikian", + "demikianlah", + "dengan", + "depan", + "di", + "dia", + "dialah", + "diantara", + "diantaranya", + "dikarenakan", + "dini", + "diri", + "dirinya", + "disini", + "disinilah", + "dong", + "dulu", + "enggak", + "enggaknya", + "entah", + "entahlah", + "hal", + "hampir", + "hanya", + "hanyalah", + "harus", + "haruslah", + "harusnya", + "hendak", + "hendaklah", + "hendaknya", + "hingga", + "ia", + "ialah", + "ibarat", + "ingin", + "inginkah", + "inginkan", + "ini", + "inikah", + "inilah", + "itu", + "itukah", + "itulah", + "jangan", + "jangankan", + "janganlah", + "jika", + "jikalau", + "juga", + "justru", + "kala", + "kalau", + "kalaulah", + "kalaupun", + "kalian", + "kami", + "kamilah", + "kamu", + "kamulah", + "kan", + "kapan", + "kapankah", + "kapanpun", + "karena", + "karenanya", + "ke", + "kecil", + "kemudian", + "kenapa", + "kepada", + "kepadanya", + "ketika", + "khususnya", + "kini", + "kinilah", + "kiranya", + "kita", + "kitalah", + "kok", + "lagi", + "lagian", + "lah", + "lain", + "lainnya", + "lalu", + "lama", + "lamanya", + "lebih", + "macam", + "maka", + "makanya", + "makin", + "malah", + "malahan", + "mampu", + "mampukah", + "mana", + "manakala", + "manalagi", + "masih", + "masihkah", + "masing", + "mau", + "maupun", + "melainkan", + "melalui", + "memang", + "mengapa", + "mereka", + "merekalah", + "merupakan", + "meski", + "meskipun", + "mungkin", + "mungkinkah", + "nah", + "namun", + "nanti", + "nantinya", + "nyaris", + "oleh", + "olehnya", + "pada", + "padahal", + "padanya", + "paling", + "pantas", + "para", + "pasti", + "pastilah", + "per", + "percuma", + "pernah", + "pula", + "pun", + "rupanya", + "saat", + "saatnya", + "saja", + "sajalah", + "saling", + "sama", + "sambil", + "sampai", + "sana", + "sangat", + "sangatlah", + "saya", + "sayalah", + "se", + "sebab", + "sebabnya", + "sebagai", + "sebagaimana", + "sebagainya", + "sebaliknya", + "sebanyak", + "sebegini", + "sebegitu", + "sebelum", + "sebelumnya", + "sebenarnya", + "seberapa", + "sebetulnya", + "sebisanya", + "sebuah", + "sedang", + "sedangkan", + "sedemikian", + "sedikit", + "sedikitnya", + "segala", + "segalanya", + "segera", + "seharusnya", + "sehingga", + "sejak", + "sejenak", + "sekali", + "sekalian", + "sekaligus", + "sekalipun", + "sekarang", + "seketika", + "sekiranya", + "sekitar", + "sekitarnya", + "sela", + "selagi", + "selain", + "selaku", + "selalu", + "selama", + "selamanya", + "seluruh", + "seluruhnya", + "semacam", + "semakin", + "semasih", + "semaunya", + "sementara", + "sempat", + "semua", + "semuanya", + "semula", + "sendiri", + "sendirinya", + "seolah", + "seorang", + "sepanjang", + "sepantasnya", + "sepantasnyalah", + "seperti", + "sepertinya", + "sering", + "seringnya", + "serta", + "serupa", + "sesaat", + "sesama", + "sesegera", + "sesekali", + "seseorang", + "sesuatu", + "sesuatunya", + "sesudah", + "sesudahnya", + "setelah", + "seterusnya", + "setiap", + "setidaknya", + "sewaktu", + "siapa", + "siapakah", + "siapapun", + "sini", + "sinilah", + "suatu", + "sudah", + "sudahkah", + "sudahlah", + "supaya", + "tadi", + "tadinya", + "tak", + "tanpa", + "tapi", + "telah", + "tentang", + "tentu", + "tentulah", + "tentunya", + "terdiri", + "terhadap", + "terhadapnya", + "terlalu", + "terlebih", + "tersebut", + "tersebutlah", + "tertentu", + "tetapi", + "tiap", + "tidak", + "tidakkah", + "tidaklah", + "toh", + "waduh", + "wah", + "wahai", + "walau", + "walaupun", + "wong", + "yaitu", + "yakni", + "yang", + ], + "mr": [ + "अधिक", + "अनेक", + "अशी", + "असलयाचे", + "असलेल्या", + "असा", + "असून", + "असे", + "आज", + "आणि", + "आता", + "आपल्या", + "आला", + "आली", + "आले", + "आहे", + "आहेत", + "एक", + "एका", + "कमी", + "करणयात", + "करून", + "का", + "काम", + "काय", + "काही", + "किवा", + "की", + "केला", + "केली", + "केले", + "कोटी", + "गेल्या", + "घेऊन", + "जात", + "झाला", + "झाली", + "झाले", + "झालेल्या", + "टा", + "डॉ", + "तर", + "तरी", + "तसेच", + "ता", + "ती", + "तीन", + "ते", + "तो", + "त्या", + "त्याचा", + "त्याची", + "त्याच्या", + "त्याना", + "त्यानी", + "त्यामुळे", + "त्री", + "दिली", + "दोन", + "न", + "नाही", + "निर्ण्य", + "पण", + "पम", + "परयतन", + "पाटील", + "म", + "मात्र", + "माहिती", + "मी", + "मुबी", + "म्हणजे", + "म्हणाले", + "म्हणून", + "या", + "याचा", + "याची", + "याच्या", + "याना", + "यानी", + "येणार", + "येत", + "येथील", + "येथे", + "लाख", + "व", + "व्यकत", + "सर्व", + "सागित्ले", + "सुरू", + "हजार", + "हा", + "ही", + "हे", + "होणार", + "होत", + "होता", + "होती", + "होते", + ], + "pt": [ + "a", + "acerca", + "adeus", + "agora", + "ainda", + "algmas", + "algo", + "algumas", + "alguns", + "ali", + "além", + "ambos", + "ano", + "anos", + "antes", + "ao", + "aos", + "apenas", + "apoio", + "apontar", + "após", + "aquela", + "aquelas", + "aquele", + "aqueles", + "aqui", + "aquilo", + "as", + "assim", + "através", + "atrás", + "até", + "aí", + "baixo", + "bastante", + "bem", + "bom", + "breve", + "cada", + "caminho", + "catorze", + "cedo", + "cento", + "certamente", + "certeza", + "cima", + "cinco", + "coisa", + "com", + "como", + "comprido", + "conhecido", + "conselho", + "contra", + "corrente", + "custa", + "cá", + "da", + "daquela", + "daquele", + "dar", + "das", + "de", + "debaixo", + "demais", + "dentro", + "depois", + "desde", + "desligado", + "dessa", + "desse", + "desta", + "deste", + "deve", + "devem", + "deverá", + "dez", + "dezanove", + "dezasseis", + "dezassete", + "dezoito", + "dia", + "diante", + "direita", + "diz", + "dizem", + "dizer", + "do", + "dois", + "dos", + "doze", + "duas", + "dá", + "dão", + "dúvida", + "e", + "ela", + "elas", + "ele", + "eles", + "em", + "embora", + "enquanto", + "entre", + "então", + "era", + "essa", + "essas", + "esse", + "esses", + "esta", + "estado", + "estar", + "estará", + "estas", + "estava", + "este", + "estes", + "esteve", + "estive", + "estivemos", + "estiveram", + "estiveste", + "estivestes", + "estou", + "está", + "estás", + "estão", + "eu", + "exemplo", + "falta", + "fará", + "favor", + "faz", + "fazeis", + "fazem", + "fazemos", + "fazer", + "fazes", + "fazia", + "faço", + "fez", + "fim", + "final", + "foi", + "fomos", + "for", + "fora", + "foram", + "forma", + "foste", + "fostes", + "fui", + "geral", + "grande", + "grandes", + "grupo", + "hoje", + "horas", + "há", + "iniciar", + "inicio", + "ir", + "irá", + "isso", + "ista", + "iste", + "isto", + "já", + "lado", + "ligado", + "local", + "logo", + "longe", + "lugar", + "lá", + "maior", + "maioria", + "maiorias", + "mais", + "mal", + "mas", + "me", + "meio", + "menor", + "menos", + "meses", + "mesmo", + "meu", + "meus", + "mil", + "minha", + "minhas", + "momento", + "muito", + "muitos", + "máximo", + "mês", + "na", + "nada", + "naquela", + "naquele", + "nas", + "nem", + "nenhuma", + "nessa", + "nesse", + "nesta", + "neste", + "no", + "noite", + "nome", + "nos", + "nossa", + "nossas", + "nosso", + "nossos", + "nova", + "nove", + "novo", + "novos", + "num", + "numa", + "nunca", + "não", + "nível", + "nós", + "número", + "o", + "obra", + "obrigada", + "obrigado", + "oitava", + "oitavo", + "oito", + "onde", + "ontem", + "onze", + "os", + "ou", + "outra", + "outras", + "outro", + "outros", + "para", + "parece", + "parte", + "partir", + "pegar", + "pela", + "pelas", + "pelo", + "pelos", + "perto", + "pessoas", + "pode", + "podem", + "poder", + "poderá", + "podia", + "ponto", + "pontos", + "por", + "porque", + "porquê", + "posição", + "possivelmente", + "posso", + "possível", + "pouca", + "pouco", + "povo", + "primeira", + "primeiro", + "promeiro", + "próprio", + "próximo", + "puderam", + "pôde", + "põe", + "põem", + "qual", + "qualquer", + "quando", + "quanto", + "quarta", + "quarto", + "quatro", + "que", + "quem", + "quer", + "quero", + "questão", + "quieto", + "quinta", + "quinto", + "quinze", + "quê", + "relação", + "sabe", + "saber", + "se", + "segunda", + "segundo", + "sei", + "seis", + "sem", + "sempre", + "ser", + "seria", + "sete", + "seu", + "seus", + "sexta", + "sexto", + "sim", + "sistema", + "sob", + "sobre", + "sois", + "somente", + "somos", + "sou", + "sua", + "suas", + "são", + "sétima", + "sétimo", + "tal", + "talvez", + "também", + "tanto", + "tarde", + "te", + "tem", + "temos", + "tempo", + "tendes", + "tenho", + "tens", + "tentar", + "tentaram", + "tente", + "tentei", + "ter", + "terceira", + "terceiro", + "teu", + "teus", + "teve", + "tipo", + "tive", + "tivemos", + "tiveram", + "tiveste", + "tivestes", + "toda", + "todas", + "todo", + "todos", + "trabalhar", + "trabalho", + "treze", + "três", + "tu", + "tua", + "tuas", + "tudo", + "tão", + "têm", + "um", + "uma", + "umas", + "uns", + "usa", + "usar", + "vai", + "vais", + "valor", + "veja", + "vem", + "vens", + "ver", + "verdade", + "verdadeiro", + "vez", + "vezes", + "viagem", + "vindo", + "vinte", + "você", + "vocês", + "vos", + "vossa", + "vossas", + "vosso", + "vossos", + "vários", + "vão", + "vêm", + "vós", + "zero", + "à", + "às", + "área", + "é", + "és", + "último", + ], + "so": [ + "aad", + "albaabkii", + "atabo", + "ay", + "ayaa", + "ayee", + "ayuu", + "dhan", + "hadana", + "in", + "inuu", + "isku", + "jiray", + "jirtay", + "ka", + "kale", + "kasoo", + "ku", + "kuu", + "lakin", + "markii", + "oo", + "si", + "soo", + "uga", + "ugu", + "uu", + "waa", + "waxa", + "waxuu", + ], + "sw": [ + "akasema", + "alikuwa", + "alisema", + "baada", + "basi", + "bila", + "cha", + "chini", + "hadi", + "hapo", + "hata", + "hivyo", + "hiyo", + "huku", + "huo", + "ili", + "ilikuwa", + "juu", + "kama", + "karibu", + "katika", + "kila", + "kima", + "kisha", + "kubwa", + "kutoka", + "kuwa", + "kwa", + "kwamba", + "kwenda", + "kwenye", + "la", + "lakini", + "mara", + "mdogo", + "mimi", + "mkubwa", + "mmoja", + "moja", + "muda", + "mwenye", + "na", + "naye", + "ndani", + "ng", + "ni", + "nini", + "nonkungu", + "pamoja", + "pia", + "sana", + "sasa", + "sauti", + "tafadhali", + "tena", + "tu", + "vile", + "wa", + "wakati", + "wake", + "walikuwa", + "wao", + "watu", + "wengine", + "wote", + "ya", + "yake", + "yangu", + "yao", + "yeye", + "yule", + "za", + "zaidi", + "zake", + ], + "ur": [ + "آئی", + "آئے", + "آج", + "آخر", + "آخرکبر", + "آدهی", + "آًب", + "آٹھ", + "آیب", + "اة", + "اخبزت", + "اختتبم", + "ادھر", + "ارد", + "اردگرد", + "ارکبى", + "اش", + "اضتعوبل", + "اضتعوبلات", + "اضطرذ", + "اضکب", + "اضکی", + "اضکے", + "اطراف", + "اغیب", + "افراد", + "الگ", + "اور", + "اوًچب", + "اوًچبئی", + "اوًچی", + "اوًچے", + "اى", + "اً", + "اًذر", + "اًہیں", + "اٹھبًب", + "اپٌب", + "اپٌے", + "اچھب", + "اچھی", + "اچھے", + "اکثر", + "اکٹھب", + "اکٹھی", + "اکٹھے", + "اکیلا", + "اکیلی", + "اکیلے", + "اگرچہ", + "اہن", + "ایطے", + "ایک", + "ب", + "ت", + "تبزٍ", + "تت", + "تر", + "ترتیت", + "تریي", + "تعذاد", + "تن", + "تو", + "توبم", + "توہی", + "توہیں", + "تٌہب", + "تک", + "تھب", + "تھوڑا", + "تھوڑی", + "تھوڑے", + "تھی", + "تھے", + "تیي", + "ثب", + "ثبئیں", + "ثبترتیت", + "ثبری", + "ثبرے", + "ثبعث", + "ثبلا", + "ثبلترتیت", + "ثبہر", + "ثدبئے", + "ثرآں", + "ثراں", + "ثرش", + "ثعذ", + "ثغیر", + "ثلٌذ", + "ثلٌذوثبلا", + "ثلکہ", + "ثي", + "ثٌب", + "ثٌبرہب", + "ثٌبرہی", + "ثٌبرہے", + "ثٌبًب", + "ثٌذ", + "ثٌذکرو", + "ثٌذکرًب", + "ثٌذی", + "ثڑا", + "ثڑوں", + "ثڑی", + "ثڑے", + "ثھر", + "ثھرا", + "ثھراہوا", + "ثھرپور", + "ثھی", + "ثہت", + "ثہتر", + "ثہتری", + "ثہتریي", + "ثیچ", + "ج", + "خب", + "خبرہب", + "خبرہی", + "خبرہے", + "خبهوظ", + "خبًب", + "خبًتب", + "خبًتی", + "خبًتے", + "خبًٌب", + "خت", + "ختن", + "خجکہ", + "خص", + "خططرذ", + "خلذی", + "خو", + "خواى", + "خوًہی", + "خوکہ", + "خٌبة", + "خگہ", + "خگہوں", + "خگہیں", + "خیطب", + "خیطبکہ", + "در", + "درخبت", + "درخہ", + "درخے", + "درزقیقت", + "درضت", + "دش", + "دفعہ", + "دلچطپ", + "دلچطپی", + "دلچطپیبں", + "دو", + "دور", + "دوراى", + "دوضرا", + "دوضروں", + "دوضری", + "دوضرے", + "دوًوں", + "دکھبئیں", + "دکھبتب", + "دکھبتی", + "دکھبتے", + "دکھبو", + "دکھبًب", + "دکھبیب", + "دی", + "دیب", + "دیتب", + "دیتی", + "دیتے", + "دیر", + "دیٌب", + "دیکھو", + "دیکھٌب", + "دیکھی", + "دیکھیں", + "دے", + "ر", + "راضتوں", + "راضتہ", + "راضتے", + "رریعہ", + "رریعے", + "رکي", + "رکھ", + "رکھب", + "رکھتب", + "رکھتبہوں", + "رکھتی", + "رکھتے", + "رکھی", + "رکھے", + "رہب", + "رہی", + "رہے", + "ز", + "زبصل", + "زبضر", + "زبل", + "زبلات", + "زبلیہ", + "زصوں", + "زصہ", + "زصے", + "زقبئق", + "زقیتیں", + "زقیقت", + "زکن", + "زکویہ", + "زیبدٍ", + "صبف", + "صسیر", + "صفر", + "صورت", + "صورتسبل", + "صورتوں", + "صورتیں", + "ض", + "ضبت", + "ضبتھ", + "ضبدٍ", + "ضبرا", + "ضبرے", + "ضبل", + "ضبلوں", + "ضت", + "ضرور", + "ضرورت", + "ضروری", + "ضلطلہ", + "ضوچ", + "ضوچب", + "ضوچتب", + "ضوچتی", + "ضوچتے", + "ضوچو", + "ضوچٌب", + "ضوچی", + "ضوچیں", + "ضکب", + "ضکتب", + "ضکتی", + "ضکتے", + "ضکٌب", + "ضکی", + "ضکے", + "ضیذھب", + "ضیذھی", + "ضیذھے", + "ضیکٌڈ", + "ضے", + "طرف", + "طریق", + "طریقوں", + "طریقہ", + "طریقے", + "طور", + "طورپر", + "ظبہر", + "ع", + "عذد", + "عظین", + "علاقوں", + "علاقہ", + "علاقے", + "علاوٍ", + "عووهی", + "غبیذ", + "غخص", + "غذ", + "غروع", + "غروعبت", + "غے", + "فرد", + "فی", + "ق", + "قجل", + "قجیلہ", + "قطن", + "لئے", + "لا", + "لازهی", + "لو", + "لوجب", + "لوجی", + "لوجے", + "لوسبت", + "لوسہ", + "لوگ", + "لوگوں", + "لڑکپي", + "لگتب", + "لگتی", + "لگتے", + "لگٌب", + "لگی", + "لگیں", + "لگے", + "لی", + "لیب", + "لیٌب", + "لیں", + "لے", + "ه", + "هتعلق", + "هختلف", + "هسترم", + "هسترهہ", + "هسطوش", + "هسیذ", + "هطئلہ", + "هطئلے", + "هطبئل", + "هطتعول", + "هطلق", + "هعلوم", + "هػتول", + "هلا", + "هوکي", + "هوکٌبت", + "هوکٌہ", + "هٌبضت", + "هڑا", + "هڑًب", + "هڑے", + "هکول", + "هگر", + "هہرثبى", + "هیرا", + "هیری", + "هیرے", + "هیں", + "و", + "وار", + "والے", + "وٍ", + "ًئی", + "ًئے", + "ًب", + "ًبپطٌذ", + "ًبگسیر", + "ًطجت", + "ًقطہ", + "ًو", + "ًوخواى", + "ًکبلٌب", + "ًکتہ", + "ًہ", + "ًہیں", + "ًیب", + "ًے", + "ٓ آش", + "ٹھیک", + "پبئے", + "پبش", + "پبًب", + "پبًچ", + "پر", + "پراًب", + "پطٌذ", + "پل", + "پورا", + "پوچھب", + "پوچھتب", + "پوچھتی", + "پوچھتے", + "پوچھو", + "پوچھوں", + "پوچھٌب", + "پوچھیں", + "پچھلا", + "پھر", + "پہلا", + "پہلی", + "پہلےضی", + "پہلےضے", + "پہلےضےہی", + "پیع", + "چبر", + "چبہب", + "چبہٌب", + "چبہے", + "چلا", + "چلو", + "چلیں", + "چلے", + "چکب", + "چکی", + "چکیں", + "چکے", + "چھوٹب", + "چھوٹوں", + "چھوٹی", + "چھوٹے", + "چھہ", + "چیسیں", + "ڈھوًڈا", + "ڈھوًڈلیب", + "ڈھوًڈو", + "ڈھوًڈًب", + "ڈھوًڈی", + "ڈھوًڈیں", + "ک", + "کئی", + "کئے", + "کب", + "کبفی", + "کبم", + "کت", + "کجھی", + "کرا", + "کرتب", + "کرتبہوں", + "کرتی", + "کرتے", + "کرتےہو", + "کررہب", + "کررہی", + "کررہے", + "کرو", + "کرًب", + "کریں", + "کرے", + "کطی", + "کل", + "کن", + "کوئی", + "کوتر", + "کورا", + "کوروں", + "کورٍ", + "کورے", + "کوطي", + "کوى", + "کوًطب", + "کوًطی", + "کوًطے", + "کھولا", + "کھولو", + "کھولٌب", + "کھولی", + "کھولیں", + "کھولے", + "کہ", + "کہب", + "کہتب", + "کہتی", + "کہتے", + "کہو", + "کہوں", + "کہٌب", + "کہی", + "کہیں", + "کہے", + "کی", + "کیب", + "کیطب", + "کیطرف", + "کیطے", + "کیلئے", + "کیوًکہ", + "کیوں", + "کیے", + "کے", + "کےثعذ", + "کےرریعے", + "گئی", + "گئے", + "گب", + "گرد", + "گروٍ", + "گروپ", + "گروہوں", + "گٌتی", + "گی", + "گیب", + "گے", + "ہر", + "ہن", + "ہو", + "ہوئی", + "ہوئے", + "ہوا", + "ہوبرا", + "ہوبری", + "ہوبرے", + "ہوتب", + "ہوتی", + "ہوتے", + "ہورہب", + "ہورہی", + "ہورہے", + "ہوضکتب", + "ہوضکتی", + "ہوضکتے", + "ہوًب", + "ہوًی", + "ہوًے", + "ہوچکب", + "ہوچکی", + "ہوچکے", + "ہوگئی", + "ہوگئے", + "ہوگیب", + "ہوں", + "ہی", + "ہیں", + "ہے", + "ی", + "یقیٌی", + "یہ", + "یہبں", + ], + "vi": [ + "a ha", + "a-lô", + "ai", + "ai ai", + "ai nấy", + "alô", + "amen", + "anh", + "bao giờ", + "bao lâu", + "bao nhiêu", + "bao nả", + "bay biến", + "biết", + "biết bao", + "biết bao nhiêu", + "biết chừng nào", + "biết mấy", + "biết đâu", + "biết đâu chừng", + "biết đâu đấy", + "bà", + "bài", + "bác", + "bây bẩy", + "bây chừ", + "bây giờ", + "bây nhiêu", + "bèn", + "béng", + "bông", + "bạn", + "bản", + "bất chợt", + "bất cứ", + "bất giác", + "bất kì", + "bất kể", + "bất kỳ", + "bất luận", + "bất nhược", + "bất quá", + "bất thình lình", + "bất tử", + "bất đồ", + "bấy", + "bấy chầy", + "bấy chừ", + "bấy giờ", + "bấy lâu", + "bấy lâu nay", + "bấy nay", + "bấy nhiêu", + "bập bà bập bõm", + "bập bõm", + "bắt đầu từ", + "bằng", + "bằng không", + "bằng nấy", + "bằng ấy", + "bển", + "bệt", + "bị", + "bỏ mẹ", + "bỗng", + "bỗng chốc", + "bỗng dưng", + "bỗng không", + "bỗng nhiên", + "bỗng đâu", + "bộ", + "bội phần", + "bớ", + "bởi", + "bởi chưng", + "bởi nhưng", + "bởi thế", + "bởi vì", + "bởi vậy", + "bức", + "cao", + "cha", + "cha chả", + "chao ôi", + "chiếc", + "cho", + "cho nên", + "cho tới", + "cho tới khi", + "cho đến", + "cho đến khi", + "choa", + "chu cha", + "chui cha", + "chung cục", + "chung qui", + "chung quy", + "chung quy lại", + "chuyện", + "chành chạnh", + "chí chết", + "chính", + "chính là", + "chính thị", + "chùn chùn", + "chùn chũn", + "chú", + "chú mày", + "chú mình", + "chúng mình", + "chúng ta", + "chúng tôi", + "chăn chắn", + "chăng", + "chưa", + "chầm chập", + "chậc", + "chắc", + "chắc hẳn", + "chẳng lẽ", + "chẳng những", + "chẳng nữa", + "chẳng phải", + "chết nỗi", + "chết thật", + "chết tiệt", + "chỉ", + "chỉn", + "chốc chốc", + "chớ", + "chớ chi", + "chợt", + "chủn", + "chứ", + "chứ lị", + "coi bộ", + "coi mòi", + "con", + "cu cậu", + "cuốn", + "cuộc", + "càng", + "các", + "cái", + "cây", + "còn", + "có", + "có chăng là", + "có dễ", + "có thể", + "có vẻ", + "cóc khô", + "cô", + "cô mình", + "công nhiên", + "cùng", + "cùng cực", + "cùng nhau", + "cùng với", + "căn", + "căn cắt", + "cũng", + "cũng như", + "cũng vậy", + "cũng vậy thôi", + "cơ", + "cơ chừng", + "cơ hồ", + "cơ mà", + "cơn", + "cả", + "cả thảy", + "cả thể", + "cảm ơn", + "cần", + "cật lực", + "cật sức", + "cậu", + "cổ lai", + "của", + "cứ", + "cứ việc", + "cực lực", + "do", + "do vì", + "do vậy", + "do đó", + "duy", + "dào", + "dì", + "dù cho", + "dù rằng", + "dưới", + "dạ", + "dần dà", + "dần dần", + "dầu sao", + "dẫu", + "dẫu sao", + "dễ sợ", + "dễ thường", + "dở chừng", + "dữ", + "em", + "giữa", + "gì", + "hay", + "hoàn toàn", + "hoặc", + "hơn", + "hầu hết", + "họ", + "hỏi", + "khi", + "khác", + "không", + "luôn", + "là", + "làm", + "lên", + "lúc", + "lại", + "lần", + "lớn", + "muốn", + "mà", + "mình", + "mỗi", + "một", + "một cách", + "mới", + "mợ", + "ngay", + "ngay cả", + "ngay khi", + "ngay lúc", + "ngay lập tức", + "ngay tức khắc", + "ngay từ", + "nghe chừng", + "nghe đâu", + "nghen", + "nghiễm nhiên", + "nghỉm", + "ngoài", + "ngoài ra", + "ngoải", + "ngày", + "ngày càng", + "ngày ngày", + "ngày xưa", + "ngày xửa", + "ngôi", + "ngõ hầu", + "ngăn ngắt", + "ngươi", + "người", + "ngọn", + "ngọt", + "ngộ nhỡ", + "nh", + "nhau", + "nhiên hậu", + "nhiều", + "nhiệt liệt", + "nhung nhăng", + "nhà", + "nhân dịp", + "nhân tiện", + "nhé", + "nhón nhén", + "như", + "như chơi", + "như không", + "như quả", + "như thể", + "như tuồng", + "như vậy", + "nhưng", + "nhưng mà", + "nhược bằng", + "nhất", + "nhất loạt", + "nhất luật", + "nhất mực", + "nhất nhất", + "nhất quyết", + "nhất sinh", + "nhất thiết", + "nhất tâm", + "nhất tề", + "nhất đán", + "nhất định", + "nhận", + "nhỉ", + "nhỡ ra", + "những", + "những ai", + "những như", + "nào", + "này", + "nên", + "nên chi", + "nó", + "nóc", + "nói", + "năm", + "nơi", + "nấy", + "nếu", + "nếu như", + "nền", + "nọ", + "nớ", + "nức nở", + "nữa", + "oai oái", + "oái", + "pho", + "phè", + "phóc", + "phót", + "phăn phắt", + "phương chi", + "phải", + "phải chi", + "phải chăng", + "phắt", + "phỉ phui", + "phỏng", + "phỏng như", + "phốc", + "phụt", + "phứt", + "qua", + "qua quít", + "qua quýt", + "quyết", + "quyết nhiên", + "quyển", + "quá", + "quá chừng", + "quá lắm", + "quá sá", + "quá thể", + "quá trời", + "quá xá", + "quá đỗi", + "quá độ", + "quá ư", + "quý hồ", + "quả", + "quả là", + "quả tang", + "quả thật", + "quả tình", + "quả vậy", + "quả đúng", + "ra", + "ra phết", + "ra sao", + "ra trò", + "ren rén", + "riu ríu", + "riêng", + "riệt", + "rày", + "ráo", + "ráo trọi", + "rén", + "rích", + "rón rén", + "rút cục", + "răng", + "rất", + "rằng", + "rằng là", + "rốt cuộc", + "rốt cục", + "rồi", + "rứa", + "sa sả", + "sao", + "sau", + "sau chót", + "sau cuối", + "sau cùng", + "sau đó", + "so", + "song le", + "suýt", + "sì", + "sạch", + "sất", + "sắp", + "sẽ", + "số", + "số là", + "sốt sột", + "sở dĩ", + "sự", + "tanh", + "tha hồ", + "than ôi", + "thanh", + "theo", + "thi thoảng", + "thoạt", + "thoạt nhiên", + "thoắt", + "thuần", + "thà", + "thà là", + "thà rằng", + "thành ra", + "thành thử", + "thái quá", + "tháng", + "thì", + "thì thôi", + "thình lình", + "thím", + "thôi", + "thúng thắng", + "thương ôi", + "thường", + "thảo hèn", + "thảo nào", + "thấy", + "thẩy", + "thậm", + "thậm chí", + "thật lực", + "thật ra", + "thật vậy", + "thế", + "thế là", + "thế mà", + "thế nào", + "thế nên", + "thế ra", + "thế thì", + "thế à", + "thếch", + "thỉnh thoảng", + "thỏm", + "thốc", + "thốc tháo", + "thốt", + "thốt nhiên", + "thộc", + "thời gian", + "thục mạng", + "thửa", + "thực ra", + "thực sự", + "thực vậy", + "tiếp theo", + "tiếp đó", + "tiện thể", + "toà", + "toé khói", + "toẹt", + "trong", + "trên", + "trước", + "trước kia", + "trước nay", + "trước tiên", + "trước đây", + "trước đó", + "trếu tráo", + "trển", + "trệt", + "trệu trạo", + "trỏng", + "trời đất ơi", + "trừ phi", + "tuy", + "tuy nhiên", + "tuy rằng", + "tuy thế", + "tuy vậy", + "tuyệt nhiên", + "tuần tự", + "tuốt luốt", + "tuốt tuồn tuột", + "tuốt tuột", + "tà tà", + "tênh", + "tít mù", + "tò te", + "tôi", + "tông tốc", + "tù tì", + "tăm tắp", + "tại", + "tại vì", + "tấm", + "tấn", + "tất cả", + "tất thảy", + "tất tần tật", + "tất tật", + "tắp", + "tắp lự", + "tọt", + "tỏ ra", + "tỏ vẻ", + "tốc tả", + "tối ư", + "tột", + "tớ", + "tới", + "tức thì", + "tức tốc", + "từ", + "từng", + "tự vì", + "tựu trung", + "veo", + "veo veo", + "việc", + "vung thiên địa", + "vung tàn tán", + "vung tán tàn", + "và", + "vào", + "vâng", + "vèo", + "vì", + "vì chưng", + "vì thế", + "vì vậy", + "ví bằng", + "ví dù", + "ví phỏng", + "ví thử", + "vô hình trung", + "vô kể", + "vô luận", + "vô vàn", + "văng tê", + "vạn nhất", + "vả chăng", + "vả lại", + "vẫn", + "vậy", + "vậy là", + "vậy thì", + "về", + "vị tất", + "vốn dĩ", + "với", + "với lại", + "vở", + "vụt", + "vừa", + "vừa mới", + "xa xả", + "xiết bao", + "xon xón", + "xoành xoạch", + "xoét", + "xoẳn", + "xoẹt", + "xuất kì bất ý", + "xuất kỳ bất ý", + "xuể", + "xuống", + "xăm xúi", + "xăm xăm", + "xăm xắm", + "xềnh xệch", + "xệp", + "à", + "à ơi", + "ào", + "á", + "á à", + "ái", + "ái chà", + "ái dà", + "áng", + "âu là", + "ô hay", + "ô hô", + "ô kê", + "ô kìa", + "ôi chao", + "ôi thôi", + "ông", + "úi", + "úi chà", + "úi dào", + "ý", + "ý chừng", + "ý da", + "đang", + "đi", + "điều", + "đành đạch", + "đáng lí", + "đáng lý", + "đáng lẽ", + "đánh đùng", + "đáo để", + "đây", + "đã", + "đó", + "được", + "đại loại", + "đại nhân", + "đại phàm", + "đại để", + "đến", + "đến nỗi", + "đều", + "để", + "ơ", + "ơ hay", + "ơ kìa", + "ơi", + "ư", + "ạ", + "ạ ơi", + "ấy", + "ầu ơ", + "ắt", + "ắt hẳn", + "ắt là", + "ối dào", + "ối giời", + "ối giời ơi", + "ồ", + "ổng", + "ớ", + "ờ", + "ở", + "ở trên", + "ủa", + "ứ hự", + "ứ ừ", + "ừ", + "ử", + ], + "yo": [ + "a", + "an", + "bá", + "bí", + "bẹ̀rẹ̀", + "fún", + "fẹ́", + "gbogbo", + "inú", + "jù", + "jẹ", + "jẹ́", + "kan", + "kì", + "kí", + "kò", + "láti", + "lè", + "lọ", + "mi", + "mo", + "máa", + "mọ̀", + "ni", + "náà", + "ní", + "nígbà", + "nítorí", + "nǹkan", + "o", + "padà", + "pé", + "púpọ̀", + "pẹ̀lú", + "rẹ̀", + "sì", + "sí", + "sínú", + "ṣ", + "ti", + "tí", + "wà", + "wá", + "wọn", + "wọ́n", + "yìí", + "àti", + "àwọn", + "é", + "í", + "òun", + "ó", + "ń", + "ńlá", + "ṣe", + "ṣé", + "ṣùgbọ́n", + "ẹmọ́", + "ọjọ́", + "ọ̀pọ̀lọpọ̀", + ], + "zh": [ + "、", + "。", + "〈", + "〉", + "《", + "》", + "一", + "一切", + "一则", + "一方面", + "一旦", + "一来", + "一样", + "一般", + "七", + "万一", + "三", + "上下", + "不仅", + "不但", + "不光", + "不单", + "不只", + "不如", + "不怕", + "不惟", + "不成", + "不拘", + "不比", + "不然", + "不特", + "不独", + "不管", + "不论", + "不过", + "不问", + "与", + "与其", + "与否", + "与此同时", + "且", + "两者", + "个", + "临", + "为", + "为了", + "为什么", + "为何", + "为着", + "乃", + "乃至", + "么", + "之", + "之一", + "之所以", + "之类", + "乌乎", + "乎", + "乘", + "九", + "也", + "也好", + "也罢", + "了", + "二", + "于", + "于是", + "于是乎", + "云云", + "五", + "人家", + "什么", + "什么样", + "从", + "从而", + "他", + "他人", + "他们", + "以", + "以便", + "以免", + "以及", + "以至", + "以至于", + "以致", + "们", + "任", + "任何", + "任凭", + "似的", + "但", + "但是", + "何", + "何况", + "何处", + "何时", + "作为", + "你", + "你们", + "使得", + "例如", + "依", + "依照", + "俺", + "俺们", + "倘", + "倘使", + "倘或", + "倘然", + "倘若", + "借", + "假使", + "假如", + "假若", + "像", + "八", + "六", + "兮", + "关于", + "其", + "其一", + "其中", + "其二", + "其他", + "其余", + "其它", + "其次", + "具体地说", + "具体说来", + "再者", + "再说", + "冒", + "冲", + "况且", + "几", + "几时", + "凭", + "凭借", + "则", + "别", + "别的", + "别说", + "到", + "前后", + "前者", + "加之", + "即", + "即令", + "即使", + "即便", + "即或", + "即若", + "又", + "及", + "及其", + "及至", + "反之", + "反过来", + "反过来说", + "另", + "另一方面", + "另外", + "只是", + "只有", + "只要", + "只限", + "叫", + "叮咚", + "可", + "可以", + "可是", + "可见", + "各", + "各个", + "各位", + "各种", + "各自", + "同", + "同时", + "向", + "向着", + "吓", + "吗", + "否则", + "吧", + "吧哒", + "吱", + "呀", + "呃", + "呕", + "呗", + "呜", + "呜呼", + "呢", + "呵", + "呸", + "呼哧", + "咋", + "和", + "咚", + "咦", + "咱", + "咱们", + "咳", + "哇", + "哈", + "哈哈", + "哉", + "哎", + "哎呀", + "哎哟", + "哗", + "哟", + "哦", + "哩", + "哪", + "哪个", + "哪些", + "哪儿", + "哪天", + "哪年", + "哪怕", + "哪样", + "哪边", + "哪里", + "哼", + "哼唷", + "唉", + "啊", + "啐", + "啥", + "啦", + "啪达", + "喂", + "喏", + "喔唷", + "嗡嗡", + "嗬", + "嗯", + "嗳", + "嘎", + "嘎登", + "嘘", + "嘛", + "嘻", + "嘿", + "四", + "因", + "因为", + "因此", + "因而", + "固然", + "在", + "在下", + "地", + "多", + "多少", + "她", + "她们", + "如", + "如上所述", + "如何", + "如其", + "如果", + "如此", + "如若", + "宁", + "宁可", + "宁愿", + "宁肯", + "它", + "它们", + "对", + "对于", + "将", + "尔后", + "尚且", + "就", + "就是", + "就是说", + "尽", + "尽管", + "岂但", + "己", + "并", + "并且", + "开外", + "开始", + "归", + "当", + "当着", + "彼", + "彼此", + "往", + "待", + "得", + "怎", + "怎么", + "怎么办", + "怎么样", + "怎样", + "总之", + "总的来看", + "总的来说", + "总的说来", + "总而言之", + "恰恰相反", + "您", + "慢说", + "我", + "我们", + "或", + "或是", + "或者", + "所", + "所以", + "打", + "把", + "抑或", + "拿", + "按", + "按照", + "换句话说", + "换言之", + "据", + "接着", + "故", + "故此", + "旁人", + "无宁", + "无论", + "既", + "既是", + "既然", + "时候", + "是", + "是的", + "替", + "有", + "有些", + "有关", + "有的", + "望", + "朝", + "朝着", + "本", + "本着", + "来", + "来着", + "极了", + "果然", + "果真", + "某", + "某个", + "某些", + "根据", + "正如", + "此", + "此外", + "此间", + "毋宁", + "每", + "每当", + "比", + "比如", + "比方", + "沿", + "沿着", + "漫说", + "焉", + "然则", + "然后", + "然而", + "照", + "照着", + "甚么", + "甚而", + "甚至", + "用", + "由", + "由于", + "由此可见", + "的", + "的话", + "相对而言", + "省得", + "着", + "着呢", + "矣", + "离", + "第", + "等", + "等等", + "管", + "紧接着", + "纵", + "纵令", + "纵使", + "纵然", + "经", + "经过", + "结果", + "给", + "继而", + "综上所述", + "罢了", + "者", + "而", + "而且", + "而况", + "而外", + "而已", + "而是", + "而言", + "能", + "腾", + "自", + "自个儿", + "自从", + "自各儿", + "自家", + "自己", + "自身", + "至", + "至于", + "若", + "若是", + "若非", + "莫若", + "虽", + "虽则", + "虽然", + "虽说", + "被", + "要", + "要不", + "要不是", + "要不然", + "要么", + "要是", + "让", + "论", + "设使", + "设若", + "该", + "诸位", + "谁", + "谁知", + "赶", + "起", + "起见", + "趁", + "趁着", + "越是", + "跟", + "较", + "较之", + "边", + "过", + "还是", + "还有", + "这", + "这个", + "这么", + "这么些", + "这么样", + "这么点儿", + "这些", + "这会儿", + "这儿", + "这就是说", + "这时", + "这样", + "这边", + "这里", + "进而", + "连", + "连同", + "通过", + "遵照", + "那", + "那个", + "那么", + "那么些", + "那么样", + "那些", + "那会儿", + "那儿", + "那时", + "那样", + "那边", + "那里", + "鄙人", + "鉴于", + "阿", + "除", + "除了", + "除此之外", + "除非", + "随", + "随着", + "零", + "非但", + "非徒", + "靠", + "顺", + "顺着", + "首先", + "︿", + "!", + "#", + "$", + "%", + "&", + "(", + ")", + "*", + "+", + ",", + "0", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + ":", + ";", + "<", + ">", + "?", + "@", + "[", + "]", + "{", + "|", + "}", + "~", + "¥", + ], +}