| """ | |
| Moduł zawierający stałe, listy słów i prekompilowane wyrażenia regularne | |
| używane w całej bibliotece do analizy tekstu. | |
| """ | |
| import re | |
# Polish vulgar words and phrases. Some entries are multi-word phrases,
# so purely token-by-token lookups will never hit those entries.
BAD_WORDS = [
    'burdel', 'burdelmama', 'chuj', 'chujnia', 'ciota', 'cipa', 'cyc', 'debil',
    'dmuchać', 'do kurwy nędzy', 'dupa', 'dupek', 'duperele', 'dziwka', 'fiut',
    'gówno', 'gówno prawda', 'huj', 'huj ci w dupę', 'jajco', 'ja pierdolę',
    'jebać', 'jebany', 'kurwa', 'kurwy', 'kutafon', 'kutas', 'lizać pałę',
    'obciągać chuja', 'obciągać fiuta', 'pierdolec', 'pierdolić', 'pierdolnąć',
    'pierdolnięty', 'pizda', 'pojeb', 'pojebany', 'popierdolony', 'robić loda',
    'ruchać', 'rzygać', 'skurwysyn', 'sraczka', 'srać', 'suka', 'syf',
    'wkurwiać', 'zajebisty',
]
# Polish stop words (function words, common abbreviations and Roman numerals).
# The previous revision also contained "totobą" and "zeznowu"; these are not
# Polish words but concatenations of neighbouring entries ("to" + "tobą",
# "ze" + "znowu"), so they were dropped. Every genuine entry is unchanged and
# keeps its original relative order.
STOP_WORDS = ["a","aby","ach","acz","aczkolwiek","aj","albo","ale","ależ","ani","aż","bardziej","bardzo","bez","bo","bowiem",
    "by","byli","bym","bynajmniej","być","był","była","było","były","będzie","będą","cali","cała","cały","chce","choć","ci",
    "ciebie","cię","co","cokolwiek","coraz","coś","czasami","czasem","czemu","czy","czyli","często","daleko","dla","dlaczego",
    "dlatego","do","dobrze","dokąd","dość","dr","dużo","dwa","dwaj","dwie","dwoje","dzisiaj","dziś","gdy","gdyby","gdyż","gdzie",
    "gdziekolwiek","gdzieś","go","godz","hab","i","ich","ii","iii","ile","im","inna","inne","inny","innych","inż","iv","ix","iż",
    "ja","jak","jakaś","jakby","jaki","jakichś","jakie","jakiś","jakiż","jakkolwiek","jako","jakoś","je","jeden","jedna","jednak",
    "jednakże","jedno","jednym","jedynie","jego","jej","jemu","jest","jestem","jeszcze","jeśli","jeżeli","już","ją","każdy","kiedy",
    "kierunku","kilka","kilku","kimś","kto","ktokolwiek","ktoś","która","które","którego","której","który","których","którym","którzy",
    "ku","lat","lecz","lub","ma","mają","mam","mamy","mało","mgr","mi","miał","mimo","między","mnie","mną","mogą","moi","moim","moja",
    "moje","może","możliwe","można","mu","musi","my","mój","na","nad","nam","nami","nas","nasi","nasz","nasza","nasze","naszego","naszych",
    "natomiast","natychmiast","nawet","nic","nich","nie","niech","niego","niej","niemu","nigdy","nim","nimi","nią","niż","no","nowe","np",
    "nr","o","o.o.","obok","od","ok","około","on","ona","one","oni","ono","oraz","oto","owszem","pan","pana","pani","pl","po","pod",
    "podczas","pomimo","ponad","ponieważ","powinien","powinna","powinni","powinno","poza","prawie","prof","przecież","przed","przede","przedtem",
    "przez","przy","raz","razie","roku","również","sam","sama","się","skąd","sobie","sobą","sposób","swoje","są","ta","tak","taka","taki","takich",
    "takie","także","tam","te","tego","tej","tel","temu","ten","teraz","też","to","tobie","tobą","toteż","trzeba","tu","tutaj","twoi","twoim",
    "twoja","twoje","twym","twój","ty","tych","tylko","tym","tys","tzw","u","ul","vi","vii","viii","vol","w","wam","wami","was","wasi","wasz","wasza",
    "wasze","we","według","wie","wiele","wielu","więc","więcej","wszyscy","wszystkich","wszystkie","wszystkim","wszystko","wtedy","www","wy","właśnie",
    "wśród","xi","xii","xiii","xiv","xv","z","za","zapewne","zawsze","zaś","ze","znowu","znów","został","zł","żaden","żadna","żadne","żadnych",
    "że","żeby"]
# Precompiled regular expressions for spotting personally identifiable (and
# similar structured) data in Polish text, keyed by pattern name.
#
# Most patterns start with the lookbehind `(?<=\s)`, i.e. a hit must be
# preceded by a whitespace character; a value at the very beginning of the
# text is therefore not matched unless the caller pads the text.
#
# Capture-group layout is kept stable on purpose: callers using
# findall()/group(n) see the same group numbering as before.
PII_REGEX_PATTERNS = {
    # Dates: year-first numeric (YYYY-MM-DD with / . \ - separators) or
    # day-first with a numeric month or a Polish month-name stem, optionally
    # followed by a 2- or 4-digit year. Months are limited to 1-12 (the
    # earlier alternation also accepted 13-19, and 00 in the year-first form).
    'date_reg': re.compile(
        r'(?<!\w)([1-2]\d{3}[\/.\\-](1[0-2]|0?[1-9])[\/.\\-](3[0-1]|2[0-9]|1[0-9]|0?[1-9]))'
        r'|((?<!\w)(3[0-1]|2[0-9]|1[0-9]|0?[1-9])'
        r'([\/.\\-](1[0-2]|0?[1-9])'
        r'|([\/.\\ -](stycz\w+|lut\w+|mar[cz]\w+|kwie\w+|maj\w|czerw\w+|lip\w+|sierp\w+|wrze[sś]\w+|paź\w+|listop\w+|grud\w+)))'
        r'(([\/.\\ -]([0-9]){2}|[\/.\\ -]([1-2][0-9]{3})))?)(?![\d])'
    ),
    # Street-style addresses: a street-type prefix (ul., al., os., ...),
    # optional honorifics/titles, then one or more capitalised name parts with
    # optional numbers. Not matched when a postal code follows on the next line.
    # NOTE(review): the nested quantifiers may backtrack heavily on long
    # capitalised runs — worth profiling on large inputs.
    'address_reg': re.compile(
        r"(?<=\W)(?:ul\.|pl\.|al\.|bulw\.|os\.|aleja|bulwar|ulica|skwer|park|rondo)[^\S\r\n]?"
        r"(?:(?:świętego|trasa|bł\.|im\.|gen\.|dr\.|ks\.|ks\.[^\S\r\n]bp\.|kpt\.|ks\.[^\S\r\n]kan\.|"
        r"ks\.[^\S\r\n]+kard\.|marsz\.|mjr\.|o\.|ppłk\.|prof\.|św\.|[^\S\r\n])*)"
        r"(?:(?:\d{1,3}\s+)?[A-ZĄĆĘŁŃÓŚŻŹ][\w'-]+(?:\s*[A-ZĄĆĘŁŃÓŚŻŹ][\w'-]+)*"
        r"(?:\s*\d{1,3})*){1,}(?:(?:[^\S\r\n]+(?:i|z|im\.)[^\S\r\n]+)?"
        r"(?:[^\S\r\n]*(?:\d{1,3}[^\S\r\n]+)?[A-ZĄĆĘŁŃÓŚŻŹ][\w'-]+(?:\s*[A-ZĄĆĘŁŃÓŚŻŹ][\w'-]+)*"
        r"(?:\s*\d{1,3})*)*)*(?!\n\d{2}-\d{3})"
    ),
    # Polish postal code, NN-NNN.
    'post_code_reg': re.compile(r'(?<=\s)([0-9]{2}-[0-9]{3})(?!\w)'),
    # Dotted-quad IPv4 address. Each octet is restricted to 0-255; the
    # previous octet alternative (`\d?[1-9][0-9]`) rejected 100-109 and
    # accepted 260-999. `(?!\d)` stops a longer number from being truncated
    # into a valid-looking final octet.
    'ip_reg': re.compile(
        r'(?<=\s)((25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])(\.|$)){3}'
        r'(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])(?!\d)(?!:\s)'
    ),
    # Tax number: 10 digits or the dashed 3-2-2-3 / 3-3-2-2 layouts, only
    # directly after the literal "NIP " or "NIP: ".
    'nip_reg': re.compile(r'(?:(?<=NIP: )|(?<=NIP ))([0-9]{10}|\d{3}-\d{2}-\d{2}-\d{3}|\d{3}-\d{3}-\d{2}-\d{2})(?!\w)'),
    # 9 digits directly after the literal "REGON " or "REGON: ".
    'regon_reg': re.compile(r'(?:(?<=REGON: )|(?<=REGON ))([0-9]{9})(?!\w)'),
    # 9-digit phone number starting with 4-8, in three groups of three with a
    # consistent optional separator, with optional 48 / +48 / 0048 prefix.
    'phone_reg': re.compile(r'(?<=\s)((?P<a>\()?(\+|00)?48(?(2)\)|)?)?(\s{0,}(4|5|6|7|8)\d{2})((?P<char>[ -]?)\d{3})((?P=char)\d{3})(?!\w)'),
    # Phone numbers grouped 3-2-2 or 2-3-2 with an optional 2-digit prefix.
    # NOTE(review): unlike its siblings this pattern opens with `(?!\s)` (a
    # negative lookahead) rather than `(?<=\s)`, consumes the trailing
    # whitespace, and has no entry in COLUMN_ORDER — confirm this is intended.
    'domestic_phone_reg': re.compile(r'(?!\s)((?P<a>\()?(\+|00)?48(?(2)\)|)?)?(\s*\d{2})?((\s*[2-9]\d{2})((?P<char>[ -]+)\d{2})((?P=char)\d{2})|(\s*[2-9]\d{1})((?P<char2>[ -]+)\d{3})((?P=char2)\d{2}))\s'),
    # 26-digit account number (2 + 6x4 digits, optional whitespace between
    # groups) with optional "PL" prefix.
    'iban_reg': re.compile(r'(?<=\s)(?:PL)?(?:\d{2}(?:[\s]*\d{4}){6})(?!\w)'),
    # E-mail address.
    'email_reg': re.compile(r'(?<=\s)([a-zA-Z0-9][\w.+-]*@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{2,20})(?!\w)'),
    # 11 digits shaped like YY MM DD + 5 digits, month limited to 01-12 or 21-32.
    'pesel_reg': re.compile(r'(?<=\s)(\d{2})(1[0-2]|2[1-9]|3[1-2]|0[1-9])(3[0-1]|2\d|1\d|0[1-9])(\d{5})(?!\w)'),
    # Amount followed by zł/PLN/$/€/£, or $/€/£ followed by an amount.
    'currency_reg': re.compile(r'(?<=\s)(([1-9]\d{0,2}[,. ]){0,2}[,. ]?([1-9]\d{1,2}|\d{1,3})([,.]\d{2}|\d+)\s*(zł|Zł|pln|PLN|\$|€|£)|(\$|€|£)\s*([1-9]\d{0,2}[,. ]){0,2}[,. ]?([1-9]\d{1,2}|\d{1,3})([,.]\d{2}|\d+))'),
    # 13-digit ISBN starting with 978, digits optionally separated by hyphens.
    'isbn_reg': re.compile(r'(?<=\s)978(?:-?\d){10}(?!\w)'),
    # "ISSN"/"eISSN" followed by a space and two hyphen-separated groups of
    # four non-whitespace characters.
    'issn_reg': re.compile(r'(?<=\s)((ISSN|eISSN) [\S]{4}\-[\S]{4})(?!\w)'),
}
# Precompiled detectors for common Markdown constructs, keyed by construct
# name. Line-anchored patterns (those using ^ / $) are compiled with
# re.MULTILINE so that they fire on every line of the text; the remaining
# ones use no extra flags.
# Note: the patterns are independent and may overlap, e.g. 'italic' also
# matches the inner "*text*" of a bold span and 'link' also matches the
# "[alt](src)" part of an image.
MARKDOWN_PATTERNS = {
    construct: re.compile(expression, flags)
    for construct, expression, flags in (
        ('header', r'^#+\s', re.MULTILINE),
        ('bold', r'\*\*[^*\n]+?\*\*', 0),
        ('italic', r'\*[^*\n]+?\*', 0),
        ('unordered_list', r'^\s*[-*+]\s', re.MULTILINE),
        ('ordered_list', r'^\s*\d+\.\s', re.MULTILINE),
        ('link', r'\[([^\]]+)\]\(([^\)]+)\)', 0),
        ('image', r'!\[([^\]]*)\]\(([^\)]+)\)', 0),
        ('inline_code', r'`[^`\n]+`', 0),
        ('code_block', r'```[\s\S]*?```', 0),
        ('blockquote', r'^>\s', re.MULTILINE),
        ('horizontal_rule', r'^([-*_])\1{2,}$', re.MULTILINE),
    )
}
# A single basic punctuation mark: . , ! ? ; :
PUNCTUATION_PATTERN = re.compile(r'[.,!?;:]')
# Any single character that is neither a word character (letters, digits,
# underscore) nor whitespace.
NON_WORD_CHARS_PATTERN = re.compile(r'[^\w\s]')
# A run of four or more consecutive space characters.
EXCESSIVE_SPACES_PATTERN = re.compile(r' {4,}')
# camelCase-style token: lowercase start, at least one uppercase letter in the
# middle, followed by at least one more lowercase letter (Polish letters
# included).
CAMEL_CASE_PATTERN = re.compile(r"\b[a-ząęćłńóśżź]+[A-ZĄĘĆŁŃÓŚŻŹ]+[a-ząęćłńóśżź]+[a-ząęćłńóśżźA-ZĄĘĆŁŃÓŚŻŹ]*\b")
# A single "allowed" character: ASCII letters and digits, Polish diacritics,
# whitespace and basic punctuation. Uppercase Polish letters are listed
# explicitly: the pattern is compiled without IGNORECASE, and ASCII letters
# are already accepted in both cases, so omitting them made ordinary
# capitalised Polish words (e.g. "Łódź") look like they contain disallowed
# characters.
ALLOWED_CHARS_PATTERN = re.compile(r'[a-zA-Z0-9ąćęłńóśźżĄĆĘŁŃÓŚŹŻ\s.,;:\-!?]')
| COMMON_CHARACTERS = list(" \t\n!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^`°abcdefghijklmnopqrstuvwxyz{|}~ÓóĄąĆćĘꣳŃńŚśŹźŻż") | |
# Name of the spaCy pipeline package used for Polish text.
SPACY_MODEL_PL = 'pl_core_news_md'
# Upper bound of five million; presumably a character limit handed to the
# spaCy pipeline (nlp.max_length) — confirm at the call site.
NLP_MAX_LENGTH = 5 * 10 ** 6
# Ordered list of feature/column names. The order itself is significant, so
# entries must not be re-sorted. The section comments below are only a reading
# aid derived from the names; they carry no meaning at runtime.
# NOTE(review): 'bracet_count' looks like a misspelling of "bracket" (compare
# 'bracket_ratio'); it is kept verbatim because it is a runtime key that other
# code presumably produces under the same spelling.
COLUMN_ORDER = [
    # --- basic counts and part-of-speech statistics ---
    'characters', 'words', 'sentences', 'avg_sentence_length',
    'nouns', 'verbs', 'adjectives', 'adverbs',
    'punctuations', 'symbols', 'stopwords', 'oovs',
    'avg_word_length', 'noun_ratio', 'verb_ratio', 'adj_ratio',
    'lexical_density', 'gunning_fog', 'camel_case', 'pos_x', 'pos_num',
    'capitalized_words',
    # --- character-level statistics ---
    'unique_characters_all', 'unique_characters_lower', 'characters_out_of_common',
    'word_isupper<5', 'word_isupper>5', 'count_caps',
    'single_char_count', 'single_char_ratio',
    'digit_count', 'digit_ratio', 'punct_frequency', 'count_digit_to_caps',
    'bracet_count', 'bracket_ratio',
    # --- line-level statistics ---
    'average_lines',
    'short_line_count_3', 'short_line_count_5', 'short_line_count_10',
    'short_line_ratio_3', 'short_line_ratio_5', 'short_line_ratio_10',
    'lexical_diversity',
    'contextual_word_repetitions_count', 'contextual_word_repetitions_ratio',
    # --- markup, noise and layout ---
    'html_tags', 'bbcode_tags', 'urls', 'text_to_markup_ratio',
    'emoticons', 'slang_words', 'slang_words_ratio',
    'incomplete_sentences', 'excessive_chars',
    'blank_lines', 'blank_lines_ratio',
    'duplicated_lines', 'duplicate_line_ratio',
    'count_special_chars', 'tabs', 'multispaces',
    'short_line_count_20',
    # --- names matching keys of PII_REGEX_PATTERNS ('domestic_phone_reg' is not listed) ---
    'date_reg', 'address_reg', 'post_code_reg', 'ip_reg', 'nip_reg',
    'regon_reg', 'phone_reg', 'iban_reg', 'email_reg', 'pesel_reg',
    'currency_reg', 'isbn_reg', 'issn_reg',
    # --- further text-quality columns ---
    'short_line_ratio_20', 'ellipsis_fractions', 'line_counts',
    'non_alpha_word_fractions', 'lorem_ipsum_ratio', 'mean_word_length',
    'stop_word_ratio', 'entropy', 'javascript_counts_per_line',
    'lines_with_bullet', 'ratio_of_bulletpoints', 'overall_uppercase_ratio',
    'bad_word_count',
    # --- n-gram columns ---
    'fraction_duplicate_5_ngram', 'fraction_duplicate_6_ngram',
    'fraction_duplicate_7_ngram', 'fraction_duplicate_8_ngram',
    'fraction_duplicate_9_ngram', 'fraction_duplicate_10_ngram',
    'fraction_top_2_ngram', 'fraction_top_3_ngram',
    'fraction_top_4_ngram', 'fraction_top_5_ngram',
    # --- paragraph and sentence columns ---
    'symbol_to_word_ratio', 'avg_paragraph_length', 'paragraph_length_variance',
    'unique_sentence_beginnings_ratio',
    'semicolons_per_sentence', 'dashes_per_sentence', 'colons_per_sentence',
    'formal_words_ratio', 'commas_per_sentence',
    'short_sentences_ratio', 'long_sentences_ratio',
    'cohesive_words_per_sentence', 'quotes_and_references_per_sentence',
    # --- Markdown columns (*_md) ---
    'headers_per_1000_chars_md', 'average_header_level_md',
    'bold_per_1000_chars_md', 'italic_per_1000_chars_md',
    'unordered_list_items_per_1000_chars_md', 'ordered_list_items_per_1000_chars_md',
    'links_per_1000_chars_md', 'images_per_1000_chars_md',
    'inline_code_fragments_per_1000_chars_md', 'code_blocks_per_1000_chars_md',
    'blockquotes_per_1000_chars_md', 'horizontal_rules_per_1000_chars_md',
    # --- per-character ratio columns ---
    'char_ratio_#', 'char_ratio_*', 'char_ratio_-', 'char_ratio_+',
    'char_ratio_[', 'char_ratio_]', 'char_ratio_(', 'char_ratio_)',
    'char_ratio_`', 'char_ratio_>', 'char_ratio__', 'char_ratio_!',
    'special_chars_ratio_md', 'lowercase_ratio_md', 'uppercase_ratio_md',
    'digit_ratio_md', 'whitespace_ratio_md',
    # --- table columns ---
    'table_pipe_count', 'table_pipe_ratio', 'table_pipe_per_1000_chars',
    'table_lines_count', 'table_lines_ratio', 'table_header_separators_count',
    'avg_pipes_per_table_line', 'estimated_avg_columns',
    # --- word-frequency columns ---
    'word_count', 'unique_word_count', 'top_word_count', 'top_word_ratio',
    'top_5_ratio', 'top_10_ratio', 'hapax_legomena_ratio', 'looping_suspicion',
    # --- diacritics and encoding columns ---
    'polish_diacritics_count', 'polish_diacritics_ratio',
    'polish_diacritics_per_word', 'diacritics_to_letters_ratio',
    'replacement_char_count', 'replacement_char_ratio',
    'not_allowed_chars_count', 'not_allowed_chars_ratio',
    'encoding_suspicion',
    # --- single-character word columns ---
    'single_char_word_count', 'single_char_unique_count',
    'single_char_upper_count', 'single_char_lower_count',
    'single_char_upper_unique_count', 'single_char_lower_unique_count',
    'single_char_top_1_codepoint', 'single_char_top_2_codepoint',
    'single_char_top_3_codepoint',
    # --- readability and line-shape columns ---
    'question_sentence_ratio', 'single_word_line_ratio',
    'repeated_word_line_ratio', 'lix', 'rix', 'diacritics_std_dev',
    # --- named-entity and morphology columns ---
    'ner_count', 'ner_person_ratio', 'ner_org_ratio', 'ner_loc_ratio',
    'ner_misc_ratio', 'case_diversity', 'tense_diversity', 'mood_diversity',
    # --- top-words columns ---
    'top_words_total_count',
    'top_words_noun_ratio', 'top_words_verb_ratio',
    'top_words_adj_ratio', 'top_words_other_ratio',
    'top_words_noun_prop_of_all_nouns', 'top_words_verb_prop_of_all_verbs',
    'top_words_adj_prop_of_all_adjs', 'top_words_other_prop_of_all_others',
    # --- remaining columns ---
    'avg_dependency_tree_depth',
    'digit_start_lines',
]