Spaces:
Runtime error
Runtime error
Commit · 150f094
1 Parent(s): 78a727e
Update parameters for Indonesian (and default ones)
Browse files
- parameters_filtering.py +59 -59
parameters_filtering.py
CHANGED
|
@@ -26,34 +26,34 @@ parameters_filtering_default = {
|
|
| 26 |
"cond_check_number_words": True,
|
| 27 |
"tokenization": False,
|
| 28 |
"strip_characters": special_characters_default,
|
| 29 |
-
"number_words_min_cutoff":
|
| 30 |
"number_words_max_cutoff": 100000,
|
| 31 |
"cond_check_character_repetition_removal": True,
|
| 32 |
"character_repetition_length": 10,
|
| 33 |
-
"character_repetition_max_cutoff": 0.
|
| 34 |
"cond_check_word_repetition_removal": True,
|
| 35 |
"word_repetition_length": 5,
|
| 36 |
-
"word_repetition_max_cutoff": 0.
|
| 37 |
"cond_check_special_characters": True,
|
| 38 |
"special_characters": special_characters_default,
|
| 39 |
"special_characters_max_cutoff": 0.4,
|
| 40 |
"cond_words_augmentation": False,
|
| 41 |
"words_augmentation_group_sizes": [],
|
| 42 |
"words_augmentation_join_char": "",
|
| 43 |
-
"cond_check_stopwords":
|
| 44 |
-
"stopwords_min_cutoff": 0,
|
| 45 |
-
"cond_check_flagged_words":
|
| 46 |
-
"flagged_words_max_cutoff": 0.
|
| 47 |
"cond_check_lang_id": True,
|
| 48 |
"lang_id_min_cutoff": 0.70,
|
| 49 |
"cond_check_perplexity": False,
|
| 50 |
-
"perplexity_max_cutoff":
|
| 51 |
}
|
| 52 |
|
| 53 |
parameters_filtering_af = {
|
| 54 |
"cond_uniform_whitespace": True,
|
| 55 |
"cond_replace_unicode_punctuation": False,
|
| 56 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 57 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 58 |
"cond_remove_long_words": True,
|
| 59 |
"length_word_max_cutoff": 25,
|
|
@@ -76,7 +76,7 @@ parameters_filtering_af = {
|
|
| 76 |
"words_augmentation_join_char": "",
|
| 77 |
"cond_check_stopwords": True,
|
| 78 |
"stopwords_min_cutoff": 0,
|
| 79 |
-
"cond_check_flagged_words":
|
| 80 |
"flagged_words_max_cutoff": 0.2,
|
| 81 |
"cond_check_lang_id": True,
|
| 82 |
"lang_id_min_cutoff": 0.6,
|
|
@@ -87,7 +87,7 @@ parameters_filtering_af = {
|
|
| 87 |
parameters_filtering_ar = {
|
| 88 |
"cond_uniform_whitespace": True,
|
| 89 |
"cond_replace_unicode_punctuation": False,
|
| 90 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 91 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 92 |
"cond_remove_long_words": True,
|
| 93 |
"length_word_max_cutoff": 25,
|
|
@@ -110,7 +110,7 @@ parameters_filtering_ar = {
|
|
| 110 |
"words_augmentation_join_char": "",
|
| 111 |
"cond_check_stopwords": True,
|
| 112 |
"stopwords_min_cutoff": 0,
|
| 113 |
-
"cond_check_flagged_words":
|
| 114 |
"flagged_words_max_cutoff": 0.2,
|
| 115 |
"cond_check_lang_id": True,
|
| 116 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -121,7 +121,7 @@ parameters_filtering_ar = {
|
|
| 121 |
parameters_filtering_arz = {
|
| 122 |
"cond_uniform_whitespace": True,
|
| 123 |
"cond_replace_unicode_punctuation": False,
|
| 124 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 125 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 126 |
"cond_remove_long_words": True,
|
| 127 |
"length_word_max_cutoff": 25,
|
|
@@ -144,7 +144,7 @@ parameters_filtering_arz = {
|
|
| 144 |
"words_augmentation_join_char": "",
|
| 145 |
"cond_check_stopwords": True,
|
| 146 |
"stopwords_min_cutoff": 0,
|
| 147 |
-
"cond_check_flagged_words":
|
| 148 |
"flagged_words_max_cutoff": 0.2,
|
| 149 |
"cond_check_lang_id": True,
|
| 150 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -155,7 +155,7 @@ parameters_filtering_arz = {
|
|
| 155 |
parameters_filtering_as = {
|
| 156 |
"cond_uniform_whitespace": True,
|
| 157 |
"cond_replace_unicode_punctuation": False,
|
| 158 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 159 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 160 |
"cond_remove_long_words": True,
|
| 161 |
"length_word_max_cutoff": 25,
|
|
@@ -178,7 +178,7 @@ parameters_filtering_as = {
|
|
| 178 |
"words_augmentation_join_char": "",
|
| 179 |
"cond_check_stopwords": True,
|
| 180 |
"stopwords_min_cutoff": 0,
|
| 181 |
-
"cond_check_flagged_words":
|
| 182 |
"flagged_words_max_cutoff": 0.2,
|
| 183 |
"cond_check_lang_id": True,
|
| 184 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -189,7 +189,7 @@ parameters_filtering_as = {
|
|
| 189 |
parameters_filtering_bn = {
|
| 190 |
"cond_uniform_whitespace": True,
|
| 191 |
"cond_replace_unicode_punctuation": False,
|
| 192 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 193 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 194 |
"cond_remove_long_words": True,
|
| 195 |
"length_word_max_cutoff": 30,
|
|
@@ -212,7 +212,7 @@ parameters_filtering_bn = {
|
|
| 212 |
"words_augmentation_join_char": "",
|
| 213 |
"cond_check_stopwords": True,
|
| 214 |
"stopwords_min_cutoff": 0.05,
|
| 215 |
-
"cond_check_flagged_words":
|
| 216 |
"flagged_words_max_cutoff": 0.2,
|
| 217 |
"cond_check_lang_id": True,
|
| 218 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -246,7 +246,7 @@ parameters_filtering_ca = {
|
|
| 246 |
"words_augmentation_join_char": "",
|
| 247 |
"cond_check_stopwords": True,
|
| 248 |
"stopwords_min_cutoff": 0.25,
|
| 249 |
-
"cond_check_flagged_words":
|
| 250 |
"flagged_words_max_cutoff": 0.1,
|
| 251 |
"cond_check_lang_id": True,
|
| 252 |
"lang_id_min_cutoff": 0.8,
|
|
@@ -291,7 +291,7 @@ parameters_filtering_en = {
|
|
| 291 |
parameters_filtering_es = {
|
| 292 |
"cond_uniform_whitespace": True,
|
| 293 |
"cond_replace_unicode_punctuation": False,
|
| 294 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 295 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 296 |
"cond_remove_long_words": True,
|
| 297 |
"length_word_max_cutoff": 30,
|
|
@@ -314,7 +314,7 @@ parameters_filtering_es = {
|
|
| 314 |
"words_augmentation_join_char": "",
|
| 315 |
"cond_check_stopwords": True,
|
| 316 |
"stopwords_min_cutoff": 0.2,
|
| 317 |
-
"cond_check_flagged_words":
|
| 318 |
"flagged_words_max_cutoff": 0.2,
|
| 319 |
"cond_check_lang_id": True,
|
| 320 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -325,7 +325,7 @@ parameters_filtering_es = {
|
|
| 325 |
parameters_filtering_eu = {
|
| 326 |
"cond_uniform_whitespace": True,
|
| 327 |
"cond_replace_unicode_punctuation": False,
|
| 328 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 329 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 330 |
"cond_remove_long_words": True,
|
| 331 |
"length_word_max_cutoff": 35,
|
|
@@ -348,7 +348,7 @@ parameters_filtering_eu = {
|
|
| 348 |
"words_augmentation_join_char": "",
|
| 349 |
"cond_check_stopwords": True,
|
| 350 |
"stopwords_min_cutoff": 0,
|
| 351 |
-
"cond_check_flagged_words":
|
| 352 |
"flagged_words_max_cutoff": 0.2,
|
| 353 |
"cond_check_lang_id": True,
|
| 354 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -382,7 +382,7 @@ parameters_filtering_fr = {
|
|
| 382 |
"words_augmentation_join_char": "",
|
| 383 |
"cond_check_stopwords": True,
|
| 384 |
"stopwords_min_cutoff": 0.27,
|
| 385 |
-
"cond_check_flagged_words":
|
| 386 |
"flagged_words_max_cutoff": 0.008,
|
| 387 |
"cond_check_lang_id": True,
|
| 388 |
"lang_id_min_cutoff": 0.8,
|
|
@@ -393,7 +393,7 @@ parameters_filtering_fr = {
|
|
| 393 |
parameters_filtering_gu = {
|
| 394 |
"cond_uniform_whitespace": True,
|
| 395 |
"cond_replace_unicode_punctuation": False,
|
| 396 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 397 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 398 |
"cond_remove_long_words": True,
|
| 399 |
"length_word_max_cutoff": 30,
|
|
@@ -416,7 +416,7 @@ parameters_filtering_gu = {
|
|
| 416 |
"words_augmentation_join_char": "",
|
| 417 |
"cond_check_stopwords": True,
|
| 418 |
"stopwords_min_cutoff": 0,
|
| 419 |
-
"cond_check_flagged_words":
|
| 420 |
"flagged_words_max_cutoff": 0.2,
|
| 421 |
"cond_check_lang_id": True,
|
| 422 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -427,7 +427,7 @@ parameters_filtering_gu = {
|
|
| 427 |
parameters_filtering_hi = {
|
| 428 |
"cond_uniform_whitespace": True,
|
| 429 |
"cond_replace_unicode_punctuation": False,
|
| 430 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 431 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 432 |
"cond_remove_long_words": True,
|
| 433 |
"length_word_max_cutoff": 25,
|
|
@@ -450,7 +450,7 @@ parameters_filtering_hi = {
|
|
| 450 |
"words_augmentation_join_char": "",
|
| 451 |
"cond_check_stopwords": True,
|
| 452 |
"stopwords_min_cutoff": 0,
|
| 453 |
-
"cond_check_flagged_words":
|
| 454 |
"flagged_words_max_cutoff": 0.2,
|
| 455 |
"cond_check_lang_id": True,
|
| 456 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -461,41 +461,41 @@ parameters_filtering_hi = {
|
|
| 461 |
parameters_filtering_id = {
|
| 462 |
"cond_uniform_whitespace": True,
|
| 463 |
"cond_replace_unicode_punctuation": False,
|
| 464 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 465 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 466 |
"cond_remove_long_words": True,
|
| 467 |
"length_word_max_cutoff": 30,
|
| 468 |
"cond_check_number_words": True,
|
| 469 |
"tokenization": False,
|
| 470 |
"strip_characters": special_characters_default,
|
| 471 |
-
"number_words_min_cutoff":
|
| 472 |
"number_words_max_cutoff": 100000,
|
| 473 |
"cond_check_character_repetition_removal": True,
|
| 474 |
"character_repetition_length": 10,
|
| 475 |
-
"character_repetition_max_cutoff": 0.
|
| 476 |
"cond_check_word_repetition_removal": True,
|
| 477 |
"word_repetition_length": 5,
|
| 478 |
-
"word_repetition_max_cutoff": 0.
|
| 479 |
"cond_check_special_characters": True,
|
| 480 |
"special_characters": special_characters_default,
|
| 481 |
-
"special_characters_max_cutoff": 0.
|
| 482 |
"cond_words_augmentation": False,
|
| 483 |
"words_augmentation_group_sizes": [],
|
| 484 |
"words_augmentation_join_char": "",
|
| 485 |
"cond_check_stopwords": True,
|
| 486 |
-
"stopwords_min_cutoff": 0.
|
| 487 |
-
"cond_check_flagged_words":
|
| 488 |
-
"flagged_words_max_cutoff": 0.
|
| 489 |
"cond_check_lang_id": True,
|
| 490 |
-
"lang_id_min_cutoff": 0.
|
| 491 |
"cond_check_perplexity": True,
|
| 492 |
-
"perplexity_max_cutoff":
|
| 493 |
}
|
| 494 |
|
| 495 |
parameters_filtering_kn = {
|
| 496 |
"cond_uniform_whitespace": True,
|
| 497 |
"cond_replace_unicode_punctuation": False,
|
| 498 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 499 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 500 |
"cond_remove_long_words": True,
|
| 501 |
"length_word_max_cutoff": 50,
|
|
@@ -518,7 +518,7 @@ parameters_filtering_kn = {
|
|
| 518 |
"words_augmentation_join_char": "",
|
| 519 |
"cond_check_stopwords": True,
|
| 520 |
"stopwords_min_cutoff": 0,
|
| 521 |
-
"cond_check_flagged_words":
|
| 522 |
"flagged_words_max_cutoff": 0.2,
|
| 523 |
"cond_check_lang_id": True,
|
| 524 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -529,7 +529,7 @@ parameters_filtering_kn = {
|
|
| 529 |
parameters_filtering_ml = {
|
| 530 |
"cond_uniform_whitespace": True,
|
| 531 |
"cond_replace_unicode_punctuation": False,
|
| 532 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 533 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 534 |
"cond_remove_long_words": True,
|
| 535 |
"length_word_max_cutoff": 50,
|
|
@@ -552,7 +552,7 @@ parameters_filtering_ml = {
|
|
| 552 |
"words_augmentation_join_char": "",
|
| 553 |
"cond_check_stopwords": True,
|
| 554 |
"stopwords_min_cutoff": 0,
|
| 555 |
-
"cond_check_flagged_words":
|
| 556 |
"flagged_words_max_cutoff": 0.2,
|
| 557 |
"cond_check_lang_id": True,
|
| 558 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -563,7 +563,7 @@ parameters_filtering_ml = {
|
|
| 563 |
parameters_filtering_mr = {
|
| 564 |
"cond_uniform_whitespace": True,
|
| 565 |
"cond_replace_unicode_punctuation": False,
|
| 566 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 567 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 568 |
"cond_remove_long_words": True,
|
| 569 |
"length_word_max_cutoff": 30,
|
|
@@ -586,7 +586,7 @@ parameters_filtering_mr = {
|
|
| 586 |
"words_augmentation_join_char": "",
|
| 587 |
"cond_check_stopwords": True,
|
| 588 |
"stopwords_min_cutoff": 0,
|
| 589 |
-
"cond_check_flagged_words":
|
| 590 |
"flagged_words_max_cutoff": 0.2,
|
| 591 |
"cond_check_lang_id": True,
|
| 592 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -620,7 +620,7 @@ parameters_filtering_pt = {
|
|
| 620 |
"words_augmentation_join_char": "",
|
| 621 |
"cond_check_stopwords": True,
|
| 622 |
"stopwords_min_cutoff": 0.2,
|
| 623 |
-
"cond_check_flagged_words":
|
| 624 |
"flagged_words_max_cutoff": 0.007,
|
| 625 |
"cond_check_lang_id": True,
|
| 626 |
"lang_id_min_cutoff": 0.6,
|
|
@@ -631,7 +631,7 @@ parameters_filtering_pt = {
|
|
| 631 |
parameters_filtering_sw = {
|
| 632 |
"cond_uniform_whitespace": True,
|
| 633 |
"cond_replace_unicode_punctuation": False,
|
| 634 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 635 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 636 |
"cond_remove_long_words": True,
|
| 637 |
"length_word_max_cutoff": 30,
|
|
@@ -654,7 +654,7 @@ parameters_filtering_sw = {
|
|
| 654 |
"words_augmentation_join_char": "",
|
| 655 |
"cond_check_stopwords": True,
|
| 656 |
"stopwords_min_cutoff": 0,
|
| 657 |
-
"cond_check_flagged_words":
|
| 658 |
"flagged_words_max_cutoff": 0.2,
|
| 659 |
"cond_check_lang_id": True,
|
| 660 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -665,7 +665,7 @@ parameters_filtering_sw = {
|
|
| 665 |
parameters_filtering_ta = {
|
| 666 |
"cond_uniform_whitespace": True,
|
| 667 |
"cond_replace_unicode_punctuation": False,
|
| 668 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 669 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 670 |
"cond_remove_long_words": True,
|
| 671 |
"length_word_max_cutoff": 50,
|
|
@@ -688,7 +688,7 @@ parameters_filtering_ta = {
|
|
| 688 |
"words_augmentation_join_char": "",
|
| 689 |
"cond_check_stopwords": True,
|
| 690 |
"stopwords_min_cutoff": 0,
|
| 691 |
-
"cond_check_flagged_words":
|
| 692 |
"flagged_words_max_cutoff": 0.2,
|
| 693 |
"cond_check_lang_id": True,
|
| 694 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -699,7 +699,7 @@ parameters_filtering_ta = {
|
|
| 699 |
parameters_filtering_te = {
|
| 700 |
"cond_uniform_whitespace": True,
|
| 701 |
"cond_replace_unicode_punctuation": False,
|
| 702 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 703 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 704 |
"cond_remove_long_words": True,
|
| 705 |
"length_word_max_cutoff": 35,
|
|
@@ -722,7 +722,7 @@ parameters_filtering_te = {
|
|
| 722 |
"words_augmentation_join_char": "",
|
| 723 |
"cond_check_stopwords": True,
|
| 724 |
"stopwords_min_cutoff": 0,
|
| 725 |
-
"cond_check_flagged_words":
|
| 726 |
"flagged_words_max_cutoff": 0.2,
|
| 727 |
"cond_check_lang_id": True,
|
| 728 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -733,7 +733,7 @@ parameters_filtering_te = {
|
|
| 733 |
parameters_filtering_ur = {
|
| 734 |
"cond_uniform_whitespace": True,
|
| 735 |
"cond_replace_unicode_punctuation": False,
|
| 736 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 737 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 738 |
"cond_remove_long_words": True,
|
| 739 |
"length_word_max_cutoff": 30,
|
|
@@ -756,7 +756,7 @@ parameters_filtering_ur = {
|
|
| 756 |
"words_augmentation_join_char": "",
|
| 757 |
"cond_check_stopwords": True,
|
| 758 |
"stopwords_min_cutoff": 0,
|
| 759 |
-
"cond_check_flagged_words":
|
| 760 |
"flagged_words_max_cutoff": 0.2,
|
| 761 |
"cond_check_lang_id": True,
|
| 762 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -767,7 +767,7 @@ parameters_filtering_ur = {
|
|
| 767 |
parameters_filtering_vi = {
|
| 768 |
"cond_uniform_whitespace": True,
|
| 769 |
"cond_replace_unicode_punctuation": False,
|
| 770 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 771 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 772 |
"cond_remove_long_words": True,
|
| 773 |
"length_word_max_cutoff": 30,
|
|
@@ -790,7 +790,7 @@ parameters_filtering_vi = {
|
|
| 790 |
"words_augmentation_join_char": " ",
|
| 791 |
"cond_check_stopwords": True,
|
| 792 |
"stopwords_min_cutoff": 0,
|
| 793 |
-
"cond_check_flagged_words":
|
| 794 |
"flagged_words_max_cutoff": 0.2,
|
| 795 |
"cond_check_lang_id": True,
|
| 796 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -801,7 +801,7 @@ parameters_filtering_vi = {
|
|
| 801 |
parameters_filtering_yo = {
|
| 802 |
"cond_uniform_whitespace": True,
|
| 803 |
"cond_replace_unicode_punctuation": False,
|
| 804 |
-
"cond_remove_words_with_incorrect_substrings":
|
| 805 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 806 |
"cond_remove_long_words": True,
|
| 807 |
"length_word_max_cutoff": 30,
|
|
@@ -824,7 +824,7 @@ parameters_filtering_yo = {
|
|
| 824 |
"words_augmentation_join_char": "",
|
| 825 |
"cond_check_stopwords": True,
|
| 826 |
"stopwords_min_cutoff": 0,
|
| 827 |
-
"cond_check_flagged_words":
|
| 828 |
"flagged_words_max_cutoff": 0.2,
|
| 829 |
"cond_check_lang_id": True,
|
| 830 |
"lang_id_min_cutoff": 0.75,
|
|
@@ -856,9 +856,9 @@ parameters_filtering_zh = {
|
|
| 856 |
"cond_words_augmentation": True,
|
| 857 |
"words_augmentation_group_sizes": [2],
|
| 858 |
"words_augmentation_join_char": "",
|
| 859 |
-
"cond_check_stopwords":
|
| 860 |
"stopwords_min_cutoff": 0,
|
| 861 |
-
"cond_check_flagged_words":
|
| 862 |
"flagged_words_max_cutoff": 0.2,
|
| 863 |
"cond_check_lang_id": True,
|
| 864 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 26 |
"cond_check_number_words": True,
|
| 27 |
"tokenization": False,
|
| 28 |
"strip_characters": special_characters_default,
|
| 29 |
+
"number_words_min_cutoff": 10,
|
| 30 |
"number_words_max_cutoff": 100000,
|
| 31 |
"cond_check_character_repetition_removal": True,
|
| 32 |
"character_repetition_length": 10,
|
| 33 |
+
"character_repetition_max_cutoff": 0.2,
|
| 34 |
"cond_check_word_repetition_removal": True,
|
| 35 |
"word_repetition_length": 5,
|
| 36 |
+
"word_repetition_max_cutoff": 0.3,
|
| 37 |
"cond_check_special_characters": True,
|
| 38 |
"special_characters": special_characters_default,
|
| 39 |
"special_characters_max_cutoff": 0.4,
|
| 40 |
"cond_words_augmentation": False,
|
| 41 |
"words_augmentation_group_sizes": [],
|
| 42 |
"words_augmentation_join_char": "",
|
| 43 |
+
"cond_check_stopwords": True,
|
| 44 |
+
"stopwords_min_cutoff": 0.1,
|
| 45 |
+
"cond_check_flagged_words": True,
|
| 46 |
+
"flagged_words_max_cutoff": 0.1,
|
| 47 |
"cond_check_lang_id": True,
|
| 48 |
"lang_id_min_cutoff": 0.70,
|
| 49 |
"cond_check_perplexity": False,
|
| 50 |
+
"perplexity_max_cutoff": 10000,
|
| 51 |
}
|
| 52 |
|
| 53 |
parameters_filtering_af = {
|
| 54 |
"cond_uniform_whitespace": True,
|
| 55 |
"cond_replace_unicode_punctuation": False,
|
| 56 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 57 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 58 |
"cond_remove_long_words": True,
|
| 59 |
"length_word_max_cutoff": 25,
|
|
|
|
| 76 |
"words_augmentation_join_char": "",
|
| 77 |
"cond_check_stopwords": True,
|
| 78 |
"stopwords_min_cutoff": 0,
|
| 79 |
+
"cond_check_flagged_words": True,
|
| 80 |
"flagged_words_max_cutoff": 0.2,
|
| 81 |
"cond_check_lang_id": True,
|
| 82 |
"lang_id_min_cutoff": 0.6,
|
|
|
|
| 87 |
parameters_filtering_ar = {
|
| 88 |
"cond_uniform_whitespace": True,
|
| 89 |
"cond_replace_unicode_punctuation": False,
|
| 90 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 91 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 92 |
"cond_remove_long_words": True,
|
| 93 |
"length_word_max_cutoff": 25,
|
|
|
|
| 110 |
"words_augmentation_join_char": "",
|
| 111 |
"cond_check_stopwords": True,
|
| 112 |
"stopwords_min_cutoff": 0,
|
| 113 |
+
"cond_check_flagged_words": True,
|
| 114 |
"flagged_words_max_cutoff": 0.2,
|
| 115 |
"cond_check_lang_id": True,
|
| 116 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 121 |
parameters_filtering_arz = {
|
| 122 |
"cond_uniform_whitespace": True,
|
| 123 |
"cond_replace_unicode_punctuation": False,
|
| 124 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 125 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 126 |
"cond_remove_long_words": True,
|
| 127 |
"length_word_max_cutoff": 25,
|
|
|
|
| 144 |
"words_augmentation_join_char": "",
|
| 145 |
"cond_check_stopwords": True,
|
| 146 |
"stopwords_min_cutoff": 0,
|
| 147 |
+
"cond_check_flagged_words": True,
|
| 148 |
"flagged_words_max_cutoff": 0.2,
|
| 149 |
"cond_check_lang_id": True,
|
| 150 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 155 |
parameters_filtering_as = {
|
| 156 |
"cond_uniform_whitespace": True,
|
| 157 |
"cond_replace_unicode_punctuation": False,
|
| 158 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 159 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 160 |
"cond_remove_long_words": True,
|
| 161 |
"length_word_max_cutoff": 25,
|
|
|
|
| 178 |
"words_augmentation_join_char": "",
|
| 179 |
"cond_check_stopwords": True,
|
| 180 |
"stopwords_min_cutoff": 0,
|
| 181 |
+
"cond_check_flagged_words": True,
|
| 182 |
"flagged_words_max_cutoff": 0.2,
|
| 183 |
"cond_check_lang_id": True,
|
| 184 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 189 |
parameters_filtering_bn = {
|
| 190 |
"cond_uniform_whitespace": True,
|
| 191 |
"cond_replace_unicode_punctuation": False,
|
| 192 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 193 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 194 |
"cond_remove_long_words": True,
|
| 195 |
"length_word_max_cutoff": 30,
|
|
|
|
| 212 |
"words_augmentation_join_char": "",
|
| 213 |
"cond_check_stopwords": True,
|
| 214 |
"stopwords_min_cutoff": 0.05,
|
| 215 |
+
"cond_check_flagged_words": True,
|
| 216 |
"flagged_words_max_cutoff": 0.2,
|
| 217 |
"cond_check_lang_id": True,
|
| 218 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 246 |
"words_augmentation_join_char": "",
|
| 247 |
"cond_check_stopwords": True,
|
| 248 |
"stopwords_min_cutoff": 0.25,
|
| 249 |
+
"cond_check_flagged_words": True,
|
| 250 |
"flagged_words_max_cutoff": 0.1,
|
| 251 |
"cond_check_lang_id": True,
|
| 252 |
"lang_id_min_cutoff": 0.8,
|
|
|
|
| 291 |
parameters_filtering_es = {
|
| 292 |
"cond_uniform_whitespace": True,
|
| 293 |
"cond_replace_unicode_punctuation": False,
|
| 294 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 295 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 296 |
"cond_remove_long_words": True,
|
| 297 |
"length_word_max_cutoff": 30,
|
|
|
|
| 314 |
"words_augmentation_join_char": "",
|
| 315 |
"cond_check_stopwords": True,
|
| 316 |
"stopwords_min_cutoff": 0.2,
|
| 317 |
+
"cond_check_flagged_words": True,
|
| 318 |
"flagged_words_max_cutoff": 0.2,
|
| 319 |
"cond_check_lang_id": True,
|
| 320 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 325 |
parameters_filtering_eu = {
|
| 326 |
"cond_uniform_whitespace": True,
|
| 327 |
"cond_replace_unicode_punctuation": False,
|
| 328 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 329 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 330 |
"cond_remove_long_words": True,
|
| 331 |
"length_word_max_cutoff": 35,
|
|
|
|
| 348 |
"words_augmentation_join_char": "",
|
| 349 |
"cond_check_stopwords": True,
|
| 350 |
"stopwords_min_cutoff": 0,
|
| 351 |
+
"cond_check_flagged_words": True,
|
| 352 |
"flagged_words_max_cutoff": 0.2,
|
| 353 |
"cond_check_lang_id": True,
|
| 354 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 382 |
"words_augmentation_join_char": "",
|
| 383 |
"cond_check_stopwords": True,
|
| 384 |
"stopwords_min_cutoff": 0.27,
|
| 385 |
+
"cond_check_flagged_words": True,
|
| 386 |
"flagged_words_max_cutoff": 0.008,
|
| 387 |
"cond_check_lang_id": True,
|
| 388 |
"lang_id_min_cutoff": 0.8,
|
|
|
|
| 393 |
parameters_filtering_gu = {
|
| 394 |
"cond_uniform_whitespace": True,
|
| 395 |
"cond_replace_unicode_punctuation": False,
|
| 396 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 397 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 398 |
"cond_remove_long_words": True,
|
| 399 |
"length_word_max_cutoff": 30,
|
|
|
|
| 416 |
"words_augmentation_join_char": "",
|
| 417 |
"cond_check_stopwords": True,
|
| 418 |
"stopwords_min_cutoff": 0,
|
| 419 |
+
"cond_check_flagged_words": True,
|
| 420 |
"flagged_words_max_cutoff": 0.2,
|
| 421 |
"cond_check_lang_id": True,
|
| 422 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 427 |
parameters_filtering_hi = {
|
| 428 |
"cond_uniform_whitespace": True,
|
| 429 |
"cond_replace_unicode_punctuation": False,
|
| 430 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 431 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 432 |
"cond_remove_long_words": True,
|
| 433 |
"length_word_max_cutoff": 25,
|
|
|
|
| 450 |
"words_augmentation_join_char": "",
|
| 451 |
"cond_check_stopwords": True,
|
| 452 |
"stopwords_min_cutoff": 0,
|
| 453 |
+
"cond_check_flagged_words": True,
|
| 454 |
"flagged_words_max_cutoff": 0.2,
|
| 455 |
"cond_check_lang_id": True,
|
| 456 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 461 |
parameters_filtering_id = {
|
| 462 |
"cond_uniform_whitespace": True,
|
| 463 |
"cond_replace_unicode_punctuation": False,
|
| 464 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 465 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 466 |
"cond_remove_long_words": True,
|
| 467 |
"length_word_max_cutoff": 30,
|
| 468 |
"cond_check_number_words": True,
|
| 469 |
"tokenization": False,
|
| 470 |
"strip_characters": special_characters_default,
|
| 471 |
+
"number_words_min_cutoff": 15,
|
| 472 |
"number_words_max_cutoff": 100000,
|
| 473 |
"cond_check_character_repetition_removal": True,
|
| 474 |
"character_repetition_length": 10,
|
| 475 |
+
"character_repetition_max_cutoff": 0.15,
|
| 476 |
"cond_check_word_repetition_removal": True,
|
| 477 |
"word_repetition_length": 5,
|
| 478 |
+
"word_repetition_max_cutoff": 0.20,
|
| 479 |
"cond_check_special_characters": True,
|
| 480 |
"special_characters": special_characters_default,
|
| 481 |
+
"special_characters_max_cutoff": 0.34,
|
| 482 |
"cond_words_augmentation": False,
|
| 483 |
"words_augmentation_group_sizes": [],
|
| 484 |
"words_augmentation_join_char": "",
|
| 485 |
"cond_check_stopwords": True,
|
| 486 |
+
"stopwords_min_cutoff": 0.15,
|
| 487 |
+
"cond_check_flagged_words": True,
|
| 488 |
+
"flagged_words_max_cutoff": 0.01,
|
| 489 |
"cond_check_lang_id": True,
|
| 490 |
+
"lang_id_min_cutoff": 0.7,
|
| 491 |
"cond_check_perplexity": True,
|
| 492 |
+
"perplexity_max_cutoff": 5000,
|
| 493 |
}
|
| 494 |
|
| 495 |
parameters_filtering_kn = {
|
| 496 |
"cond_uniform_whitespace": True,
|
| 497 |
"cond_replace_unicode_punctuation": False,
|
| 498 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 499 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 500 |
"cond_remove_long_words": True,
|
| 501 |
"length_word_max_cutoff": 50,
|
|
|
|
| 518 |
"words_augmentation_join_char": "",
|
| 519 |
"cond_check_stopwords": True,
|
| 520 |
"stopwords_min_cutoff": 0,
|
| 521 |
+
"cond_check_flagged_words": True,
|
| 522 |
"flagged_words_max_cutoff": 0.2,
|
| 523 |
"cond_check_lang_id": True,
|
| 524 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 529 |
parameters_filtering_ml = {
|
| 530 |
"cond_uniform_whitespace": True,
|
| 531 |
"cond_replace_unicode_punctuation": False,
|
| 532 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 533 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 534 |
"cond_remove_long_words": True,
|
| 535 |
"length_word_max_cutoff": 50,
|
|
|
|
| 552 |
"words_augmentation_join_char": "",
|
| 553 |
"cond_check_stopwords": True,
|
| 554 |
"stopwords_min_cutoff": 0,
|
| 555 |
+
"cond_check_flagged_words": True,
|
| 556 |
"flagged_words_max_cutoff": 0.2,
|
| 557 |
"cond_check_lang_id": True,
|
| 558 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 563 |
parameters_filtering_mr = {
|
| 564 |
"cond_uniform_whitespace": True,
|
| 565 |
"cond_replace_unicode_punctuation": False,
|
| 566 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 567 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 568 |
"cond_remove_long_words": True,
|
| 569 |
"length_word_max_cutoff": 30,
|
|
|
|
| 586 |
"words_augmentation_join_char": "",
|
| 587 |
"cond_check_stopwords": True,
|
| 588 |
"stopwords_min_cutoff": 0,
|
| 589 |
+
"cond_check_flagged_words": True,
|
| 590 |
"flagged_words_max_cutoff": 0.2,
|
| 591 |
"cond_check_lang_id": True,
|
| 592 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 620 |
"words_augmentation_join_char": "",
|
| 621 |
"cond_check_stopwords": True,
|
| 622 |
"stopwords_min_cutoff": 0.2,
|
| 623 |
+
"cond_check_flagged_words": True,
|
| 624 |
"flagged_words_max_cutoff": 0.007,
|
| 625 |
"cond_check_lang_id": True,
|
| 626 |
"lang_id_min_cutoff": 0.6,
|
|
|
|
| 631 |
parameters_filtering_sw = {
|
| 632 |
"cond_uniform_whitespace": True,
|
| 633 |
"cond_replace_unicode_punctuation": False,
|
| 634 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 635 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 636 |
"cond_remove_long_words": True,
|
| 637 |
"length_word_max_cutoff": 30,
|
|
|
|
| 654 |
"words_augmentation_join_char": "",
|
| 655 |
"cond_check_stopwords": True,
|
| 656 |
"stopwords_min_cutoff": 0,
|
| 657 |
+
"cond_check_flagged_words": True,
|
| 658 |
"flagged_words_max_cutoff": 0.2,
|
| 659 |
"cond_check_lang_id": True,
|
| 660 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 665 |
parameters_filtering_ta = {
|
| 666 |
"cond_uniform_whitespace": True,
|
| 667 |
"cond_replace_unicode_punctuation": False,
|
| 668 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 669 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 670 |
"cond_remove_long_words": True,
|
| 671 |
"length_word_max_cutoff": 50,
|
|
|
|
| 688 |
"words_augmentation_join_char": "",
|
| 689 |
"cond_check_stopwords": True,
|
| 690 |
"stopwords_min_cutoff": 0,
|
| 691 |
+
"cond_check_flagged_words": True,
|
| 692 |
"flagged_words_max_cutoff": 0.2,
|
| 693 |
"cond_check_lang_id": True,
|
| 694 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 699 |
parameters_filtering_te = {
|
| 700 |
"cond_uniform_whitespace": True,
|
| 701 |
"cond_replace_unicode_punctuation": False,
|
| 702 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 703 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 704 |
"cond_remove_long_words": True,
|
| 705 |
"length_word_max_cutoff": 35,
|
|
|
|
| 722 |
"words_augmentation_join_char": "",
|
| 723 |
"cond_check_stopwords": True,
|
| 724 |
"stopwords_min_cutoff": 0,
|
| 725 |
+
"cond_check_flagged_words": True,
|
| 726 |
"flagged_words_max_cutoff": 0.2,
|
| 727 |
"cond_check_lang_id": True,
|
| 728 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 733 |
parameters_filtering_ur = {
|
| 734 |
"cond_uniform_whitespace": True,
|
| 735 |
"cond_replace_unicode_punctuation": False,
|
| 736 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 737 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 738 |
"cond_remove_long_words": True,
|
| 739 |
"length_word_max_cutoff": 30,
|
|
|
|
| 756 |
"words_augmentation_join_char": "",
|
| 757 |
"cond_check_stopwords": True,
|
| 758 |
"stopwords_min_cutoff": 0,
|
| 759 |
+
"cond_check_flagged_words": True,
|
| 760 |
"flagged_words_max_cutoff": 0.2,
|
| 761 |
"cond_check_lang_id": True,
|
| 762 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 767 |
parameters_filtering_vi = {
|
| 768 |
"cond_uniform_whitespace": True,
|
| 769 |
"cond_replace_unicode_punctuation": False,
|
| 770 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 771 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 772 |
"cond_remove_long_words": True,
|
| 773 |
"length_word_max_cutoff": 30,
|
|
|
|
| 790 |
"words_augmentation_join_char": " ",
|
| 791 |
"cond_check_stopwords": True,
|
| 792 |
"stopwords_min_cutoff": 0,
|
| 793 |
+
"cond_check_flagged_words": True,
|
| 794 |
"flagged_words_max_cutoff": 0.2,
|
| 795 |
"cond_check_lang_id": True,
|
| 796 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 801 |
parameters_filtering_yo = {
|
| 802 |
"cond_uniform_whitespace": True,
|
| 803 |
"cond_replace_unicode_punctuation": False,
|
| 804 |
+
"cond_remove_words_with_incorrect_substrings": True,
|
| 805 |
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
|
| 806 |
"cond_remove_long_words": True,
|
| 807 |
"length_word_max_cutoff": 30,
|
|
|
|
| 824 |
"words_augmentation_join_char": "",
|
| 825 |
"cond_check_stopwords": True,
|
| 826 |
"stopwords_min_cutoff": 0,
|
| 827 |
+
"cond_check_flagged_words": True,
|
| 828 |
"flagged_words_max_cutoff": 0.2,
|
| 829 |
"cond_check_lang_id": True,
|
| 830 |
"lang_id_min_cutoff": 0.75,
|
|
|
|
| 856 |
"cond_words_augmentation": True,
|
| 857 |
"words_augmentation_group_sizes": [2],
|
| 858 |
"words_augmentation_join_char": "",
|
| 859 |
+
"cond_check_stopwords": True,
|
| 860 |
"stopwords_min_cutoff": 0,
|
| 861 |
+
"cond_check_flagged_words": True,
|
| 862 |
"flagged_words_max_cutoff": 0.2,
|
| 863 |
"cond_check_lang_id": True,
|
| 864 |
"lang_id_min_cutoff": 0.75,
|