Commit 6f25c5c
Parent(s): d463071

new tool to analyse our own doc

Files changed:
- .gitignore (+2, -0)
- app.py (+132, -4)
- parameters_filtering.py (+2, -2)
.gitignore (ADDED)

@@ -0,0 +1,2 @@
+*cpython-39.pyc
+.DS_Store
app.py (CHANGED)

@@ -13,7 +13,7 @@ import numpy as np

 import matplotlib.pyplot as plt

-from filtering import Filtering
+from filtering import LoadParameters, ModifyingDocuments, Filtering


 class Visualization:
@@ -25,6 +25,10 @@ class Visualization:
         num_docs,
         num_docs_for_words,
         max_len_text_display,
+        lang_dataset_id,
+        path_fasttext_model,
+        path_sentencepiece_model,
+        path_kenlm_model,
     ):
         self.path_instructions = path_instructions
         self.path_data = path_data
@@ -33,6 +37,23 @@ class Visualization:
         self.num_docs_for_words = num_docs_for_words
         self.max_len_text_display = max_len_text_display

+        self.lang_dataset_id = lang_dataset_id
+        self.param = LoadParameters.load_parameters(lang_dataset_id)
+        self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
+        self.badwords = LoadParameters.load_badwords(lang_dataset_id)
+        self.model_lang_id = LoadParameters.load_model_lang_id(
+            lang_dataset_id, path_fasttext_model
+        )
+        self.sentencepiece_model = LoadParameters.load_sentencepiece_model(
+            lang_dataset_id, path_sentencepiece_model
+        )
+        self.sentencepiece_model_tok = (
+            self.sentencepiece_model if self.param["tokenization"] else None
+        )
+        self.kenlm_model = LoadParameters.load_kenlm_model(
+            lang_dataset_id, path_kenlm_model
+        )
+
     def preamble(self):
         st.markdown(
             "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail."
@@ -159,6 +180,7 @@ class Visualization:
                     "repetitions_ratio",
                     cutoff_repetitions_ratio,
                     True,
+                    repetitions_length,
                 )
                 keys.append(new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
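The hunk above appends repetitions_length as a fourth element of each filtering key; the new analyse_personal_doc method in the next hunk reads it back as int(key[3]) and passes it to Filtering.compute_repetitions_ratio. That function lives in filtering.py and is not part of this commit, so the following is only a minimal sketch, under the assumption that the ratio measures how much of the document is covered by duplicated character n-grams of the chosen length; the function name and definition here are illustrative, not the repository's code.

from collections import Counter

def repetitions_ratio(document: str, repetitions_length: int) -> float:
    # Fraction of character n-grams of the given length that occur more than once.
    ngrams = [
        document[i : i + repetitions_length]
        for i in range(len(document) - repetitions_length + 1)
    ]
    if not ngrams:
        return 0.0
    counts = Counter(ngrams)
    duplicated = sum(count for count in counts.values() if count > 1)
    return duplicated / len(ngrams)

print(repetitions_ratio("spam spam spam", 4))    # 1.0: every 4-gram is repeated
print(repetitions_ratio("a varied sentence", 4)) # 0.0: no repeated 4-grams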
@@ -392,8 +414,104 @@ class Visualization:
             ax.set_ylabel("frequency in the documents")
             st.pyplot(fig)

-    def …
-        …
+    def analyse_personal_doc(self):
+        st.header("Analyse your own document")
+
+        personal_doc = st.text_area(
+            label="Paste here the document you want to analyse",
+            value="",
+            max_chars=10000,
+        )
+
+        is_discarded = False
+
+        def is_doc_discarded(key, score):
+            if key[2]:  # max cutoff
+                return score > key[1]
+            else:
+                return score < key[1]
+
+        for key in self.keys:
+            if key[0] == "number_words":
+                words = ModifyingDocuments.get_words_from_document(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    lower_case=False,
+                    strip_characters=self.param["strip_characters"],
+                )
+                if key[2]:
+                    st.markdown(f"Number of words: {len(words)}")
+                if is_doc_discarded(key, len(words)):
+                    is_discarded = True
+
+            elif key[0] == "repetitions_ratio":
+                repetitions_ratio = Filtering.compute_repetitions_ratio(personal_doc, int(key[3]))
+                repetitions_ratio = round(repetitions_ratio, 3)
+                st.markdown(f"Repetitions ratio: {repetitions_ratio}")
+                if is_doc_discarded(key, repetitions_ratio):
+                    is_discarded = True
+
+            elif key[0] == "special_characters_ratio":
+                special_characters_ratio = Filtering.compute_special_characters_ratio(
+                    personal_doc, self.param["special_characters"]
+                )
+                special_characters_ratio = round(special_characters_ratio, 3)
+                st.markdown(f"Special characters ratio: {special_characters_ratio}")
+                if is_doc_discarded(key, special_characters_ratio):
+                    is_discarded = True
+
+            elif key[0] == "stopwords_ratio":
+                stopwords_ratio = Filtering.compute_stopwords_ratio(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    self.param["strip_characters"],
+                    self.param["cond_words_augmentation"],
+                    self.param["words_augmentation_group_sizes"],
+                    self.param["words_augmentation_join_char"],
+                    self.stopwords,
+                )
+                stopwords_ratio = round(stopwords_ratio, 3)
+                st.markdown(f"Stop words ratio: {stopwords_ratio}")
+                if is_doc_discarded(key, stopwords_ratio):
+                    is_discarded = True
+
+            elif key[0] == "badwords_ratio":
+                badwords_ratio = Filtering.compute_badwords_ratio(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    self.param["strip_characters"],
+                    self.param["cond_words_augmentation"],
+                    self.param["words_augmentation_group_sizes"],
+                    self.param["words_augmentation_join_char"],
+                    self.badwords,
+                )
+                badwords_ratio = round(badwords_ratio, 3)
+                st.markdown(f"Flagged words ratio: {badwords_ratio}")
+                if is_doc_discarded(key, badwords_ratio):
+                    is_discarded = True
+
+            elif key[0] == "lang_id_score":
+                lang_pred_dataset_id, lang_id_score = Filtering.compute_lang_id_pred_score(
+                    personal_doc, self.model_lang_id
+                )
+                lang_id_score = round(lang_id_score, 3)
+                st.markdown(f"Language identification confidence score: {lang_id_score}")
+                if is_doc_discarded(key, badwords_ratio) or (self.lang_dataset_id != lang_pred_dataset_id):
+                    is_discarded = True
+
+            elif key[0] == "perplexity_score":
+                perplexity_score = Filtering.compute_perplexity_score(
+                    personal_doc,
+                    self.sentencepiece_model,
+                    self.kenlm_model,
+                )
+                perplexity_score = round(perplexity_score, 3)
+                st.markdown(f"Perplexity score: {perplexity_score}")
+                if is_doc_discarded(key, perplexity_score):
+                    is_discarded = True
+
+        is_discarded = "" if is_discarded else "not "
+        st.markdown(f"With the current filtering parameters, this document is {is_discarded}discarded.")

     def download_data(self):
         st.header("Download data")
@@ -413,7 +531,7 @@ class Visualization:
         self.filtering_of_words()
         self.plot_distributions_filtering_parameters()
         #self.plot_zipf_law()
-        self.…
+        self.analyse_personal_doc()
         self.download_data()


@@ -424,6 +542,12 @@ num_docs = 5000
 num_docs_for_words = 500
 max_len_text_display = 10000

+# Only useful for analyse_personal_doc
+lang_dataset_id = "en"
+path_fasttext_model = "./lid.176.bin"
+path_sentencepiece_model = "./en.sp.model"
+path_kenlm_model = "./en.arpa.bin"
+
 visualization = Visualization(
     path_instructions,
     path_data,
@@ -431,5 +555,9 @@ visualization = Visualization(
     num_docs,
     num_docs_for_words,
     max_len_text_display,
+    lang_dataset_id,
+    path_fasttext_model,
+    path_sentencepiece_model,
+    path_kenlm_model,
 )
 visualization.visualization()
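Each entry of self.keys that analyse_personal_doc loops over is a tuple of the metric name, the cutoff chosen in the app, a flag saying whether the cutoff is a maximum, and, for the repetitions ratio, the repetitions length. The helper is_doc_discarded simply flips the comparison direction on that flag. A self-contained sketch of the rule, with made-up cutoffs and scores for illustration:

def is_doc_discarded(key, score):
    # key = (metric_name, cutoff, is_max_cutoff): a max cutoff discards documents
    # scoring above it, a min cutoff discards documents scoring below it.
    if key[2]:  # max cutoff
        return score > key[1]
    return score < key[1]

# Hypothetical keys and scores, not values produced by the app:
keys = [
    ("number_words", 10, False),              # discard if fewer than 10 words
    ("special_characters_ratio", 0.4, True),  # discard if more than 40% special characters
]
scores = {"number_words": 3, "special_characters_ratio": 0.12}

discarded = any(is_doc_discarded(key, scores[key[0]]) for key in keys)
print("discarded" if discarded else "kept")  # "discarded": only 3 words

Note that in the lang_id_score branch of the diff above, is_doc_discarded is called with badwords_ratio rather than lang_id_score; given the key layout, the score being tested there is presumably meant to be lang_id_score.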
    	
parameters_filtering.py (CHANGED)

@@ -7,8 +7,8 @@ other_special_characters = (
     "         ’“”–ー一▬…✦�£•€«»°·═"
     "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰‑≤≥‖"
     "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
-    "…
-    "…
+    "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
+    "」﴾》"
 )
 emoji = list(emoji.UNICODE_EMOJI["en"].keys())

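other_special_characters extends the pool of characters that the per-language parameters treat as special; the app.py diff passes self.param["special_characters"] to Filtering.compute_special_characters_ratio, whose implementation is not shown in this commit. A rough sketch, assuming the ratio is simply the share of characters in the document that belong to the set (the function below is illustrative, not the repository's code):

def special_characters_ratio(document: str, special_characters: set) -> float:
    # Share of characters in the document that belong to the special-character set.
    if not document:
        return 0.0
    return sum(char in special_characters for char in document) / len(document)

# Tiny, made-up character set for illustration:
print(special_characters_ratio("hello‿world➤", {"‿", "➤"}))  # 2 of 12 characters, about 0.167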

