Update curated.py

curated.py  (+201 −24)
@@ -571,6 +571,183 @@ phil_examples = Div(
     ),
 )
 
+arx_examples = Div(
+    Div(
+        get_arx_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_S2ORC_data(data_source: str = "S2ORC", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "S2ORC":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/s2orc_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="S2ORC",
+        data_sources="S2ORC",
+        target=target,
+    )
+
+s2o_examples = Div(
+    Div(
+        get_S2ORC_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_S2ORCA_data(data_source: str = "S2ORC Abstract", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "S2ORC Abstract":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/s2orc_abstract_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="S2ORC Abstract",
+        data_sources="S2ORC Abstract",
+        target=target,
+    )
+
+s2oa_examples = Div(
+    Div(
+        get_S2ORCA_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_pubmed_data(data_source: str = "Pubmed", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "Pubmed":
+        raw_sample_doc = json.load(open("data/curated_samples/pubmed_raw.json"))
+        extracted_sample_doc = json.load(
+            open("data/curated_samples/pubmed_extract.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="Pubmed",
+        data_sources="Pubmed",
+        target=target,
+    )
+
+pubmed_examples = Div(
+    Div(
+        get_pubmed_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_dmm_data(data_source: str = "DM Math", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "DM Math":
+        raw_sample_doc = json.load(open("data/curated_samples/dm_maths_raw.json"))
+        extracted_sample_doc = json.load(
+            open("data/curated_samples/dm_maths_extract.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="DM Math",
+        data_sources="DM Math",
+        target=target,
+    )
+
+dmm_examples = Div(
+    Div(
+        get_dmm_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_pg19_data(data_source: str = "PG19", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "PG19":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/pg19_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="PG19",
+        data_sources="PG19",
+        target=target,
+    )
+
+pg19_examples = Div(
+    Div(
+        get_pg19_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_eu_data(data_source: str = "Europarl", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "Europarl":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/europarl_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="Europarl",
+        data_sources="Europarl",
+        target=target,
+    )
+
+eu_examples = Div(
+    Div(
+        get_eu_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
 filtering_process = Div(
     Section(
         H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
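The seven new `get_*_data` helpers above are identical except for the dataset name and the sample-file paths, so they could share one parameterized loader. A minimal consolidation sketch; the `SAMPLE_FILES` table and `get_curated_data` name are illustrative assumptions, not part of this commit, and it relies on the module's existing `json` import and `view_data` helper:

# Hypothetical consolidation of the repeated loaders above (not in the commit).
SAMPLE_FILES = {
    "S2ORC": ("data/curated_samples/s2orc_raw.json", None),
    "S2ORC Abstract": ("data/curated_samples/s2orc_abstract_raw.json", None),
    "Pubmed": ("data/curated_samples/pubmed_raw.json", "data/curated_samples/pubmed_extract.json"),
    "DM Math": ("data/curated_samples/dm_maths_raw.json", "data/curated_samples/dm_maths_extract.json"),
    "PG19": ("data/curated_samples/pg19_raw.json", None),
    "Europarl": ("data/curated_samples/europarl_raw.json", None),
}

def get_curated_data(data_source: str, doc_id: int = 3, target: str = "foo"):
    doc_id = max(0, min(int(doc_id), 9))
    raw_path, extract_path = SAMPLE_FILES.get(data_source, (None, None))
    if raw_path is None:
        raw_docs = extracted_docs = [{} for _ in range(10)]
    else:
        with open(raw_path) as f:
            raw_docs = json.load(f)
        # Reuse the raw sample when no separate extract file exists.
        extracted_docs = raw_docs if extract_path is None else json.load(open(extract_path))
    return view_data(
        raw_docs[doc_id],
        extracted_docs[doc_id],
        doc_id=doc_id,
        data_source=data_source,
        data_sources=data_source,
        target=target,
    )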
@@ -605,10 +782,10 @@ filtering_process = Div(
                 Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
             ),
             table_div_arx,
-
-
-
-
+            Details(
+                Summary("ArXiv Filtering Examples"),
+                arx_examples,
+            ),
         ),
     ),
     Section(
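The frequency-filter note in the ArXiv hunk above describes a concrete check: split the document on whitespace, take the most frequent word, and see whether it is purely alphabetic and accounts for at least 7.5% of all words. A sketch of that check under those assumptions; the function name and the flag-versus-drop behavior are illustrative, since the commit does not include the filter code:

from collections import Counter

def frequency_filter_flag(text: str, threshold: float = 0.075) -> bool:
    # Words are obtained by splitting the text on whitespace, as in the note above.
    words = text.split()
    if not words:
        return False
    top_word, count = Counter(words).most_common(1)[0]
    # Flag the document if its most frequent word is alphabetic and overly frequent.
    return top_word.isalpha() and count / len(words) >= threshold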
@@ -647,10 +824,10 @@ filtering_process = Div(
                 Li("This data was part of the paper domain; the paper datasets were combined, MinHash signatures were generated, and they were deduplicated together with all the other datasets after local dedup."),
             ),
             table_div_s2o,
-
-
-
-
+            Details(
+                Summary("S2ORC Filtering Examples"),
+                s2o_examples,
+            ),
         ),
     ),
     Section(
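The deduplication note above (repeated for the other paper-domain sources) is terse and the commit does not include the dedup code. As a rough sketch of the described flow, per-document MinHash signatures followed by a global near-duplicate lookup across the combined datasets, here using the datasketch library with assumed parameters and a hypothetical `iter_documents` iterator; the actual pipeline may differ:

from datasketch import MinHash, MinHashLSH

def minhash_signature(text: str, num_perm: int = 128) -> MinHash:
    m = MinHash(num_perm=num_perm)
    for token in set(text.split()):
        m.update(token.encode("utf-8"))
    return m

# Global near-duplicate detection across the combined paper-domain documents.
lsh = MinHashLSH(threshold=0.8, num_perm=128)
# for doc_id, text in iter_documents():   # assumed iterator over all datasets
#     sig = minhash_signature(text)
#     if lsh.query(sig):                   # near-duplicate of a document already kept
#         continue
#     lsh.insert(doc_id, sig)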
@@ -683,10 +860,10 @@ filtering_process = Div(
                 Li("This data was part of the paper domain; the paper datasets were combined, MinHash signatures were generated, and they were deduplicated together with all the other datasets after local dedup."),
             ),
             table_div_med,
-
-
-
-
+            Details(
+                Summary("PubMed Filtering Examples"),
+                pubmed_examples,
+            ),
         ),
     ),
     Section(
@@ -715,10 +892,10 @@ filtering_process = Div(
             H4("Filtering"),
             P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
             table_div_up,
-
-
-
-
+            Details(
+                Summary("EuroParl Filtering Examples"),
+                eu_examples,
+            ),
         ),
     ),
     Section(
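The EuroParl description above amounts to two steps: remove the embedded tags and drop any document shorter than 200 characters. A sketch of that cleaning step; the angle-bracket tag pattern and the order of the two steps are assumptions, since the download-time code is not shown in this commit:

import re

TAG_RE = re.compile(r"<[^>]+>")  # assumed form of the EuroParl 'TAGS'

def clean_europarl(text: str):
    cleaned = TAG_RE.sub("", text).strip()
    # Documents with fewer than 200 characters were removed.
    return cleaned if len(cleaned) >= 200 else None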
@@ -860,10 +1037,10 @@ filtering_process = Div(
                 Li("None"),
             ),
             table_div_dmm,
-
-
-
-
+            Details(
+                Summary("DM Math Filtering Examples"),
+                dmm_examples,
+            ),
         ),
     ),
     Section(
@@ -881,10 +1058,10 @@ filtering_process = Div(
                 Li("Unigram Log Probability"),
             ),
             table_div_pg19,
-
-
-
-
+            Details(
+                Summary("PG-19 Filtering Examples"),
+                pg19_examples,
+            ),
         ),
     ),
 )
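PG-19's only listed filter above is "Unigram Log Probability", and the commit does not show how it is computed. One common formulation scores a document by the mean log probability of its words under a background unigram distribution and drops low-scoring (heavily garbled or OCR-damaged) texts. A sketch under that assumption; the word-count table and threshold are placeholders, not values from the repo:

import math

def unigram_log_prob(text: str, word_counts: dict, total_count: int) -> float:
    # Mean per-word log probability under a background unigram model.
    words = text.split()
    if not words:
        return float("-inf")
    logp = 0.0
    for w in words:
        # Add-one smoothing so unseen words do not send the score to -inf.
        logp += math.log((word_counts.get(w, 0) + 1) / (total_count + len(word_counts)))
    return logp / len(words)

# keep_doc = unigram_log_prob(text, word_counts, total_count) >= THRESHOLD  # threshold is an assumption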