Update curated.py
curated.py  +32 -3  CHANGED
@@ -9,12 +9,41 @@ from rich import print
 import uuid
 import plotly.express as px
 
+filtering_process = Div(
+    Section(
+        H3("Title"),
+        H4("Download and Extraction"),
+        Ol(
+            Li("one"),
+            Li("two"),
+        ),
+        H4("Filtering"),
+        Ol(
+            Li("one"),
+            Li("two"),
+        ),
+        H4("Local Deduplication Process"),
+        Ol(
+            Li("one"),
+            Li("two"),
+        ),
+        H4("Global Deduplication Process"),
+        Ol(
+            Li("one"),
+            Li("two"),
+        ),
+
+    ),
+)
+
+
+
+
 overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
 copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
 
 local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
 
-
 treemap_data = {
     'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
     'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
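
The filtering_process block added above is only a scaffold: each curated source presumably gets its own Section with the same four headings, and the "one"/"two" placeholders get replaced with real steps. A filled-in section might look like the sketch below; the ArXiv wording and the explicit fasthtml.common import are illustrative assumptions, since the commit shows only the skeleton.

# Hypothetical filled-in Section following the scaffold's four-heading layout.
# Only the structure comes from the diff; the step text is invented.
from fasthtml.common import Div, Section, H3, H4, Ol, Li

arxiv_section = Section(
    H3("ArXiv"),
    H4("Download and Extraction"),
    Ol(
        Li("Download the bulk LaTeX source dumps."),
        Li("Extract plain text, keeping abstracts and bodies."),
    ),
    H4("Filtering"),
    Ol(
        Li("Drop documents below a minimum length."),
        Li("Strip bibliographies and boilerplate macros."),
    ),
    H4("Local Deduplication Process"),
    Ol(
        Li("Run min-hash near deduplication within the source."),
    ),
    H4("Global Deduplication Process"),
    Ol(
        Li("Deduplicate against the other curated sources."),
    ),
)

filtering_process = Div(arxiv_section)
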
@@ -467,7 +496,7 @@ def curated(request):
     table_html = preprocessing_steps.to_html(index=False, border=0)
     table_div = Div(NotStr(table_html), style="margin: 40px;")
     data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
-
+
     return Div(
         H2("Curated Sources: Overview"),
         overview_text,
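
The preprocessing_steps.to_html(index=False, border=0) / NotStr pair in this hunk is the usual pandas-to-FastHTML table pattern: render the DataFrame to raw HTML, then wrap it in NotStr so FastHTML injects it unescaped. A minimal self-contained sketch, with made-up rows since preprocessing_steps is defined elsewhere in the file:

# Sketch of the DataFrame-to-HTML embedding pattern; rows are placeholders.
import pandas as pd
from fasthtml.common import Div, NotStr

preprocessing_steps = pd.DataFrame({
    "Step": ["Language filter", "Minimum length"],
    "Description": ["Keep English documents", "Drop very short documents"],
})

table_html = preprocessing_steps.to_html(index=False, border=0)
table_div = Div(NotStr(table_html), style="margin: 40px;")  # NotStr prevents escaping
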
@@ -475,7 +504,7 @@ def curated(request):
         plotly2fasthtml(treemap_chart),
         table_desc,
         H2("Curated Sources: Data Gathering and Filtering"),
-
+        filtering_process,
         data_preparation_div,
         H3("Data Filtering"),
         data_preprocessing_div,
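
plotly2fasthtml(treemap_chart) in the return block embeds a Plotly figure in the page. Given the treemap_data dict from the first hunk, the chart is presumably built with px.treemap along these lines; the token counts below are invented placeholders, since the sizing column is not visible in this diff.

import pandas as pd
import plotly.express as px

# A few of the sources from treemap_data, with made-up sizes for illustration.
df = pd.DataFrame({
    "Source": ["ArXiv", "Wikipedia", "StackExchange", "Freelaw"],
    "Category": ["Papers", "Internet", "Conversational", "Legal/Formal"],
    "Tokens": [30.0, 5.5, 10.2, 4.8],
})

# One tile per source, nested under its category, sized by token count.
treemap_chart = px.treemap(df, path=["Category", "Source"], values="Tokens")
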
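Finally, local_dedup_text states that each source was locally deduped with min-hash near deduplication. That pipeline is not part of this commit; as a point of reference, a minimal version of the technique using the datasketch library (one possible implementation, not necessarily the authors') looks like this:

# Assumed illustration of min-hash near dedup; the real pipeline is not shown here.
from datasketch import MinHash, MinHashLSH

def minhash(text, num_perm=128):
    # Hash 5-character shingles into a MinHash signature.
    m = MinHash(num_perm=num_perm)
    for shingle in {text[i:i + 5] for i in range(len(text) - 4)}:
        m.update(shingle.encode("utf8"))
    return m

docs = {
    "doc1": "the quick brown fox jumps over the lazy dog",
    "doc2": "the quick brown fox jumped over the lazy dog",
    "doc3": "curated sources comprise high quality datasets",
}

lsh = MinHashLSH(threshold=0.7, num_perm=128)
kept = []
for key, text in docs.items():
    sig = minhash(text)
    if not lsh.query(sig):      # keep only if no near-duplicate is already indexed
        lsh.insert(key, sig)
        kept.append(key)

print(kept)  # doc2 should be dropped as a near-duplicate of doc1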