Spaces:
Running
Running
Update overview.py
Browse files- overview.py +25 -4
overview.py
CHANGED
|
@@ -11,7 +11,7 @@ import web
|
|
| 11 |
import common
|
| 12 |
import results
|
| 13 |
|
| 14 |
-
|
| 15 |
{
|
| 16 |
"Dataset": [
|
| 17 |
"TxT360",
|
|
@@ -83,6 +83,26 @@ dataset_comparison = pd.DataFrame(
|
|
| 83 |
"-",
|
| 84 |
"Included",
|
| 85 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
"PG-19": [
|
| 87 |
"Included",
|
| 88 |
"-",
|
|
@@ -146,8 +166,8 @@ dataset_comparison = pd.DataFrame(
|
|
| 146 |
}
|
| 147 |
)
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
|
| 152 |
dataset_sources = pd.DataFrame(
|
| 153 |
{
|
|
@@ -259,7 +279,8 @@ both critical for effective LLM pre-training."""),
|
|
| 259 |
P("By integrating the extensive reach of web data with the exceptional quality of curated sources, TxT360 is crafted to meet and surpass the rigorous standards required for state-of-the-art LLM pre-training."),
|
| 260 |
H3("TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered."),
|
| 261 |
P("Table 1: The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
|
| 262 |
-
|
|
|
|
| 263 |
P("Table 2: Basic TxT360 Statistics."),
|
| 264 |
table_div1,
|
| 265 |
),
|
|
|
|
| 11 |
import common
|
| 12 |
import results
|
| 13 |
|
| 14 |
+
dataset_comparison1 = pd.DataFrame(
|
| 15 |
{
|
| 16 |
"Dataset": [
|
| 17 |
"TxT360",
|
|
|
|
| 83 |
"-",
|
| 84 |
"Included",
|
| 85 |
],
|
| 86 |
+
|
| 87 |
+
}
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
table_html = dataset_comparison1.to_html(index=False, border=0)
|
| 91 |
+
table_div_1 = Div(NotStr(table_html), style="margin: 40px;")
|
| 92 |
+
|
| 93 |
+
dataset_comparison2 = pd.DataFrame(
|
| 94 |
+
{
|
| 95 |
+
"Dataset": [
|
| 96 |
+
"TxT360",
|
| 97 |
+
"FineWeb",
|
| 98 |
+
"RefinedWeb",
|
| 99 |
+
"RedPajama-v2",
|
| 100 |
+
"C4",
|
| 101 |
+
"Dolma",
|
| 102 |
+
"RedPajama-v1",
|
| 103 |
+
"The Pile",
|
| 104 |
+
],
|
| 105 |
+
|
| 106 |
"PG-19": [
|
| 107 |
"Included",
|
| 108 |
"-",
|
|
|
|
| 166 |
}
|
| 167 |
)
|
| 168 |
|
| 169 |
+
table_html2 = dataset_comparison2.to_html(index=False, border=0)
|
| 170 |
+
table_div2 = Div(NotStr(table_html2), style="margin: 40px;")
|
| 171 |
|
| 172 |
dataset_sources = pd.DataFrame(
|
| 173 |
{
|
|
|
|
| 279 |
P("By integrating the extensive reach of web data with the exceptional quality of curated sources, TxT360 is crafted to meet and surpass the rigorous standards required for state-of-the-art LLM pre-training."),
|
| 280 |
H3("TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered."),
|
| 281 |
P("Table 1: The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
|
| 282 |
+
table_div1,
|
| 283 |
+
table_div2,
|
| 284 |
P("Table 2: Basic TxT360 Statistics."),
|
| 285 |
table_div1,
|
| 286 |
),
|