Spaces:
Running
Running
Update curated.py
Browse files- curated.py +52 -52
curated.py
CHANGED
|
@@ -89,19 +89,19 @@ table_div_wikipedia = Div(NotStr(table_html_wikipedia), style="margin: 40px;")
|
|
| 89 |
freelaw_filter = pd.DataFrame(
|
| 90 |
{
|
| 91 |
"Dataset": [
|
| 92 |
-
"
|
| 93 |
],
|
| 94 |
"Lines Downloaded": [
|
| 95 |
-
"
|
| 96 |
],
|
| 97 |
"Percent Removed After Language Filter": [
|
| 98 |
-
"
|
| 99 |
],
|
| 100 |
"Percent Removed After Min Word Count Filter": [
|
| 101 |
-
"
|
| 102 |
],
|
| 103 |
"Percent Removed After Unigram Probability Filter": [
|
| 104 |
-
"0.
|
| 105 |
],
|
| 106 |
"Percent Removed After Local Dedup": [
|
| 107 |
"",
|
|
@@ -118,16 +118,16 @@ table_div_freelaw = Div(NotStr(table_html_freelaw), style="margin: 40px;")
|
|
| 118 |
dmm_filter = pd.DataFrame(
|
| 119 |
{
|
| 120 |
"Dataset": [
|
| 121 |
-
"
|
| 122 |
],
|
| 123 |
"Lines Downloaded": [
|
| 124 |
-
"
|
| 125 |
],
|
| 126 |
"Percent Removed After Language Filter": [
|
| 127 |
"0.00%",
|
| 128 |
],
|
| 129 |
"Percent Removed After Min Word Count Filter": [
|
| 130 |
-
"
|
| 131 |
],
|
| 132 |
"Percent Removed After Unigram Probability Filter": [
|
| 133 |
"0.00%",
|
|
@@ -148,19 +148,19 @@ table_div_dmm = Div(NotStr(table_html_dmm), style="margin: 40px;")
|
|
| 148 |
uspto_filter = pd.DataFrame(
|
| 149 |
{
|
| 150 |
"Dataset": [
|
| 151 |
-
"
|
| 152 |
],
|
| 153 |
"Lines Downloaded": [
|
| 154 |
-
"
|
| 155 |
],
|
| 156 |
"Percent Removed After Language Filter": [
|
| 157 |
-
"0.
|
| 158 |
],
|
| 159 |
"Percent Removed After Min Word Count Filter": [
|
| 160 |
-
"1.
|
| 161 |
],
|
| 162 |
"Percent Removed After Unigram Probability Filter": [
|
| 163 |
-
"0.
|
| 164 |
],
|
| 165 |
"Percent Removed After Local Dedup": [
|
| 166 |
"",
|
|
@@ -177,19 +177,19 @@ table_div_uspto = Div(NotStr(table_html_uspto), style="margin: 40px;")
|
|
| 177 |
pg19_filter = pd.DataFrame(
|
| 178 |
{
|
| 179 |
"Dataset": [
|
| 180 |
-
"
|
| 181 |
],
|
| 182 |
"Lines Downloaded": [
|
| 183 |
-
"
|
| 184 |
],
|
| 185 |
"Percent Removed After Language Filter": [
|
| 186 |
-
"0.
|
| 187 |
],
|
| 188 |
"Percent Removed After Min Word Count Filter": [
|
| 189 |
-
"
|
| 190 |
],
|
| 191 |
"Percent Removed After Unigram Probability Filter": [
|
| 192 |
-
"0.
|
| 193 |
],
|
| 194 |
"Percent Removed After Local Dedup": [
|
| 195 |
"",
|
|
@@ -207,19 +207,19 @@ table_div_pg19 = Div(NotStr(table_html_pg19), style="margin: 40px;")
|
|
| 207 |
hn_filter = pd.DataFrame(
|
| 208 |
{
|
| 209 |
"Dataset": [
|
| 210 |
-
"
|
| 211 |
],
|
| 212 |
"Lines Downloaded": [
|
| 213 |
-
"
|
| 214 |
],
|
| 215 |
"Percent Removed After Language Filter": [
|
| 216 |
-
"
|
| 217 |
],
|
| 218 |
"Percent Removed After Min Word Count Filter": [
|
| 219 |
-
"
|
| 220 |
],
|
| 221 |
"Percent Removed After Unigram Probability Filter": [
|
| 222 |
-
"0.
|
| 223 |
],
|
| 224 |
"Percent Removed After Local Dedup": [
|
| 225 |
"",
|
|
@@ -237,19 +237,19 @@ table_div_hn = Div(NotStr(table_html_hn), style="margin: 40px;")
|
|
| 237 |
uirc_filter = pd.DataFrame(
|
| 238 |
{
|
| 239 |
"Dataset": [
|
| 240 |
-
"
|
| 241 |
],
|
| 242 |
"Lines Downloaded": [
|
| 243 |
-
"
|
| 244 |
],
|
| 245 |
"Percent Removed After Language Filter": [
|
| 246 |
-
"
|
| 247 |
],
|
| 248 |
"Percent Removed After Min Word Count Filter": [
|
| 249 |
-
"
|
| 250 |
],
|
| 251 |
"Percent Removed After Unigram Probability Filter": [
|
| 252 |
-
"
|
| 253 |
],
|
| 254 |
"Percent Removed After Local Dedup": [
|
| 255 |
"",
|
|
@@ -266,16 +266,16 @@ table_div_uirc = Div(NotStr(table_html_uirc), style="margin: 40px;")
|
|
| 266 |
up_filter = pd.DataFrame(
|
| 267 |
{
|
| 268 |
"Dataset": [
|
| 269 |
-
"
|
| 270 |
],
|
| 271 |
"Lines Downloaded": [
|
| 272 |
-
"
|
| 273 |
],
|
| 274 |
"Percent Removed After Language Filter": [
|
| 275 |
"0.00%",
|
| 276 |
],
|
| 277 |
"Percent Removed After Min Word Count Filter": [
|
| 278 |
-
"
|
| 279 |
],
|
| 280 |
"Percent Removed After Unigram Probability Filter": [
|
| 281 |
"0.00%",
|
|
@@ -295,16 +295,16 @@ table_div_up = Div(NotStr(table_html_up), style="margin: 40px;")
|
|
| 295 |
se_filter = pd.DataFrame(
|
| 296 |
{
|
| 297 |
"Dataset": [
|
| 298 |
-
"
|
| 299 |
],
|
| 300 |
"Lines Downloaded": [
|
| 301 |
-
"
|
| 302 |
],
|
| 303 |
"Percent Removed After Language Filter": [
|
| 304 |
"0.00%",
|
| 305 |
],
|
| 306 |
"Percent Removed After Min Word Count Filter": [
|
| 307 |
-
"
|
| 308 |
],
|
| 309 |
"Percent Removed After Unigram Probability Filter": [
|
| 310 |
"0.00%",
|
|
@@ -324,19 +324,19 @@ table_div_se = Div(NotStr(table_html_se), style="margin: 40px;")
|
|
| 324 |
arx_filter = pd.DataFrame(
|
| 325 |
{
|
| 326 |
"Dataset": [
|
| 327 |
-
"
|
| 328 |
],
|
| 329 |
"Lines Downloaded": [
|
| 330 |
-
"
|
| 331 |
],
|
| 332 |
"Percent Removed After Language Filter": [
|
| 333 |
-
"
|
| 334 |
],
|
| 335 |
"Percent Removed After Min Word Count Filter": [
|
| 336 |
-
"
|
| 337 |
],
|
| 338 |
"Percent Removed After Unigram Probability Filter": [
|
| 339 |
-
"0.
|
| 340 |
],
|
| 341 |
"Percent Removed After Local Dedup": [
|
| 342 |
"",
|
|
@@ -353,16 +353,16 @@ table_div_arx = Div(NotStr(table_html_arx), style="margin: 40px;")
|
|
| 353 |
s2o_filter = pd.DataFrame(
|
| 354 |
{
|
| 355 |
"Dataset": [
|
| 356 |
-
"
|
| 357 |
],
|
| 358 |
"Lines Downloaded": [
|
| 359 |
-
"
|
| 360 |
],
|
| 361 |
"Percent Removed After Language Filter": [
|
| 362 |
"0.00%",
|
| 363 |
],
|
| 364 |
"Percent Removed After Min Word Count Filter": [
|
| 365 |
-
"
|
| 366 |
],
|
| 367 |
"Percent Removed After Unigram Probability Filter": [
|
| 368 |
"0.00%",
|
|
@@ -382,19 +382,19 @@ table_div_s2o = Div(NotStr(table_html_s2o), style="margin: 40px;")
|
|
| 382 |
med_filter = pd.DataFrame(
|
| 383 |
{
|
| 384 |
"Dataset": [
|
| 385 |
-
"
|
| 386 |
],
|
| 387 |
"Lines Downloaded": [
|
| 388 |
-
"
|
| 389 |
],
|
| 390 |
"Percent Removed After Language Filter": [
|
| 391 |
-
"
|
| 392 |
],
|
| 393 |
"Percent Removed After Min Word Count Filter": [
|
| 394 |
-
"1.
|
| 395 |
],
|
| 396 |
"Percent Removed After Unigram Probability Filter": [
|
| 397 |
-
"0.
|
| 398 |
],
|
| 399 |
"Percent Removed After Local Dedup": [
|
| 400 |
"",
|
|
@@ -411,19 +411,19 @@ table_div_med = Div(NotStr(table_html_med), style="margin: 40px;")
|
|
| 411 |
phil_filter = pd.DataFrame(
|
| 412 |
{
|
| 413 |
"Dataset": [
|
| 414 |
-
"
|
| 415 |
],
|
| 416 |
"Lines Downloaded": [
|
| 417 |
-
"
|
| 418 |
],
|
| 419 |
"Percent Removed After Language Filter": [
|
| 420 |
-
"
|
| 421 |
],
|
| 422 |
"Percent Removed After Min Word Count Filter": [
|
| 423 |
-
"
|
| 424 |
],
|
| 425 |
"Percent Removed After Unigram Probability Filter": [
|
| 426 |
-
"0.
|
| 427 |
],
|
| 428 |
"Percent Removed After Local Dedup": [
|
| 429 |
"",
|
|
|
|
| 89 |
freelaw_filter = pd.DataFrame(
|
| 90 |
{
|
| 91 |
"Dataset": [
|
| 92 |
+
"FreeLaw",
|
| 93 |
],
|
| 94 |
"Lines Downloaded": [
|
| 95 |
+
"75971288",
|
| 96 |
],
|
| 97 |
"Percent Removed After Language Filter": [
|
| 98 |
+
"3.00%",
|
| 99 |
],
|
| 100 |
"Percent Removed After Min Word Count Filter": [
|
| 101 |
+
"7.49%",
|
| 102 |
],
|
| 103 |
"Percent Removed After Unigram Probability Filter": [
|
| 104 |
+
"0.07%",
|
| 105 |
],
|
| 106 |
"Percent Removed After Local Dedup": [
|
| 107 |
"",
|
|
|
|
| 118 |
dmm_filter = pd.DataFrame(
|
| 119 |
{
|
| 120 |
"Dataset": [
|
| 121 |
+
"DM Math",
|
| 122 |
],
|
| 123 |
"Lines Downloaded": [
|
| 124 |
+
"112559888",
|
| 125 |
],
|
| 126 |
"Percent Removed After Language Filter": [
|
| 127 |
"0.00%",
|
| 128 |
],
|
| 129 |
"Percent Removed After Min Word Count Filter": [
|
| 130 |
+
"0.00%",
|
| 131 |
],
|
| 132 |
"Percent Removed After Unigram Probability Filter": [
|
| 133 |
"0.00%",
|
|
|
|
| 148 |
uspto_filter = pd.DataFrame(
|
| 149 |
{
|
| 150 |
"Dataset": [
|
| 151 |
+
"USPTO",
|
| 152 |
],
|
| 153 |
"Lines Downloaded": [
|
| 154 |
+
"6880276",
|
| 155 |
],
|
| 156 |
"Percent Removed After Language Filter": [
|
| 157 |
+
"0.02%",
|
| 158 |
],
|
| 159 |
"Percent Removed After Min Word Count Filter": [
|
| 160 |
+
"1.88%",
|
| 161 |
],
|
| 162 |
"Percent Removed After Unigram Probability Filter": [
|
| 163 |
+
"0.01%",
|
| 164 |
],
|
| 165 |
"Percent Removed After Local Dedup": [
|
| 166 |
"",
|
|
|
|
| 177 |
pg19_filter = pd.DataFrame(
|
| 178 |
{
|
| 179 |
"Dataset": [
|
| 180 |
+
"PG-19",
|
| 181 |
],
|
| 182 |
"Lines Downloaded": [
|
| 183 |
+
"28752",
|
| 184 |
],
|
| 185 |
"Percent Removed After Language Filter": [
|
| 186 |
+
"0.24%",
|
| 187 |
],
|
| 188 |
"Percent Removed After Min Word Count Filter": [
|
| 189 |
+
"0.00%",
|
| 190 |
],
|
| 191 |
"Percent Removed After Unigram Probability Filter": [
|
| 192 |
+
"0.17%",
|
| 193 |
],
|
| 194 |
"Percent Removed After Local Dedup": [
|
| 195 |
"",
|
|
|
|
| 207 |
hn_filter = pd.DataFrame(
|
| 208 |
{
|
| 209 |
"Dataset": [
|
| 210 |
+
"HackerNews",
|
| 211 |
],
|
| 212 |
"Lines Downloaded": [
|
| 213 |
+
"2064931",
|
| 214 |
],
|
| 215 |
"Percent Removed After Language Filter": [
|
| 216 |
+
"2.62%",
|
| 217 |
],
|
| 218 |
"Percent Removed After Min Word Count Filter": [
|
| 219 |
+
"0.02%",
|
| 220 |
],
|
| 221 |
"Percent Removed After Unigram Probability Filter": [
|
| 222 |
+
"0.34%",
|
| 223 |
],
|
| 224 |
"Percent Removed After Local Dedup": [
|
| 225 |
"",
|
|
|
|
| 237 |
uirc_filter = pd.DataFrame(
|
| 238 |
{
|
| 239 |
"Dataset": [
|
| 240 |
+
"Ubuntu IRC",
|
| 241 |
],
|
| 242 |
"Lines Downloaded": [
|
| 243 |
+
"37966",
|
| 244 |
],
|
| 245 |
"Percent Removed After Language Filter": [
|
| 246 |
+
"38.10%",
|
| 247 |
],
|
| 248 |
"Percent Removed After Min Word Count Filter": [
|
| 249 |
+
"0.14%",
|
| 250 |
],
|
| 251 |
"Percent Removed After Unigram Probability Filter": [
|
| 252 |
+
"1.12%",
|
| 253 |
],
|
| 254 |
"Percent Removed After Local Dedup": [
|
| 255 |
"",
|
|
|
|
| 266 |
up_filter = pd.DataFrame(
|
| 267 |
{
|
| 268 |
"Dataset": [
|
| 269 |
+
"EuroParl",
|
| 270 |
],
|
| 271 |
"Lines Downloaded": [
|
| 272 |
+
"69814",
|
| 273 |
],
|
| 274 |
"Percent Removed After Language Filter": [
|
| 275 |
"0.00%",
|
| 276 |
],
|
| 277 |
"Percent Removed After Min Word Count Filter": [
|
| 278 |
+
"0.00%",
|
| 279 |
],
|
| 280 |
"Percent Removed After Unigram Probability Filter": [
|
| 281 |
"0.00%",
|
|
|
|
| 295 |
se_filter = pd.DataFrame(
|
| 296 |
{
|
| 297 |
"Dataset": [
|
| 298 |
+
"StackExchange",
|
| 299 |
],
|
| 300 |
"Lines Downloaded": [
|
| 301 |
+
"23246548",
|
| 302 |
],
|
| 303 |
"Percent Removed After Language Filter": [
|
| 304 |
"0.00%",
|
| 305 |
],
|
| 306 |
"Percent Removed After Min Word Count Filter": [
|
| 307 |
+
"0.00%",
|
| 308 |
],
|
| 309 |
"Percent Removed After Unigram Probability Filter": [
|
| 310 |
"0.00%",
|
|
|
|
| 324 |
arx_filter = pd.DataFrame(
|
| 325 |
{
|
| 326 |
"Dataset": [
|
| 327 |
+
"ArXiv",
|
| 328 |
],
|
| 329 |
"Lines Downloaded": [
|
| 330 |
+
"1911867",
|
| 331 |
],
|
| 332 |
"Percent Removed After Language Filter": [
|
| 333 |
+
"2.22%",
|
| 334 |
],
|
| 335 |
"Percent Removed After Min Word Count Filter": [
|
| 336 |
+
"5.65%",
|
| 337 |
],
|
| 338 |
"Percent Removed After Unigram Probability Filter": [
|
| 339 |
+
"0.07%",
|
| 340 |
],
|
| 341 |
"Percent Removed After Local Dedup": [
|
| 342 |
"",
|
|
|
|
| 353 |
s2o_filter = pd.DataFrame(
|
| 354 |
{
|
| 355 |
"Dataset": [
|
| 356 |
+
"S2ORC",
|
| 357 |
],
|
| 358 |
"Lines Downloaded": [
|
| 359 |
+
"12963563",
|
| 360 |
],
|
| 361 |
"Percent Removed After Language Filter": [
|
| 362 |
"0.00%",
|
| 363 |
],
|
| 364 |
"Percent Removed After Min Word Count Filter": [
|
| 365 |
+
"0.00%",
|
| 366 |
],
|
| 367 |
"Percent Removed After Unigram Probability Filter": [
|
| 368 |
"0.00%",
|
|
|
|
| 382 |
med_filter = pd.DataFrame(
|
| 383 |
{
|
| 384 |
"Dataset": [
|
| 385 |
+
"PubMed - Central",
|
| 386 |
],
|
| 387 |
"Lines Downloaded": [
|
| 388 |
+
"5230932",
|
| 389 |
],
|
| 390 |
"Percent Removed After Language Filter": [
|
| 391 |
+
"7.66%",
|
| 392 |
],
|
| 393 |
"Percent Removed After Min Word Count Filter": [
|
| 394 |
+
"1.29%",
|
| 395 |
],
|
| 396 |
"Percent Removed After Unigram Probability Filter": [
|
| 397 |
+
"0.02%",
|
| 398 |
],
|
| 399 |
"Percent Removed After Local Dedup": [
|
| 400 |
"",
|
|
|
|
| 411 |
phil_filter = pd.DataFrame(
|
| 412 |
{
|
| 413 |
"Dataset": [
|
| 414 |
+
"Phil Papers",
|
| 415 |
],
|
| 416 |
"Lines Downloaded": [
|
| 417 |
+
"49389",
|
| 418 |
],
|
| 419 |
"Percent Removed After Language Filter": [
|
| 420 |
+
"20.68%",
|
| 421 |
],
|
| 422 |
"Percent Removed After Min Word Count Filter": [
|
| 423 |
+
"0.00%",
|
| 424 |
],
|
| 425 |
"Percent Removed After Unigram Probability Filter": [
|
| 426 |
+
"0.12%",
|
| 427 |
],
|
| 428 |
"Percent Removed After Local Dedup": [
|
| 429 |
"",
|