Spaces:
Runtime error
Runtime error
Remove unused nested column
Browse files
app.py
CHANGED
|
@@ -145,14 +145,24 @@ def fit_model(docs, embeddings, n_neighbors, n_components):
|
|
| 145 |
|
| 146 |
|
| 147 |
@spaces.GPU(duration=60 * 5)
|
| 148 |
-
def generate_topics(dataset, config, split, column, nested_column, plot_type):
|
| 149 |
logging.info(
|
| 150 |
-
f"Generating topics for {dataset=} {config=} {split=} {column=} {nested_column=} {plot_type=}"
|
| 151 |
)
|
| 152 |
|
| 153 |
parquet_urls = get_parquet_urls(dataset, config, split)
|
| 154 |
split_rows = get_split_rows(dataset, config, split)
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
limit = min(split_rows, MAX_ROWS)
|
| 158 |
n_neighbors, n_components = calculate_n_neighbors_and_components(limit)
|
|
@@ -178,6 +188,11 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
|
|
| 178 |
if full_processing
|
| 179 |
else f"⚙️ Processing partial dataset 0 of ({limit} rows)"
|
| 180 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
yield (
|
| 182 |
gr.Accordion(open=False),
|
| 183 |
gr.DataFrame(value=[], interactive=False, visible=True),
|
|
@@ -185,6 +200,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
|
|
| 185 |
gr.Label({message: rows_processed / limit}, visible=True),
|
| 186 |
"",
|
| 187 |
)
|
|
|
|
| 188 |
while offset < limit:
|
| 189 |
docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
|
| 190 |
if not docs:
|
|
@@ -199,6 +215,9 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
|
|
| 199 |
|
| 200 |
if base_model is None:
|
| 201 |
base_model = new_model
|
|
|
|
|
|
|
|
|
|
| 202 |
else:
|
| 203 |
updated_model = BERTopic.merge_models([base_model, new_model])
|
| 204 |
nr_new_topics = len(set(updated_model.topics_)) - len(
|
|
@@ -216,11 +235,6 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
|
|
| 216 |
|
| 217 |
topics_info = base_model.get_topic_info()
|
| 218 |
all_topics = base_model.topics_
|
| 219 |
-
sub_title = (
|
| 220 |
-
f"Data map for the entire dataset ({limit} rows) using the column '{column}'"
|
| 221 |
-
if full_processing
|
| 222 |
-
else f"Data map for a sample of the dataset (first {limit} rows) using the column '{column}'"
|
| 223 |
-
)
|
| 224 |
topic_plot = (
|
| 225 |
base_model.visualize_document_datamap(
|
| 226 |
docs=all_docs,
|
|
@@ -271,7 +285,8 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
|
|
| 271 |
|
| 272 |
logging.info("Finished processing all data")
|
| 273 |
|
| 274 |
-
|
|
|
|
| 275 |
if plot_type == "DataMapPlot":
|
| 276 |
topic_plot.savefig(plot_png, format="png", dpi=300)
|
| 277 |
else:
|
|
@@ -287,7 +302,6 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
|
|
| 287 |
for topic in all_topics
|
| 288 |
]
|
| 289 |
)
|
| 290 |
-
dataset_clear_name = dataset.replace("/", "-")
|
| 291 |
interactive_plot = datamapplot.create_interactive_plot(
|
| 292 |
reduced_embeddings_array,
|
| 293 |
topic_names_array,
|
|
@@ -308,7 +322,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
|
|
| 308 |
with open(html_file_path, "w", encoding="utf-8") as html_file:
|
| 309 |
html_file.write(html_content)
|
| 310 |
|
| 311 |
-
repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{
|
| 312 |
|
| 313 |
space_id = create_space_with_content(
|
| 314 |
api=api,
|
|
@@ -364,9 +378,6 @@ with gr.Blocks() as demo:
|
|
| 364 |
|
| 365 |
with gr.Row():
|
| 366 |
text_column_dropdown = gr.Dropdown(label="Text column name")
|
| 367 |
-
nested_text_column_dropdown = gr.Dropdown(
|
| 368 |
-
label="Nested text column name", visible=False
|
| 369 |
-
)
|
| 370 |
plot_type_radio = gr.Radio(
|
| 371 |
["DataMapPlot", "Plotly"],
|
| 372 |
value="DataMapPlot",
|
|
@@ -388,7 +399,6 @@ with gr.Blocks() as demo:
|
|
| 388 |
subset_dropdown,
|
| 389 |
split_dropdown,
|
| 390 |
text_column_dropdown,
|
| 391 |
-
nested_text_column_dropdown,
|
| 392 |
plot_type_radio,
|
| 393 |
],
|
| 394 |
outputs=[
|
|
@@ -408,7 +418,6 @@ with gr.Blocks() as demo:
|
|
| 408 |
subset_dropdown: gr.Dropdown(visible=False),
|
| 409 |
split_dropdown: gr.Dropdown(visible=False),
|
| 410 |
text_column_dropdown: gr.Dropdown(label="Text column name"),
|
| 411 |
-
nested_text_column_dropdown: gr.Dropdown(visible=False),
|
| 412 |
}
|
| 413 |
try:
|
| 414 |
info_resp = get_info(dataset)
|
|
@@ -417,7 +426,6 @@ with gr.Blocks() as demo:
|
|
| 417 |
subset_dropdown: gr.Dropdown(visible=False),
|
| 418 |
split_dropdown: gr.Dropdown(visible=False),
|
| 419 |
text_column_dropdown: gr.Dropdown(label="Text column name"),
|
| 420 |
-
nested_text_column_dropdown: gr.Dropdown(visible=False),
|
| 421 |
}
|
| 422 |
subsets: list[str] = list(info_resp)
|
| 423 |
subset = default_subset if default_subset in subsets else subsets[0]
|
|
@@ -433,20 +441,6 @@ with gr.Blocks() as demo:
|
|
| 433 |
for feature_name, feature in features.items()
|
| 434 |
if _is_string_feature(feature)
|
| 435 |
]
|
| 436 |
-
nested_features = [
|
| 437 |
-
feature_name
|
| 438 |
-
for feature_name, feature in features.items()
|
| 439 |
-
if isinstance(feature, dict)
|
| 440 |
-
and isinstance(next(iter(feature.values())), dict)
|
| 441 |
-
]
|
| 442 |
-
nested_text_features = [
|
| 443 |
-
feature_name
|
| 444 |
-
for feature_name in nested_features
|
| 445 |
-
if any(
|
| 446 |
-
_is_string_feature(nested_feature)
|
| 447 |
-
for nested_feature in features[feature_name].values()
|
| 448 |
-
)
|
| 449 |
-
]
|
| 450 |
if not text_feature:
|
| 451 |
return {
|
| 452 |
subset_dropdown: gr.Dropdown(
|
|
@@ -456,34 +450,9 @@ with gr.Blocks() as demo:
|
|
| 456 |
value=split, choices=splits, visible=len(splits) > 1
|
| 457 |
),
|
| 458 |
text_column_dropdown: gr.Dropdown(
|
| 459 |
-
choices=text_features
|
| 460 |
label="Text column name",
|
| 461 |
),
|
| 462 |
-
nested_text_column_dropdown: gr.Dropdown(visible=False),
|
| 463 |
-
}
|
| 464 |
-
if text_feature in nested_text_features:
|
| 465 |
-
nested_keys = [
|
| 466 |
-
feature_name
|
| 467 |
-
for feature_name, feature in features[text_feature].items()
|
| 468 |
-
if _is_string_feature(feature)
|
| 469 |
-
]
|
| 470 |
-
return {
|
| 471 |
-
subset_dropdown: gr.Dropdown(
|
| 472 |
-
value=subset, choices=subsets, visible=len(subsets) > 1
|
| 473 |
-
),
|
| 474 |
-
split_dropdown: gr.Dropdown(
|
| 475 |
-
value=split, choices=splits, visible=len(splits) > 1
|
| 476 |
-
),
|
| 477 |
-
text_column_dropdown: gr.Dropdown(
|
| 478 |
-
choices=text_features + nested_text_features,
|
| 479 |
-
label="Text column name",
|
| 480 |
-
),
|
| 481 |
-
nested_text_column_dropdown: gr.Dropdown(
|
| 482 |
-
value=nested_keys[0],
|
| 483 |
-
choices=nested_keys,
|
| 484 |
-
label="Nested text column name",
|
| 485 |
-
visible=True,
|
| 486 |
-
),
|
| 487 |
}
|
| 488 |
return {
|
| 489 |
subset_dropdown: gr.Dropdown(
|
|
@@ -493,9 +462,8 @@ with gr.Blocks() as demo:
|
|
| 493 |
value=split, choices=splits, visible=len(splits) > 1
|
| 494 |
),
|
| 495 |
text_column_dropdown: gr.Dropdown(
|
| 496 |
-
choices=text_features
|
| 497 |
),
|
| 498 |
-
nested_text_column_dropdown: gr.Dropdown(visible=False),
|
| 499 |
}
|
| 500 |
|
| 501 |
@dataset_name.change(
|
|
@@ -504,7 +472,6 @@ with gr.Blocks() as demo:
|
|
| 504 |
subset_dropdown,
|
| 505 |
split_dropdown,
|
| 506 |
text_column_dropdown,
|
| 507 |
-
nested_text_column_dropdown,
|
| 508 |
],
|
| 509 |
)
|
| 510 |
def show_input_from_subset_dropdown(dataset: str) -> dict:
|
|
@@ -518,7 +485,6 @@ with gr.Blocks() as demo:
|
|
| 518 |
subset_dropdown,
|
| 519 |
split_dropdown,
|
| 520 |
text_column_dropdown,
|
| 521 |
-
nested_text_column_dropdown,
|
| 522 |
],
|
| 523 |
)
|
| 524 |
def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
|
|
@@ -532,7 +498,6 @@ with gr.Blocks() as demo:
|
|
| 532 |
subset_dropdown,
|
| 533 |
split_dropdown,
|
| 534 |
text_column_dropdown,
|
| 535 |
-
nested_text_column_dropdown,
|
| 536 |
],
|
| 537 |
)
|
| 538 |
def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
|
|
@@ -546,7 +511,6 @@ with gr.Blocks() as demo:
|
|
| 546 |
subset_dropdown,
|
| 547 |
split_dropdown,
|
| 548 |
text_column_dropdown,
|
| 549 |
-
nested_text_column_dropdown,
|
| 550 |
],
|
| 551 |
)
|
| 552 |
def show_input_from_text_column_dropdown(
|
|
|
|
| 145 |
|
| 146 |
|
| 147 |
@spaces.GPU(duration=60 * 5)
|
| 148 |
+
def generate_topics(dataset, config, split, column, plot_type):
|
| 149 |
logging.info(
|
| 150 |
+
f"Generating topics for {dataset=} {config=} {split=} {column=} {plot_type=}"
|
| 151 |
)
|
| 152 |
|
| 153 |
parquet_urls = get_parquet_urls(dataset, config, split)
|
| 154 |
split_rows = get_split_rows(dataset, config, split)
|
| 155 |
+
if split_rows is None or split_rows == 0:
|
| 156 |
+
return (
|
| 157 |
+
gr.Accordion(open=True),
|
| 158 |
+
gr.DataFrame(value=[], interactive=False, visible=True),
|
| 159 |
+
gr.Plot(value=None, visible=True),
|
| 160 |
+
gr.Label(
|
| 161 |
+
{"❌ Error: No data found for the selected dataset": 0.0}, visible=True
|
| 162 |
+
),
|
| 163 |
+
"",
|
| 164 |
+
)
|
| 165 |
+
logging.info(f"Split number of rows: {split_rows}")
|
| 166 |
|
| 167 |
limit = min(split_rows, MAX_ROWS)
|
| 168 |
n_neighbors, n_components = calculate_n_neighbors_and_components(limit)
|
|
|
|
| 188 |
if full_processing
|
| 189 |
else f"⚙️ Processing partial dataset 0 of ({limit} rows)"
|
| 190 |
)
|
| 191 |
+
sub_title = (
|
| 192 |
+
f"Data map for the entire dataset ({limit} rows) using the column '{column}'"
|
| 193 |
+
if full_processing
|
| 194 |
+
else f"Data map for a sample of the dataset (first {limit} rows) using the column '{column}'"
|
| 195 |
+
)
|
| 196 |
yield (
|
| 197 |
gr.Accordion(open=False),
|
| 198 |
gr.DataFrame(value=[], interactive=False, visible=True),
|
|
|
|
| 200 |
gr.Label({message: rows_processed / limit}, visible=True),
|
| 201 |
"",
|
| 202 |
)
|
| 203 |
+
|
| 204 |
while offset < limit:
|
| 205 |
docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
|
| 206 |
if not docs:
|
|
|
|
| 215 |
|
| 216 |
if base_model is None:
|
| 217 |
base_model = new_model
|
| 218 |
+
logging.info(
|
| 219 |
+
f"The following topics are newly found: {base_model.topic_labels_}"
|
| 220 |
+
)
|
| 221 |
else:
|
| 222 |
updated_model = BERTopic.merge_models([base_model, new_model])
|
| 223 |
nr_new_topics = len(set(updated_model.topics_)) - len(
|
|
|
|
| 235 |
|
| 236 |
topics_info = base_model.get_topic_info()
|
| 237 |
all_topics = base_model.topics_
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
topic_plot = (
|
| 239 |
base_model.visualize_document_datamap(
|
| 240 |
docs=all_docs,
|
|
|
|
| 285 |
|
| 286 |
logging.info("Finished processing all data")
|
| 287 |
|
| 288 |
+
dataset_clear_name = dataset.replace("/", "-")
|
| 289 |
+
plot_png = f"{dataset_clear_name}-{plot_type.lower()}.png"
|
| 290 |
if plot_type == "DataMapPlot":
|
| 291 |
topic_plot.savefig(plot_png, format="png", dpi=300)
|
| 292 |
else:
|
|
|
|
| 302 |
for topic in all_topics
|
| 303 |
]
|
| 304 |
)
|
|
|
|
| 305 |
interactive_plot = datamapplot.create_interactive_plot(
|
| 306 |
reduced_embeddings_array,
|
| 307 |
topic_names_array,
|
|
|
|
| 322 |
with open(html_file_path, "w", encoding="utf-8") as html_file:
|
| 323 |
html_file.write(html_content)
|
| 324 |
|
| 325 |
+
repo_id = f"{DATASETS_TOPICS_ORGANIZATION}/{dataset_clear_name}"
|
| 326 |
|
| 327 |
space_id = create_space_with_content(
|
| 328 |
api=api,
|
|
|
|
| 378 |
|
| 379 |
with gr.Row():
|
| 380 |
text_column_dropdown = gr.Dropdown(label="Text column name")
|
|
|
|
|
|
|
|
|
|
| 381 |
plot_type_radio = gr.Radio(
|
| 382 |
["DataMapPlot", "Plotly"],
|
| 383 |
value="DataMapPlot",
|
|
|
|
| 399 |
subset_dropdown,
|
| 400 |
split_dropdown,
|
| 401 |
text_column_dropdown,
|
|
|
|
| 402 |
plot_type_radio,
|
| 403 |
],
|
| 404 |
outputs=[
|
|
|
|
| 418 |
subset_dropdown: gr.Dropdown(visible=False),
|
| 419 |
split_dropdown: gr.Dropdown(visible=False),
|
| 420 |
text_column_dropdown: gr.Dropdown(label="Text column name"),
|
|
|
|
| 421 |
}
|
| 422 |
try:
|
| 423 |
info_resp = get_info(dataset)
|
|
|
|
| 426 |
subset_dropdown: gr.Dropdown(visible=False),
|
| 427 |
split_dropdown: gr.Dropdown(visible=False),
|
| 428 |
text_column_dropdown: gr.Dropdown(label="Text column name"),
|
|
|
|
| 429 |
}
|
| 430 |
subsets: list[str] = list(info_resp)
|
| 431 |
subset = default_subset if default_subset in subsets else subsets[0]
|
|
|
|
| 441 |
for feature_name, feature in features.items()
|
| 442 |
if _is_string_feature(feature)
|
| 443 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
if not text_feature:
|
| 445 |
return {
|
| 446 |
subset_dropdown: gr.Dropdown(
|
|
|
|
| 450 |
value=split, choices=splits, visible=len(splits) > 1
|
| 451 |
),
|
| 452 |
text_column_dropdown: gr.Dropdown(
|
| 453 |
+
choices=text_features,
|
| 454 |
label="Text column name",
|
| 455 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
}
|
| 457 |
return {
|
| 458 |
subset_dropdown: gr.Dropdown(
|
|
|
|
| 462 |
value=split, choices=splits, visible=len(splits) > 1
|
| 463 |
),
|
| 464 |
text_column_dropdown: gr.Dropdown(
|
| 465 |
+
choices=text_features, label="Text column name"
|
| 466 |
),
|
|
|
|
| 467 |
}
|
| 468 |
|
| 469 |
@dataset_name.change(
|
|
|
|
| 472 |
subset_dropdown,
|
| 473 |
split_dropdown,
|
| 474 |
text_column_dropdown,
|
|
|
|
| 475 |
],
|
| 476 |
)
|
| 477 |
def show_input_from_subset_dropdown(dataset: str) -> dict:
|
|
|
|
| 485 |
subset_dropdown,
|
| 486 |
split_dropdown,
|
| 487 |
text_column_dropdown,
|
|
|
|
| 488 |
],
|
| 489 |
)
|
| 490 |
def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
|
|
|
|
| 498 |
subset_dropdown,
|
| 499 |
split_dropdown,
|
| 500 |
text_column_dropdown,
|
|
|
|
| 501 |
],
|
| 502 |
)
|
| 503 |
def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
|
|
|
|
| 511 |
subset_dropdown,
|
| 512 |
split_dropdown,
|
| 513 |
text_column_dropdown,
|
|
|
|
| 514 |
],
|
| 515 |
)
|
| 516 |
def show_input_from_text_column_dropdown(
|