app.py CHANGED
@@ -1,5 +1,4 @@
-
-# import spaces
+import spaces
 import gradio as gr
 
 import logging
@@ -16,13 +15,10 @@ from bertopic import BERTopic
 from bertopic.representation import KeyBERTInspired
 from bertopic.representation import TextGeneration
 
-# Temporary disabling because of ZeroGPU does not support cuml
 from cuml.manifold import UMAP
 from cuml.cluster import HDBSCAN
 
-
-# from hdbscan import HDBSCAN
-from huggingface_hub import HfApi
+from huggingface_hub import HfApi, SpaceCard
 from sklearn.feature_extraction.text import CountVectorizer
 from sentence_transformers import SentenceTransformer
 from prompts import REPRESENTATION_PROMPT
@@ -59,6 +55,7 @@ logging.basicConfig(
 MAX_ROWS = 50_000
 CHUNK_SIZE = 10_000
 
+api = HfApi(token=HF_TOKEN)
 
 session = requests.Session()
 sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -186,7 +183,6 @@ def _push_to_hub(
     logging.info(f"Pushing file to hub: {dataset_id} on file {file_path}")
 
     file_name = file_path.split("/")[-1]
-    api = HfApi(token=HF_TOKEN)
     try:
         logging.info(f"About to push {file_path} - {dataset_id}")
         api.upload_file(
@@ -200,6 +196,44 @@ def _push_to_hub(
         raise
 
 
+def create_space_with_content(dataset_id, html_file_path):
+    # TODO: Parameterize organization name
+    repo_id = f"datasets-topics/{dataset_id.replace('/', '-')}"
+    logging.info(f"Creating space with content: {repo_id} on file {html_file_path}")
+    api.create_repo(
+        repo_id=repo_id,
+        repo_type="space",
+        private=False,
+        exist_ok=True,
+        token=HF_TOKEN,
+        space_sdk="static",
+    )
+    SPACE_REPO_CARD_CONTENT = """
+---
+title: {dataset_id} topic modeling
+sdk: static
+pinned: false
+datasets:
+- {dataset_id}
+---
+
+"""
+
+    SpaceCard(
+        content=SPACE_REPO_CARD_CONTENT.format(dataset_id=dataset_id)
+    ).push_to_hub(repo_id=repo_id, repo_type="space", token=HF_TOKEN)
+
+    api.upload_file(
+        path_or_fileobj=html_file_path,
+        path_in_repo="index.html",
+        repo_type="space",
+        repo_id=repo_id,
+        token=HF_TOKEN,
+    )
+    logging.info(f"Space created done")
+    return repo_id
+
+
 def generate_topics(dataset, config, split, column, nested_column, plot_type):
     logging.info(
         f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
@@ -239,6 +273,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         gr.Plot(value=None, visible=True),
         gr.Label({message: rows_processed / limit}, visible=True),
         "",
+        "",
     )
     while offset < limit:
         docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
@@ -278,6 +313,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
                 docs=all_docs,
                 reduced_embeddings=reduced_embeddings_array,
                 title=dataset,
+                font_family="Montserrat Thin",
                 width=800,
                 height=700,
                 arrowprops={
@@ -286,6 +322,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
                     "linewidth": 0,
                     "fc": "#33333377",
                 },
+                # TODO: Make it configurable in UI
                 dynamic_label_size=False,
                 # label_wrap_width=12,
                 # label_over_points=True,
@@ -299,6 +336,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
                 reduced_embeddings=reduced_embeddings_array,
                 custom_labels=True,
                 title=dataset,
+                font_family="Montserrat Thin",
             )
         )
 
@@ -317,6 +355,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
             topic_plot,
             gr.Label({message: progress}, visible=True),
             "",
+            "",
         )
 
         offset += CHUNK_SIZE
@@ -330,20 +369,42 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
     topic_plot.write_image(plot_png)
 
     _push_to_hub(dataset, plot_png)
+
+    all_topics, _ = base_model.transform(all_docs)
+    topic_info = base_model.get_topic_info()
+
+    topic_names = {row["Topic"]: row["Name"] for index, row in topic_info.iterrows()}
+    topic_names_array = np.array(
+        [
+            topic_names.get(topic, "No Topic").split("_")[1].strip("-")
+            for topic in all_topics
+        ]
+    )
+    dataset_clear_name = dataset.replace("/", "-")
+    interactive_plot = datamapplot.create_interactive_plot(
+        reduced_embeddings_array,
+        topic_names_array,
+        hover_text=all_docs,
+        title=dataset,
+        enable_search=True,
+        font_family="Montserrat Thin",
+        # TODO: Export data to .arrow and also serve it
+        inline_data=True,
+        # offline_data_prefix=dataset_clear_name,
+        initial_zoom_fraction=0.9,
+    )
+    html_content = str(interactive_plot)
+    html_file_path = f"{dataset_clear_name}.html"
+    with open(html_file_path, "w", encoding="utf-8") as html_file:
+        html_file.write(html_content)
+
+    space_id = create_space_with_content(dataset, html_file_path)
+
     plot_png_link = (
         f"https://huggingface.co/datasets/{EXPORTS_REPOSITORY}/blob/main/{plot_png}"
     )
-
-
-    # *cord19_label_layers,
-    # font_family="Cinzel",
-    # enable_search=True,
-    # inline_data=False,
-    # offline_data_prefix="cord-large-1",
-    # initial_zoom_fraction=0.4,
-    # )
-    # all_topics, _ = base_model.transform(all_topics)
-    # logging.info(f"TAll opics: {all_topics[:5]}")
+
+    space_link = f"https://huggingface.co/spaces/{space_id}"
     yield (
         gr.Accordion(open=False),
         topics_info,
@@ -352,6 +413,7 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
            {f"✅ Done: {rows_processed} rows have been processed": 1.0}, visible=True
         ),
         f"[]({plot_png_link})",
+        f"[]({space_link})",
     )
     cuda.empty_cache()
 
@@ -400,7 +462,9 @@ with gr.Blocks() as demo:
 
         gr.Markdown("## Data map")
         full_topics_generation_label = gr.Label(visible=False, show_label=False)
-
+        with gr.Row():
+            open_png_label = gr.Markdown()
+            open_space_label = gr.Markdown()
        topics_plot = gr.Plot()
         with gr.Accordion("Topics Info", open=False):
            topics_df = gr.DataFrame(interactive=False, visible=True)
@@ -420,6 +484,7 @@ with gr.Blocks() as demo:
             topics_plot,
             full_topics_generation_label,
             open_png_label,
+            open_space_label,
         ],
     )
 
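
For context, a minimal standalone sketch of the interactive data-map export step this change adds to generate_topics, wrapped as a helper so it can be read in isolation. It assumes a fitted BERTopic model, the 2-D reduced embeddings, and the document list already exist; the export_datamap_html name and the example output path are illustrative, while the datamapplot.create_interactive_plot arguments mirror the ones used in the diff.

```python
# Sketch of the interactive data-map export added to generate_topics.
# export_datamap_html is an illustrative wrapper; the datamapplot arguments
# follow the ones used in the diff.
import datamapplot
import numpy as np


def export_datamap_html(base_model, reduced_embeddings_array, all_docs, dataset, html_file_path):
    # Assign each document its topic and look up the human-readable topic names.
    all_topics, _ = base_model.transform(all_docs)
    topic_info = base_model.get_topic_info()
    topic_names = {row["Topic"]: row["Name"] for _, row in topic_info.iterrows()}

    # BERTopic names look like "12_keyword1_keyword2"; keep the first keyword as the label.
    topic_names_array = np.array(
        [topic_names.get(topic, "No Topic").split("_")[1].strip("-") for topic in all_topics]
    )

    interactive_plot = datamapplot.create_interactive_plot(
        reduced_embeddings_array,
        topic_names_array,
        hover_text=all_docs,
        title=dataset,
        enable_search=True,
        inline_data=True,  # embed the point data directly in the HTML
        initial_zoom_fraction=0.9,
    )

    # str() on the figure yields a self-contained HTML page, ready to serve.
    with open(html_file_path, "w", encoding="utf-8") as html_file:
        html_file.write(str(interactive_plot))
    return html_file_path
```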
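The generated HTML is then published through the new create_space_with_content helper. Below is a self-contained sketch of that flow under the same assumptions as the diff: HF_TOKEN is a write token and the datasets-topics organization is writable. The publish_datamap_space name and the org parameter are illustrative; the huggingface_hub calls (create_repo, SpaceCard.push_to_hub, upload_file) are the same ones the helper uses.

```python
# Sketch of the static-Space publishing flow introduced in this diff.
# Assumes HF_TOKEN is a write token; the target org defaults to the
# "datasets-topics" organization used in the diff.
import os

from huggingface_hub import HfApi, SpaceCard

HF_TOKEN = os.environ["HF_TOKEN"]
api = HfApi(token=HF_TOKEN)


def publish_datamap_space(dataset_id: str, html_file_path: str, org: str = "datasets-topics") -> str:
    """Create (or reuse) a static Space and serve the datamapplot HTML as its index page."""
    repo_id = f"{org}/{dataset_id.replace('/', '-')}"

    # Static Spaces just serve files; exist_ok makes repeated runs idempotent.
    api.create_repo(
        repo_id=repo_id,
        repo_type="space",
        space_sdk="static",
        private=False,
        exist_ok=True,
    )

    # The card's YAML front matter links the Space back to the source dataset.
    card_content = (
        "---\n"
        f"title: {dataset_id} topic modeling\n"
        "sdk: static\n"
        "pinned: false\n"
        "datasets:\n"
        f"- {dataset_id}\n"
        "---\n"
    )
    SpaceCard(content=card_content).push_to_hub(repo_id=repo_id, repo_type="space", token=HF_TOKEN)

    # index.html is what a static Space renders at its root URL.
    api.upload_file(
        path_or_fileobj=html_file_path,
        path_in_repo="index.html",
        repo_id=repo_id,
        repo_type="space",
    )
    return f"https://huggingface.co/spaces/{repo_id}"
```

A static Space keeps the artifact cheap to host: it only serves index.html, so there is no runtime to keep alive, and the space_link markdown shown in the UI simply points at that page.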