Hugging Face Spaces — status: Runtime error
Commit: "Adding num rows"
Browse files
app.py
CHANGED
|
@@ -15,7 +15,7 @@ from sentence_transformers import SentenceTransformer
|
|
| 15 |
from dotenv import load_dotenv
|
| 16 |
import os
|
| 17 |
|
| 18 |
-
import spaces
|
| 19 |
import gradio as gr
|
| 20 |
|
| 21 |
|
|
@@ -81,7 +81,7 @@ def get_docs_from_parquet(parquet_urls, column, offset, limit):
|
|
| 81 |
return df[column].tolist()
|
| 82 |
|
| 83 |
|
| 84 |
-
@spaces.GPU
|
| 85 |
def calculate_embeddings(docs):
|
| 86 |
return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
|
| 87 |
|
|
@@ -92,7 +92,7 @@ def calculate_n_neighbors_and_components(n_rows):
|
|
| 92 |
return n_neighbors, n_components
|
| 93 |
|
| 94 |
|
| 95 |
-
@spaces.GPU
|
| 96 |
def fit_model(docs, embeddings, n_neighbors, n_components):
|
| 97 |
global global_topic_model
|
| 98 |
|
|
@@ -116,11 +116,11 @@ def fit_model(docs, embeddings, n_neighbors, n_components):
|
|
| 116 |
new_model = BERTopic(
|
| 117 |
language="english",
|
| 118 |
# Sub-models
|
| 119 |
-
embedding_model=sentence_model,
|
| 120 |
-
umap_model=umap_model,
|
| 121 |
-
hdbscan_model=hdbscan_model,
|
| 122 |
-
|
| 123 |
-
|
| 124 |
# Hyperparameters
|
| 125 |
top_n_words=10,
|
| 126 |
verbose=True,
|
|
@@ -162,12 +162,16 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
| 162 |
all_docs = []
|
| 163 |
reduced_embeddings_list = []
|
| 164 |
topics_info, topic_plot = None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
yield (
|
| 166 |
gr.DataFrame(value=[], interactive=False, visible=True),
|
| 167 |
gr.Plot(value=None, visible=True),
|
| 168 |
-
gr.Label(
|
| 169 |
-
{f"⚙️ Generating topics {dataset}": rows_processed / limit}, visible=True
|
| 170 |
-
),
|
| 171 |
)
|
| 172 |
while offset < limit:
|
| 173 |
docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
|
|
@@ -207,10 +211,16 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
| 207 |
rows_processed += len(docs)
|
| 208 |
progress = min(rows_processed / limit, 1.0)
|
| 209 |
logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
yield (
|
| 211 |
topics_info,
|
| 212 |
topic_plot,
|
| 213 |
-
gr.Label({…}, visible=True),  [removed line truncated in page capture — full argument not recoverable]
|
| 214 |
)
|
| 215 |
|
| 216 |
offset += CHUNK_SIZE
|
|
@@ -219,7 +229,9 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
| 219 |
yield (
|
| 220 |
topics_info,
|
| 221 |
topic_plot,
|
| 222 |
-
gr.Label(  [removed line — call arguments lost in page capture]
|
|
|
|
|
|
|
| 223 |
)
|
| 224 |
cuda.empty_cache()
|
| 225 |
|
|
@@ -260,7 +272,7 @@ with gr.Blocks() as demo:
|
|
| 260 |
|
| 261 |
generate_button = gr.Button("Generate Topics", variant="primary")
|
| 262 |
|
| 263 |
-
gr.Markdown("## …")  [removed heading truncated in page capture; replaced by "## Data map" on the added side]
|
| 264 |
full_topics_generation_label = gr.Label(visible=False, show_label=False)
|
| 265 |
topics_plot = gr.Plot()
|
| 266 |
with gr.Accordion("Topics Info", open=False):
|
|
|
|
| 15 |
from dotenv import load_dotenv
|
| 16 |
import os
|
| 17 |
|
| 18 |
+
# import spaces
|
| 19 |
import gradio as gr
|
| 20 |
|
| 21 |
|
|
|
|
| 81 |
return df[column].tolist()
|
| 82 |
|
| 83 |
|
| 84 |
+
# @spaces.GPU
|
| 85 |
def calculate_embeddings(docs):
|
| 86 |
return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
|
| 87 |
|
|
|
|
| 92 |
return n_neighbors, n_components
|
| 93 |
|
| 94 |
|
| 95 |
+
# @spaces.GPU
|
| 96 |
def fit_model(docs, embeddings, n_neighbors, n_components):
|
| 97 |
global global_topic_model
|
| 98 |
|
|
|
|
| 116 |
new_model = BERTopic(
|
| 117 |
language="english",
|
| 118 |
# Sub-models
|
| 119 |
+
embedding_model=sentence_model, # Step 1 - Extract embeddings
|
| 120 |
+
umap_model=umap_model, # Step 2 - UMAP model
|
| 121 |
+
hdbscan_model=hdbscan_model, # Step 3 - Cluster reduced embeddings
|
| 122 |
+
vectorizer_model=vectorizer_model, # Step 4 - Tokenize topics
|
| 123 |
+
representation_model=representation_model, # Step 5 - Label topics
|
| 124 |
# Hyperparameters
|
| 125 |
top_n_words=10,
|
| 126 |
verbose=True,
|
|
|
|
| 162 |
all_docs = []
|
| 163 |
reduced_embeddings_list = []
|
| 164 |
topics_info, topic_plot = None, None
|
| 165 |
+
full_processing = split_rows <= MAX_ROWS
|
| 166 |
+
message = (
|
| 167 |
+
f"⚙️ Processing full dataset: 0 of ({split_rows} rows)"
|
| 168 |
+
if full_processing
|
| 169 |
+
else f"⚙️ Processing partial dataset 0 of ({limit} rows)"
|
| 170 |
+
)
|
| 171 |
yield (
|
| 172 |
gr.DataFrame(value=[], interactive=False, visible=True),
|
| 173 |
gr.Plot(value=None, visible=True),
|
| 174 |
+
gr.Label({message: rows_processed / limit}, visible=True),
|
|
|
|
|
|
|
| 175 |
)
|
| 176 |
while offset < limit:
|
| 177 |
docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
|
|
|
|
| 211 |
rows_processed += len(docs)
|
| 212 |
progress = min(rows_processed / limit, 1.0)
|
| 213 |
logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
|
| 214 |
+
message = (
|
| 215 |
+
f"⚙️ Processing full dataset: {rows_processed} of {limit}"
|
| 216 |
+
if full_processing
|
| 217 |
+
else f"⚙️ Processing partial dataset: {rows_processed} of {limit} rows"
|
| 218 |
+
)
|
| 219 |
+
|
| 220 |
yield (
|
| 221 |
topics_info,
|
| 222 |
topic_plot,
|
| 223 |
+
gr.Label({message: progress}, visible=True),
|
| 224 |
)
|
| 225 |
|
| 226 |
offset += CHUNK_SIZE
|
|
|
|
| 229 |
yield (
|
| 230 |
topics_info,
|
| 231 |
topic_plot,
|
| 232 |
+
gr.Label(
|
| 233 |
+
{f"✅ Done: {rows_processed} rows have been processed": 1.0}, visible=True
|
| 234 |
+
),
|
| 235 |
)
|
| 236 |
cuda.empty_cache()
|
| 237 |
|
|
|
|
| 272 |
|
| 273 |
generate_button = gr.Button("Generate Topics", variant="primary")
|
| 274 |
|
| 275 |
+
gr.Markdown("## Data map")
|
| 276 |
full_topics_generation_label = gr.Label(visible=False, show_label=False)
|
| 277 |
topics_plot = gr.Plot()
|
| 278 |
with gr.Accordion("Topics Info", open=False):
|