Spaces:
Runtime error
Fix for small datasets and custom topics
Browse files
app.py
CHANGED
|
@@ -152,7 +152,7 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
| 152 |
base_model = None
|
| 153 |
all_docs = []
|
| 154 |
reduced_embeddings_list = []
|
| 155 |
-
|
| 156 |
while offset < limit:
|
| 157 |
docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
|
| 158 |
if not docs:
|
|
@@ -164,11 +164,13 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
| 164 |
|
| 165 |
embeddings = calculate_embeddings(docs)
|
| 166 |
base_model, _ = fit_model(base_model, docs, embeddings)
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
|
|
|
|
|
|
| 172 |
|
| 173 |
reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
|
| 174 |
reduced_embeddings_list.append(reduced_embeddings)
|
|
@@ -189,7 +191,7 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
| 189 |
offset += chunk_size
|
| 190 |
|
| 191 |
logging.info("Finished processing all data")
|
| 192 |
- return
|
| 193 |
|
| 194 |
|
| 195 |
with gr.Blocks() as demo:
|
|
|
|
| 152 |
base_model = None
|
| 153 |
all_docs = []
|
| 154 |
reduced_embeddings_list = []
|
| 155 |
+ topics_info, topic_plot = None, None
|
| 156 |
while offset < limit:
|
| 157 |
docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
|
| 158 |
if not docs:
|
|
|
|
| 164 |
|
| 165 |
embeddings = calculate_embeddings(docs)
|
| 166 |
base_model, _ = fit_model(base_model, docs, embeddings)
|
| 167 |
+
|
| 168 |
+ repr_model_topics = {
|
| 169 |
+ key: label[0][0].split("\n")[0]
|
| 170 |
+ for key, label in base_model.get_topics(full=True)["Llama2"].items()
|
| 171 |
+ }
|
| 172 |
+
|
| 173 |
+ base_model.set_topic_labels(repr_model_topics)
|
| 174 |
|
| 175 |
reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
|
| 176 |
reduced_embeddings_list.append(reduced_embeddings)
|
|
|
|
| 191 |
offset += chunk_size
|
| 192 |
|
| 193 |
logging.info("Finished processing all data")
|
| 194 |
+ return topics_info, topic_plot
|
| 195 |
|
| 196 |
|
| 197 |
with gr.Blocks() as demo:
|