Change parameters by dataset size
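In short: the fixed-floor `calculate_n_neighbors` helper is replaced by `calculate_n_neighbors_and_components`, so UMAP's `n_neighbors`/`n_components` and HDBSCAN's `min_cluster_size` now scale with the number of rows actually processed. A minimal sketch of how the new sizing rules behave (the formulas are copied from the diff below; the row counts are illustrative only):

# Sketch of the sizing rules this commit introduces; formulas are copied from
# the diff below, the example row counts are illustrative only.
def calculate_n_neighbors_and_components(n_rows):
    n_neighbors = min(max(n_rows // 20, 15), 100)  # proportional to rows, clamped to [15, 100]
    n_components = 10 if n_rows > 1000 else 5      # more UMAP dimensions for larger datasets
    return n_neighbors, n_components

for n_rows in (200, 1000, 50000):
    n_neighbors, n_components = calculate_n_neighbors_and_components(n_rows)
    min_cluster_size = max(5, n_neighbors // 2)    # mirrors the HDBSCAN change in fit_model
    print(n_rows, n_neighbors, n_components, min_cluster_size)
# 200   -> n_neighbors=15,  n_components=5,  min_cluster_size=7
# 1000  -> n_neighbors=50,  n_components=5,  min_cluster_size=25
# 50000 -> n_neighbors=100, n_components=10, min_cluster_size=50

Because `n_neighbors` is clamped to [15, 100], very small datasets no longer fall back to the old floor of 2, and very large ones stop growing the neighborhood indefinitely.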
app.py CHANGED

@@ -2,12 +2,10 @@ import requests
 import logging
 import duckdb
 import numpy as np
-
+from torch import cuda
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
 from bertopic import BERTopic
-from bertopic.representation import (
-    KeyBERTInspired,
-)
+from bertopic.representation import KeyBERTInspired
 from umap import UMAP
 from hdbscan import HDBSCAN
 from sklearn.feature_extraction.text import CountVectorizer
@@ -21,6 +19,11 @@ import os
 import gradio as gr
 
 
+"""
+TODOs:
+- Try for small dataset <1000 rows
+"""
+
 load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
 assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
@@ -55,7 +58,7 @@ def get_split_rows(dataset, config, split):
         None,
     )
     if split_size is None:
-        raise Exception(f"Error fetching split{split} in config {config}")
+        raise Exception(f"Error fetching split {split} in config {config}")
     return split_size["num_rows"]
 
 
@@ -83,27 +86,37 @@ def calculate_embeddings(docs):
     return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
 
 
+# Adjust n_neighbors and n_components based on dataset size
+def calculate_n_neighbors_and_components(n_rows):
+    # Ensure n_neighbors is proportional to the dataset size, with reasonable limits
+    n_neighbors = min(max(n_rows // 20, 15), 100)
+    n_components = 10 if n_rows > 1000 else 5  # Higher components for larger datasets
+    return n_neighbors, n_components
+
+
 # @spaces.GPU
-def fit_model(docs, embeddings, n_neighbors):
+def fit_model(docs, embeddings, n_neighbors, n_components):
     global global_topic_model
 
     umap_model = UMAP(
         n_neighbors=n_neighbors,
-        n_components=
+        n_components=n_components,
         min_dist=0.0,
         metric="cosine",
         random_state=42,
     )
 
     hdbscan_model = HDBSCAN(
-        min_cluster_size=
+        min_cluster_size=max(
+            5, n_neighbors // 2
+        ),  # Reducing min_cluster_size for fewer outliers
         metric="euclidean",
         cluster_selection_method="eom",
         prediction_data=True,
     )
 
     new_model = BERTopic(
-        "english",
+        language="english",
         # Sub-models
         embedding_model=sentence_model,
         umap_model=umap_model,
@@ -113,7 +126,7 @@ def fit_model(docs, embeddings, n_neighbors):
         # Hyperparameters
         top_n_words=10,
         verbose=True,
-        min_topic_size=n_neighbors,  #
+        min_topic_size=n_neighbors,  # Coherent with n_neighbors?
     )
     logging.info("Fitting new model")
     new_model.fit(docs, embeddings)
@@ -124,10 +137,6 @@ def fit_model(docs, embeddings, n_neighbors):
     logging.info("Global model updated")
 
 
-def calculate_n_neighbors(n_rows):
-    return max(n_rows // 20, 2)
-
-
 def generate_topics(dataset, config, split, column, nested_column):
     logging.info(
         f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
@@ -138,11 +147,11 @@ def generate_topics(dataset, config, split, column, nested_column):
     logging.info(f"Split rows: {split_rows}")
 
     limit = min(split_rows, MAX_ROWS)
-    n_neighbors = calculate_n_neighbors(limit)
+    n_neighbors, n_components = calculate_n_neighbors_and_components(limit)
 
     reduce_umap_model = UMAP(
         n_neighbors=n_neighbors,
-        n_components=2,
+        n_components=2,  # For visualization, keeping it at 2 (2D)
         min_dist=0.0,
         metric="cosine",
         random_state=42,
@@ -172,7 +181,7 @@ def generate_topics(dataset, config, split, column, nested_column):
     )
 
     embeddings = calculate_embeddings(docs)
-    fit_model(docs, embeddings, n_neighbors)
+    fit_model(docs, embeddings, n_neighbors, n_components)
 
     if base_model is None:
         base_model = global_topic_model
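For orientation, here is a rough end-to-end sketch of how the pieces fit together after this change: derive the parameters from the sampled row count, build the UMAP and HDBSCAN sub-models, and fit BERTopic. The `docs` sample and the `all-MiniLM-L6-v2` model name are assumptions for illustration, not taken from app.py; the sub-model arguments mirror the diff above.

from bertopic import BERTopic
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from umap import UMAP

# Hypothetical sample; in the Space, docs come from the selected dataset column.
docs = [f"example document number {i}" for i in range(2000)]
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model

# Size-dependent parameters, inlined from calculate_n_neighbors_and_components above.
n_neighbors = min(max(len(docs) // 20, 15), 100)
n_components = 10 if len(docs) > 1000 else 5

umap_model = UMAP(
    n_neighbors=n_neighbors,
    n_components=n_components,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)
hdbscan_model = HDBSCAN(
    min_cluster_size=max(5, n_neighbors // 2),
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)
topic_model = BERTopic(
    language="english",
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    top_n_words=10,
    min_topic_size=n_neighbors,
    verbose=True,
)

# Precompute embeddings once, then fit on docs plus embeddings, as app.py does.
embeddings = sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
topic_model.fit(docs, embeddings)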