Adding progress bar

app.py
CHANGED
@@ -38,6 +38,9 @@ logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 
+MAX_ROWS = 1_000
+CHUNK_SIZE = 300
+
 
 session = requests.Session()
 sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -113,6 +116,22 @@ reduce_umap_model = UMAP(
 global_topic_model = None
 
 
+def get_split_rows(dataset, config, split):
+    config_size = session.get(
+        f"https://datasets-server.huggingface.co/size?dataset={dataset}&config={config}",
+        timeout=20,
+    ).json()
+    if "error" in config_size:
+        raise Exception(f"Error fetching config size: {config_size['error']}")
+    split_size = next(
+        (s for s in config_size["size"]["splits"] if s["split"] == split),
+        None,
+    )
+    if split_size is None:
+        raise Exception(f"Error fetching split {split} in config {config}")
+    return split_size["num_rows"]
+
+
 def get_parquet_urls(dataset, config, split):
     parquet_files = session.get(
         f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}&split={split}",
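For context, the new helper leans on the dataset-viewer /size endpoint, which reports per-split row counts for a config. A minimal standalone sketch of the same call (the dataset, config, and split names below are placeholders, not taken from this Space):

import requests

def fetch_split_rows(dataset: str, config: str, split: str) -> int:
    # Same endpoint the new get_split_rows helper queries.
    resp = requests.get(
        "https://datasets-server.huggingface.co/size",
        params={"dataset": dataset, "config": config},
        timeout=20,
    )
    payload = resp.json()
    if "error" in payload:
        raise RuntimeError(f"Error fetching config size: {payload['error']}")
    # Row counts are nested under size.splits, one entry per split.
    split_size = next(
        (s for s in payload["size"]["splits"] if s["split"] == split), None
    )
    if split_size is None:
        raise RuntimeError(f"Split {split} not found in config {config}")
    return split_size["num_rows"]

# Placeholder arguments; any public Hub dataset works.
print(fetch_split_rows("stanfordnlp/imdb", "plain_text", "train"))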
@@ -170,9 +189,13 @@ def generate_topics(dataset, config, split, column, nested_column):
     )
 
     parquet_urls = get_parquet_urls(dataset, config, split)
-    limit = 1_000
-    chunk_size = 300
+    split_rows = get_split_rows(dataset, config, split)
+    logging.info(f"Split rows: {split_rows}")
+
+    limit = min(split_rows, MAX_ROWS)
     offset = 0
+    rows_processed = 0
+
     base_model = None
     all_docs = []
     reduced_embeddings_list = []
@@ -180,15 +203,17 @@ def generate_topics(dataset, config, split, column, nested_column):
     yield (
         gr.DataFrame(interactive=False, visible=True),
         gr.Plot(visible=True),
-        gr.Label(visible=True),
+        gr.Label(
+            {f"⚙️ Generating topics {dataset}": rows_processed / limit}, visible=True
+        ),
     )
     while offset < limit:
-        docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
+        docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
         if not docs:
             break
 
         logging.info(
-            f"----> Processing chunk: {offset=} {chunk_size=}"
+            f"----> Processing chunk: {offset=} {CHUNK_SIZE=} with {len(docs)} docs"
         )
 
         embeddings = calculate_embeddings(docs)
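get_docs_from_parquet itself is not shown in this diff. A hypothetical implementation consistent with its call signature could window the split's parquet shards with DuckDB's read_parquet plus LIMIT/OFFSET (sketch only; the real function is defined elsewhere in app.py):

import duckdb

def get_docs_from_parquet(parquet_urls, column, offset, limit):
    # Hypothetical sketch: DuckDB can read a list of (remote) parquet
    # files directly and return one window of rows per call.
    query = (
        f"SELECT {column} FROM read_parquet({parquet_urls!r}) "
        f"LIMIT {limit} OFFSET {offset}"
    )
    return duckdb.sql(query).df()[column].tolist()

# e.g. docs = get_docs_from_parquet(parquet_urls, "text", 0, 300)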
@@ -225,15 +250,17 @@ def generate_topics(dataset, config, split, column, nested_column):
         )
 
         logging.info(f"Topics: {repr_model_topics}")
-        progress = min(offset / limit, 1.0)
 
+        rows_processed += len(docs)
+        progress = min(rows_processed / limit, 1.0)
+        logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
         yield (
             topics_info,
             topic_plot,
             gr.Label({f"⚙️ Generating topics {dataset}": progress}, visible=True),
         )
 
-        offset += chunk_size
+        offset += CHUNK_SIZE
 
     logging.info("Finished processing all data")
     cuda.empty_cache()  # Clear cache at the end of each chunk
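Taken together, the progress mechanics are plain chunked pagination: cap the work at MAX_ROWS, advance offset by CHUNK_SIZE per iteration, and after each chunk yield rows_processed / limit into a gr.Label, whose {label: confidence} bar doubles as a progress indicator. A stripped-down sketch of that pattern, with the document fetch and topic modeling stubbed out:

import gradio as gr

MAX_ROWS = 1_000
CHUNK_SIZE = 300

def generate(total_rows):
    total_rows = int(total_rows)
    limit = min(total_rows, MAX_ROWS)
    offset = 0
    rows_processed = 0
    while offset < limit:
        # Stand-in for get_docs_from_parquet + the topic-modeling step.
        docs = ["doc"] * min(CHUNK_SIZE, limit - offset)
        if not docs:
            break
        rows_processed += len(docs)
        progress = min(rows_processed / limit, 1.0)
        # gr.Label renders {text: float in [0, 1]} as a confidence bar,
        # which is what the Space uses as a progress bar.
        yield gr.Label({"⚙️ Generating topics": progress}, visible=True)
        offset += CHUNK_SIZE

demo = gr.Interface(fn=generate, inputs=gr.Number(value=1_000), outputs=gr.Label())
if __name__ == "__main__":
    demo.launch()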