Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,6 +19,7 @@ logging.basicConfig(
|
|
| 19 |
|
| 20 |
|
| 21 |
session = requests.Session()
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
def get_parquet_urls(dataset, config, split):
|
|
@@ -41,7 +42,7 @@ def get_docs_from_parquet(parquet_urls, column, offset, limit):
|
|
| 41 |
|
| 42 |
|
| 43 |
@spaces.GPU
|
| 44 |
-
def calculate_embeddings(sentence_model, docs):
|
| 45 |
embeddings = sentence_model.encode(docs, show_progress_bar=True, batch_size=100)
|
| 46 |
logging.info(f"Embeddings shape: {embeddings.shape}")
|
| 47 |
return embeddings
|
|
@@ -91,11 +92,10 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
| 91 |
# Create instances of GPU-accelerated UMAP and HDBSCAN
|
| 92 |
# umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
|
| 93 |
# hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True)
|
| 94 |
-
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 95 |
while True:
|
| 96 |
docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
|
| 97 |
logging.info(f"------------> New chunk data {offset=} {chunk_size=}")
|
| 98 |
-
embeddings = calculate_embeddings(sentence_model, docs)
|
| 99 |
offset = offset + chunk_size
|
| 100 |
if not docs or offset >= limit:
|
| 101 |
break
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
session = requests.Session()
|
| 22 |
+
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 23 |
|
| 24 |
|
| 25 |
def get_parquet_urls(dataset, config, split):
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
@spaces.GPU
|
| 45 |
+
def calculate_embeddings(docs):
|
| 46 |
embeddings = sentence_model.encode(docs, show_progress_bar=True, batch_size=100)
|
| 47 |
logging.info(f"Embeddings shape: {embeddings.shape}")
|
| 48 |
return embeddings
|
|
|
|
| 92 |
# Create instances of GPU-accelerated UMAP and HDBSCAN
|
| 93 |
# umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
|
| 94 |
# hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True)
|
|
|
|
| 95 |
while True:
|
| 96 |
docs = get_docs_from_parquet(parquet_urls, column, offset, chunk_size)
|
| 97 |
logging.info(f"------------> New chunk data {offset=} {chunk_size=}")
|
| 98 |
+
embeddings = calculate_embeddings(docs)
|
| 99 |
offset = offset + chunk_size
|
| 100 |
if not docs or offset >= limit:
|
| 101 |
break
|