Spaces:
make things persist
- app.py +10 -1
- modular_graph_and_candidates.py +55 -19
app.py
CHANGED

@@ -52,7 +52,16 @@ def _escape_srcdoc(text: str) -> str:
 
 
 def run(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
-    repo_path = clone_or_cache(repo_url)
+    # Check if we can use cached embeddings for embedding similarity
+    embeddings_cache = Path("embeddings_cache.npz")
+
+    if sim_method == "embedding" and embeddings_cache.exists():
+        print("📦 Using cached embeddings - skipping repo download")
+        # Use a dummy path since we won't need the actual repo
+        repo_path = Path("/tmp/dummy")
+    else:
+        print("📥 Downloading/updating repository")
+        repo_path = clone_or_cache(repo_url)
 
     graph = build_graph_json(
         transformers_dir=repo_path,
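Both files rely on the same on-disk cache layout: a single .npz archive holding the float32 embedding matrix plus, in matching order, the names of the models encoded so far. A minimal sketch of that round trip, using made-up toy values (only the file name embeddings_cache.npz comes from the diff):

# Sketch of the cache layout the diff relies on (illustrative values only).
import numpy as np
from pathlib import Path

cache = Path("embeddings_cache.npz")

# Writing: one float32 matrix plus the names encoded so far, in order.
embeddings = np.random.rand(3, 768).astype(np.float32)    # toy data
names = np.array(["bert", "gpt2", "llama"], dtype=object)  # toy data
np.savez(cache, embeddings=embeddings, names=names)

# Reading: allow_pickle=True is needed because names is an object array.
loaded = np.load(cache, allow_pickle=True)
assert list(loaded["names"]) == ["bert", "gpt2", "llama"]
assert loaded["embeddings"].shape == (3, 768)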
modular_graph_and_candidates.py
CHANGED

@@ -130,22 +130,22 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     print(f"Encoding embeddings for {len(names)} models...")
     batch_size = 4  # keep your default
 
-    # ── …
-    …
+    # ── persistent embeddings storage ──────────────────────────────────────────
+    embeddings_path = Path("embeddings_cache.npz")
     start_idx = 0
     emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
 
-    if …
+    if embeddings_path.exists():
         try:
-            …
-            …
-            if names[:len(…
-                loaded = …
+            cached = np.load(embeddings_path, allow_pickle=True)
+            cached_names = list(cached["names"])
+            if names[:len(cached_names)] == cached_names:
+                loaded = cached["embeddings"].astype(np.float32)
                 all_embeddings.append(loaded)
-                start_idx = len(…
-                print(f"…
+                start_idx = len(cached_names)
+                print(f"📦 Using cached embeddings for {start_idx}/{len(names)} models")
         except Exception as e:
-            print(f"⚠️ Failed to load …
+            print(f"⚠️ Failed to load cached embeddings: {type(e).__name__}: {e}")
     # ───────────────────────────────────────────────────────────────────────────
 
     for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):

@@ -161,16 +161,16 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
 
         all_embeddings.append(emb)
 
-        # save …
+        # save to persistent cache after each batch
         try:
             cur = np.vstack(all_embeddings).astype(np.float32)
             np.savez(
-                …
+                embeddings_path,
                 embeddings=cur,
                 names=np.array(names[:i+len(batch_names)], dtype=object),
             )
         except Exception as e:
-            print(f"⚠️ Failed to write …
+            print(f"⚠️ Failed to write embeddings cache: {type(e).__name__}: {e}")
 
         if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
             torch.cuda.empty_cache()

@@ -193,14 +193,42 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
             if s >= thr:
                 out[(processed_names[i], processed_names[j])] = s
 
-
-    try:
-        ckpt_path.unlink()
-    except Exception:
-        pass
-
+    print(f"💾 Embeddings saved to {embeddings_path}")
     return out
 
+def compute_similarities_from_cache(threshold: float) -> Dict[Tuple[str, str], float]:
+    """Compute similarities from cached embeddings without reprocessing."""
+    embeddings_path = Path("embeddings_cache.npz")
+
+    if not embeddings_path.exists():
+        return {}
+
+    try:
+        cached = np.load(embeddings_path, allow_pickle=True)
+        embeddings = cached["embeddings"].astype(np.float32)
+        names = list(cached["names"])
+
+        # Normalize embeddings
+        norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
+        embeddings = embeddings / norms
+
+        # Compute similarities
+        sims_mat = embeddings @ embeddings.T
+
+        out = {}
+        for i in range(len(names)):
+            for j in range(i + 1, len(names)):
+                s = float(sims_mat[i, j])
+                if s >= threshold:
+                    out[(names[i], names[j])] = s
+
+        print(f"⚡ Computed {len(out)} similarities from cache (threshold: {threshold})")
+        return out
+
+    except Exception as e:
+        print(f"⚠️ Failed to compute from cache: {e}")
+        return {}
+
 
 
 

@@ -269,6 +297,14 @@ def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str,
     if sim_method == "jaccard":
         return similarity_clusters({m: bags[m] for m in missing}, threshold)
     else:
+        # Try to use cached embeddings first
+        embeddings_path = Path("embeddings_cache.npz")
+        if embeddings_path.exists():
+            cached_sims = compute_similarities_from_cache(threshold)
+            if cached_sims:  # Cache exists and worked
+                return cached_sims
+
+        # Fallback to full computation
        return embedding_similarity_clusters(models_root, missing, threshold)
 
 def build_graph_json(
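For context, a minimal sketch of how the new cache-first path might be exercised from a script, assuming the module above is importable as modular_graph_and_candidates; the threshold, models_root, and missing values here are purely illustrative:

# Sketch: cache-first lookup with fallback to full recomputation.
from pathlib import Path

from modular_graph_and_candidates import (
    compute_similarities_from_cache,
    embedding_similarity_clusters,
)

threshold = 0.5                                               # illustrative value
models_root = Path("transformers/src/transformers/models")   # illustrative path
missing = ["some_model"]                                      # illustrative list

pairs = compute_similarities_from_cache(threshold)
if not pairs:  # no cache on disk, or it could not be read
    pairs = embedding_similarity_clusters(models_root, missing, threshold)

# Print the ten most similar model pairs above the threshold.
for (a, b), score in sorted(pairs.items(), key=lambda kv: -kv[1])[:10]:
    print(f"{a} <-> {b}: {score:.3f}")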