Spaces:
make things persist
- app.py +10 -1
- modular_graph_and_candidates.py +55 -19
app.py
CHANGED

@@ -52,7 +52,16 @@ def _escape_srcdoc(text: str) -> str:
 
 
 def run(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
-    repo_path = clone_or_cache(repo_url)
+    # Check if we can use cached embeddings for embedding similarity
+    embeddings_cache = Path("embeddings_cache.npz")
+
+    if sim_method == "embedding" and embeddings_cache.exists():
+        print("📦 Using cached embeddings - skipping repo download")
+        # Use a dummy path since we won't need the actual repo
+        repo_path = Path("/tmp/dummy")
+    else:
+        print("📥 Downloading/updating repository")
+        repo_path = clone_or_cache(repo_url)
 
     graph = build_graph_json(
         transformers_dir=repo_path,
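Both files rely on the same on-disk cache layout: a single .npz archive holding the float32 embedding matrix plus, in matching order, the names of the models encoded so far. A minimal sketch of that round trip, using made-up toy values (only the file name embeddings_cache.npz comes from the diff):

# Sketch of the cache layout the diff relies on (illustrative values only).
import numpy as np
from pathlib import Path

cache = Path("embeddings_cache.npz")

# Writing: one float32 matrix plus the names encoded so far, in order.
embeddings = np.random.rand(3, 768).astype(np.float32)    # toy data
names = np.array(["bert", "gpt2", "llama"], dtype=object)  # toy data
np.savez(cache, embeddings=embeddings, names=names)

# Reading: allow_pickle=True is needed because names is an object array.
loaded = np.load(cache, allow_pickle=True)
assert list(loaded["names"]) == ["bert", "gpt2", "llama"]
assert loaded["embeddings"].shape == (3, 768)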
modular_graph_and_candidates.py
CHANGED

@@ -130,22 +130,22 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     print(f"Encoding embeddings for {len(names)} models...")
     batch_size = 4  # keep your default
 
-    # ── …
-    …
+    # ── persistent embeddings storage ──────────────────────────────────────────
+    embeddings_path = Path("embeddings_cache.npz")
     start_idx = 0
     emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
 
-    if …
+    if embeddings_path.exists():
         try:
-            …
-            …
-            if names[:len(…
-                loaded = …
+            cached = np.load(embeddings_path, allow_pickle=True)
+            cached_names = list(cached["names"])
+            if names[:len(cached_names)] == cached_names:
+                loaded = cached["embeddings"].astype(np.float32)
                 all_embeddings.append(loaded)
-                start_idx = len(…
-                print(f"…
+                start_idx = len(cached_names)
+                print(f"📦 Using cached embeddings for {start_idx}/{len(names)} models")
         except Exception as e:
-            print(f"⚠️ Failed to load …
+            print(f"⚠️ Failed to load cached embeddings: {type(e).__name__}: {e}")
     # ───────────────────────────────────────────────────────────────────────────
 
     for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):

@@ -161,16 +161,16 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
 
         all_embeddings.append(emb)
 
-        # save …
+        # save to persistent cache after each batch
         try:
             cur = np.vstack(all_embeddings).astype(np.float32)
             np.savez(
-                …
+                embeddings_path,
                 embeddings=cur,
                 names=np.array(names[:i+len(batch_names)], dtype=object),
             )
         except Exception as e:
-            print(f"⚠️ Failed to write …
+            print(f"⚠️ Failed to write embeddings cache: {type(e).__name__}: {e}")
 
         if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
             torch.cuda.empty_cache()

@@ -193,14 +193,42 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
             if s >= thr:
                 out[(processed_names[i], processed_names[j])] = s
 
-
-    try:
-        ckpt_path.unlink()
-    except Exception:
-        pass
-
+    print(f"💾 Embeddings saved to {embeddings_path}")
     return out
 
+def compute_similarities_from_cache(threshold: float) -> Dict[Tuple[str, str], float]:
+    """Compute similarities from cached embeddings without reprocessing."""
+    embeddings_path = Path("embeddings_cache.npz")
+
+    if not embeddings_path.exists():
+        return {}
+
+    try:
+        cached = np.load(embeddings_path, allow_pickle=True)
+        embeddings = cached["embeddings"].astype(np.float32)
+        names = list(cached["names"])
+
+        # Normalize embeddings
+        norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
+        embeddings = embeddings / norms
+
+        # Compute similarities
+        sims_mat = embeddings @ embeddings.T
+
+        out = {}
+        for i in range(len(names)):
+            for j in range(i + 1, len(names)):
+                s = float(sims_mat[i, j])
+                if s >= threshold:
+                    out[(names[i], names[j])] = s
+
+        print(f"⚡ Computed {len(out)} similarities from cache (threshold: {threshold})")
+        return out
+
+    except Exception as e:
+        print(f"⚠️ Failed to compute from cache: {e}")
+        return {}
+
 
 
 

@@ -269,6 +297,14 @@ def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str,
     if sim_method == "jaccard":
         return similarity_clusters({m: bags[m] for m in missing}, threshold)
     else:
+        # Try to use cached embeddings first
+        embeddings_path = Path("embeddings_cache.npz")
+        if embeddings_path.exists():
+            cached_sims = compute_similarities_from_cache(threshold)
+            if cached_sims:  # Cache exists and worked
+                return cached_sims
+
+        # Fallback to full computation
        return embedding_similarity_clusters(models_root, missing, threshold)
 
 def build_graph_json(
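For context, a minimal sketch of how the new cache-first path might be exercised from a script, assuming the module above is importable as modular_graph_and_candidates; the threshold, models_root, and missing values here are purely illustrative:

# Sketch: cache-first lookup with fallback to full recomputation.
from pathlib import Path

from modular_graph_and_candidates import (
    compute_similarities_from_cache,
    embedding_similarity_clusters,
)

threshold = 0.5                                               # illustrative value
models_root = Path("transformers/src/transformers/models")   # illustrative path
missing = ["some_model"]                                      # illustrative list

pairs = compute_similarities_from_cache(threshold)
if not pairs:  # no cache on disk, or it could not be read
    pairs = embedding_similarity_clusters(models_root, missing, threshold)

# Print the ten most similar model pairs above the threshold.
for (a, b), score in sorted(pairs.items(), key=lambda kv: -kv[1])[:10]:
    print(f"{a} <-> {b}: {score:.3f}")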