Add longer seqlens
modular_graph_and_candidates.py
CHANGED
@@ -96,10 +96,21 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
 
 @spaces.GPU
 def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
-    model = SentenceTransformer("codesage/codesage-large-v2", trust_remote_code=True)
-
-
+    model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
+
+    # Hard-cap by backend max positions (prevents IndexError in self.wpe)
+    try:
+        cfg = model[0].auto_model.config
+        pos_limit = int(getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings")))
+    except Exception:
+        pos_limit = 1024  # conservative fallback if config is odd
 
+    seq_len = min(pos_limit, 2048)  # optional extra ceiling if pos_limit is huge
+    model.max_seq_length = seq_len  # SentenceTransformer wrapper
+    model[0].max_seq_length = seq_len  # its Transformer submodule actually used for tokenize()
+    model[0].tokenizer.model_max_length = seq_len  # ensure tokenizer truncates
+
+    texts = {}
     for name in tqdm(missing, desc="Reading modeling files"):
         code = ""
         for py in (models_root / name).rglob("modeling_*.py"):
@@ -113,23 +124,27 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     all_embeddings = []
 
     print("Encoding embeddings...")
-    batch_size = 8
+    batch_size = 8
     for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
         batch = [texts[n] for n in names[i:i+batch_size]]
         emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
         all_embeddings.append(emb)
 
-
+    # Cosine similarity requires normalized vectors; SentenceTransformers doesn't always return them normalized
+    import numpy as np
+    embeddings = np.vstack(all_embeddings).astype(np.float32)
+    norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
+    embeddings = embeddings / norms
 
     print("Computing pairwise similarities...")
-
+    sims_mat = embeddings @ embeddings.T
 
     out = {}
     for i in range(len(names)):
         for j in range(i + 1, len(names)):
-            s =
+            s = float(sims_mat[i, j])
             if s >= thr:
-                out[(names[i], names[j])] =
+                out[(names[i], names[j])] = s
     return out
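Why the sequence-length cap matters: the diff's own comment points at self.wpe, the GPT-2-style name for learned absolute position embeddings, so any input longer than the backbone's n_positions / max_position_embeddings indexes past the position table and raises an IndexError. Below is a minimal sketch of the same guard packaged as a reusable helper; the model name, the 2048 ceiling, and the model[0] / tokenizer attributes come from the diff, while the helper name load_capped_encoder and its ceiling parameter are invented for illustration.

from sentence_transformers import SentenceTransformer

def load_capped_encoder(model_name: str = "codesage/codesage-large-v2",
                        ceiling: int = 2048) -> SentenceTransformer:
    """Load a SentenceTransformer and cap its sequence length at the backbone's
    position-embedding limit so long modeling files are truncated instead of crashing."""
    model = SentenceTransformer(model_name, trust_remote_code=True)

    # model[0] is the Transformer module; its Hugging Face backbone config names the
    # position limit either n_positions (GPT-2 style) or max_position_embeddings.
    try:
        cfg = model[0].auto_model.config
        pos_limit = int(getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings")))
    except Exception:
        pos_limit = 1024  # conservative fallback if the config is unusual

    seq_len = min(pos_limit, ceiling)
    model.max_seq_length = seq_len                 # wrapper-level setting
    model[0].max_seq_length = seq_len              # module whose tokenize() is actually used
    model[0].tokenizer.model_max_length = seq_len  # make the tokenizer truncate
    return model

Setting all three is deliberately redundant: the wrapper setting is normally forwarded to the first module, but pinning the tokenizer's model_max_length as well guarantees truncation even if the inputs are tokenized directly.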
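The normalization block is what turns the plain matrix product into cosine similarity: once each row is divided by its L2 norm, embeddings @ embeddings.T gives cos(e_i, e_j) for every pair at once. A small self-contained sketch of that step on dummy data (the epsilon and the thresholding mirror the diff; the random vectors and the 0.5 threshold are made up):

import numpy as np

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(5, 8)).astype(np.float32)  # five fake code embeddings

# L2-normalize each row; the small epsilon guards against an all-zero vector.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
unit = embeddings / norms

# For unit-norm rows, the Gram matrix is exactly the pairwise cosine similarity.
sims = unit @ unit.T
assert np.allclose(np.diag(sims), 1.0, atol=1e-5)

# Keep only pairs above the threshold, as embedding_similarity_clusters does.
thr = 0.5
pairs = {(i, j): float(sims[i, j])
         for i in range(len(sims))
         for j in range(i + 1, len(sims))
         if sims[i, j] >= thr}
print(pairs)

An equivalent route would be model.encode(batch, normalize_embeddings=True), which returns unit-length vectors directly, or sentence_transformers.util.cos_sim; the explicit NumPy version in the diff just keeps the normalization visible and independent of encoder defaults.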