Add longer seqlens
modular_graph_and_candidates.py
CHANGED
@@ -96,10 +96,21 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
 
 @spaces.GPU
 def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
-    model = SentenceTransformer("codesage/codesage-large-v2", trust_remote_code=True)
-
-
+    model = SentenceTransformer("codesage/codesage-large-v2", device="cpu", trust_remote_code=True)
+
+    # Hard-cap by backend max positions (prevents IndexError in self.wpe)
+    try:
+        cfg = model[0].auto_model.config
+        pos_limit = int(getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings")))
+    except Exception:
+        pos_limit = 1024  # conservative fallback if config is odd
 
+    seq_len = min(pos_limit, 2048)  # optional extra ceiling if pos_limit is huge
+    model.max_seq_length = seq_len  # SentenceTransformer wrapper
+    model[0].max_seq_length = seq_len  # its Transformer submodule actually used for tokenize()
+    model[0].tokenizer.model_max_length = seq_len  # ensure tokenizer truncates
+
+    texts = {}
     for name in tqdm(missing, desc="Reading modeling files"):
         code = ""
         for py in (models_root / name).rglob("modeling_*.py"):
@@ -113,23 +124,27 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     all_embeddings = []
 
     print("Encoding embeddings...")
-    batch_size = 8
+    batch_size = 8
     for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
         batch = [texts[n] for n in names[i:i+batch_size]]
         emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
         all_embeddings.append(emb)
 
-
+    # Cosine similarity requires normalized vectors; SentenceTransformers doesn't always return them normalized
+    import numpy as np
+    embeddings = np.vstack(all_embeddings).astype(np.float32)
+    norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
+    embeddings = embeddings / norms
 
     print("Computing pairwise similarities...")
-
+    sims_mat = embeddings @ embeddings.T
 
     out = {}
     for i in range(len(names)):
         for j in range(i + 1, len(names)):
-            s =
+            s = float(sims_mat[i, j])
             if s >= thr:
-                out[(names[i], names[j])] =
+                out[(names[i], names[j])] = s
     return out
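Why the sequence-length cap matters: the diff's own comment points at self.wpe, the GPT-2-style name for learned absolute position embeddings, so any input longer than the backbone's n_positions / max_position_embeddings indexes past the position table and raises an IndexError. Below is a minimal sketch of the same guard packaged as a reusable helper; the model name, the 2048 ceiling, and the model[0] / tokenizer attributes come from the diff, while the helper name load_capped_encoder and its ceiling parameter are invented for illustration.

from sentence_transformers import SentenceTransformer

def load_capped_encoder(model_name: str = "codesage/codesage-large-v2",
                        ceiling: int = 2048) -> SentenceTransformer:
    """Load a SentenceTransformer and cap its sequence length at the backbone's
    position-embedding limit so long modeling files are truncated instead of crashing."""
    model = SentenceTransformer(model_name, trust_remote_code=True)

    # model[0] is the Transformer module; its Hugging Face backbone config names the
    # position limit either n_positions (GPT-2 style) or max_position_embeddings.
    try:
        cfg = model[0].auto_model.config
        pos_limit = int(getattr(cfg, "n_positions", getattr(cfg, "max_position_embeddings")))
    except Exception:
        pos_limit = 1024  # conservative fallback if the config is unusual

    seq_len = min(pos_limit, ceiling)
    model.max_seq_length = seq_len                 # wrapper-level setting
    model[0].max_seq_length = seq_len              # module whose tokenize() is actually used
    model[0].tokenizer.model_max_length = seq_len  # make the tokenizer truncate
    return model

Setting all three is deliberately redundant: the wrapper setting is normally forwarded to the first module, but pinning the tokenizer's model_max_length as well guarantees truncation even if the inputs are tokenized directly.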
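The normalization block is what turns the plain matrix product into cosine similarity: once each row is divided by its L2 norm, embeddings @ embeddings.T gives cos(e_i, e_j) for every pair at once. A small self-contained sketch of that step on dummy data (the epsilon and the thresholding mirror the diff; the random vectors and the 0.5 threshold are made up):

import numpy as np

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(5, 8)).astype(np.float32)  # five fake code embeddings

# L2-normalize each row; the small epsilon guards against an all-zero vector.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
unit = embeddings / norms

# For unit-norm rows, the Gram matrix is exactly the pairwise cosine similarity.
sims = unit @ unit.T
assert np.allclose(np.diag(sims), 1.0, atol=1e-5)

# Keep only pairs above the threshold, as embedding_similarity_clusters does.
thr = 0.5
pairs = {(i, j): float(sims[i, j])
         for i in range(len(sims))
         for j in range(i + 1, len(sims))
         if sims[i, j] >= thr}
print(pairs)

An equivalent route would be model.encode(batch, normalize_embeddings=True), which returns unit-length vectors directly, or sentence_transformers.util.cos_sim; the explicit NumPy version in the diff just keeps the normalization visible and independent of encoder defaults.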