Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -24,7 +24,11 @@ def embed_sparse(text: str):
|
|
| 24 |
scores = bm25.get_scores(tokens)
|
| 25 |
# Map each term to its BM25 weight
|
| 26 |
term_weights = {tok: float(score) for tok, score in zip(tokens, scores)}
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# 3. Late-interaction embedding model (ColBERT)
|
| 30 |
colbert_tokenizer = AutoTokenizer.from_pretrained('colbert-ir/colbertv2.0', use_fast=True)
|
|
|
|
| 24 |
scores = bm25.get_scores(tokens)
|
| 25 |
# Map each term to its BM25 weight
|
| 26 |
term_weights = {tok: float(score) for tok, score in zip(tokens, scores)}
|
| 27 |
+
# Sort this text's terms so the output order is deterministic (indices are positions in this sorted list, not IDs in a vocabulary shared across texts)
|
| 28 |
+
terms = sorted(term_weights.keys())
|
| 29 |
+
indices = list(range(len(terms)))
|
| 30 |
+
values = [term_weights[term] for term in terms]
|
| 31 |
+
return {"indices": indices, "values": values, "terms": terms} # 'terms' is optional, for debugging
|
| 32 |
|
| 33 |
# 3. Late-interaction embedding model (ColBERT)
|
| 34 |
colbert_tokenizer = AutoTokenizer.from_pretrained('colbert-ir/colbertv2.0', use_fast=True)
|