Spaces:
Sleeping
Sleeping
Bhushan4829
committed on
Commit
·
c374fa9
1
Parent(s):
7c15e5a
Updated Code
Browse files- requirements.txt +1 -1
- semantic_search.py +41 -17
requirements.txt
CHANGED
|
@@ -2,4 +2,4 @@ fastapi
|
|
| 2 |
sentence_transformers
|
| 3 |
pandas
|
| 4 |
uvicorn
|
| 5 |
-
|
|
|
|
| 2 |
sentence_transformers
|
| 3 |
pandas
|
| 4 |
uvicorn
|
| 5 |
+
numpy
|
semantic_search.py
CHANGED
|
@@ -2,6 +2,7 @@ from fastapi import FastAPI, HTTPException
|
|
| 2 |
from pydantic import BaseModel
|
| 3 |
from sentence_transformers import SentenceTransformer, util
|
| 4 |
import pandas as pd
|
|
|
|
| 5 |
|
| 6 |
# Initialize FastAPI app
|
| 7 |
app = FastAPI()
|
|
@@ -12,15 +13,14 @@ class QueryRequest(BaseModel):
|
|
| 12 |
results: dict
|
| 13 |
|
| 14 |
class EnhancedSemanticSearchEvaluator:
|
| 15 |
-
def __init__(self,
|
| 16 |
self.models = {
|
| 17 |
"Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
|
| 18 |
"Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
|
| 19 |
"Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
|
| 20 |
}
|
| 21 |
-
self.relevance_threshold = relevance_threshold
|
| 22 |
self.top_k = top_k
|
| 23 |
-
self.
|
| 24 |
|
| 25 |
def compute_similarity(self, model, query, matches):
|
| 26 |
query_embedding = model.encode(query, convert_to_tensor=True)
|
|
@@ -30,10 +30,44 @@ class EnhancedSemanticSearchEvaluator:
|
|
| 30 |
scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
|
| 31 |
return scores
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
def rank_results(self, model, query, matches):
|
| 34 |
similarity_scores = self.compute_similarity(model, query, matches)
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
|
| 38 |
return ranked_matches
|
| 39 |
|
|
@@ -47,18 +81,8 @@ class EnhancedSemanticSearchEvaluator:
|
|
| 47 |
results_with_scores = []
|
| 48 |
for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
|
| 49 |
doc_id = match['id']
|
| 50 |
-
similarity_score = match['
|
| 51 |
-
|
| 52 |
-
if similarity_score >= 0.7:
|
| 53 |
-
llm_score = 5
|
| 54 |
-
elif similarity_score >= 0.5:
|
| 55 |
-
llm_score = 4
|
| 56 |
-
elif similarity_score >= 0.3:
|
| 57 |
-
llm_score = 3
|
| 58 |
-
elif similarity_score >= 0.1:
|
| 59 |
-
llm_score = 2
|
| 60 |
-
else:
|
| 61 |
-
llm_score = 1
|
| 62 |
|
| 63 |
results_with_scores.append({
|
| 64 |
"Rank": rank,
|
|
|
|
| 2 |
from pydantic import BaseModel
|
| 3 |
from sentence_transformers import SentenceTransformer, util
|
| 4 |
import pandas as pd
|
| 5 |
+
import numpy as np
|
| 6 |
|
| 7 |
# Initialize FastAPI app
|
| 8 |
app = FastAPI()
|
|
|
|
| 13 |
results: dict
|
| 14 |
|
| 15 |
class EnhancedSemanticSearchEvaluator:
|
| 16 |
+
def __init__(self, top_k=300, relevance_threshold=3):
    """Load the sentence-transformer models and store the evaluation settings.

    Args:
        top_k: Maximum number of ranked matches kept when results are
            sliced with ``ranked_matches[:self.top_k]``.
        relevance_threshold: Stored on the instance as-is; its consumer is
            not visible in this part of the file.
    """
    # Label -> checkpoint name; each checkpoint is instantiated once, in this order.
    checkpoints = {
        "Model_1": 'sentence-transformers/msmarco-distilbert-base-v3',
        "Model_2": 'sentence-transformers/all-mpnet-base-v2',
        "Model_3": 'sentence-transformers/paraphrase-MiniLM-L6-v2',
    }
    self.models = {
        label: SentenceTransformer(checkpoint)
        for label, checkpoint in checkpoints.items()
    }
    self.relevance_threshold = relevance_threshold
    self.top_k = top_k
|
| 24 |
|
| 25 |
def compute_similarity(self, model, query, matches):
|
| 26 |
query_embedding = model.encode(query, convert_to_tensor=True)
|
|
|
|
| 30 |
scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
|
| 31 |
return scores
|
| 32 |
|
| 33 |
+
def normalize_scores(self, similarity_scores):
    """Scale similarity scores so that the best score becomes 1.0.

    Every score is divided by the largest score in the list, so the
    relative ordering is preserved whenever that maximum is positive.

    Args:
        similarity_scores: List of numeric similarity scores.

    Returns:
        A new list of scaled scores. An empty input yields an empty list.
        When the largest score is zero or negative the scores are returned
        unscaled (as a new list).
    """
    if not similarity_scores:
        return []

    max_score = max(similarity_scores)
    if max_score <= 0:
        # Dividing by zero would raise, and dividing by a negative maximum
        # would flip the ordering (the worst match would end up largest),
        # so leave the scores untouched in that case.
        return list(similarity_scores)

    return [score / max_score for score in similarity_scores]
|
| 40 |
+
|
| 41 |
+
def compute_dynamic_thresholds(self, normalized_scores):
    """Derive tier cut-offs from the distribution of the given scores.

    Args:
        normalized_scores: Sequence of numeric scores.

    Returns:
        A ``(high, medium, low)`` tuple of floats holding the 90th, 70th
        and 50th percentiles of the scores. An empty input yields
        ``(0.0, 0.0, 0.0)``.
    """
    if len(normalized_scores) == 0:
        # np.percentile has no meaningful answer for an empty sample
        # (NaN or an exception, depending on the NumPy version).
        return 0.0, 0.0, 0.0

    # One call computes all three percentiles over the same data.
    high_threshold, medium_threshold, low_threshold = (
        float(value) for value in np.percentile(normalized_scores, [90, 70, 50])
    )
    return high_threshold, medium_threshold, low_threshold
|
| 49 |
+
|
| 50 |
def rank_results(self, model, query, matches):
    """Score every match against the query and return them best-first.

    Each match is updated in place with two keys: ``'similarity_score'``
    (the normalized similarity) and ``'llm_score'`` (a 1-5 tier label).

    Args:
        model: Passed through to ``compute_similarity``.
        query: Passed through to ``compute_similarity``.
        matches: Sequence of dict-like matches; mutated in place.

    Returns:
        A new list of the matches sorted by ``'similarity_score'``,
        highest first.
    """
    raw_scores = self.compute_similarity(model, query, matches)
    scaled_scores = self.normalize_scores(raw_scores)
    high_cut, medium_cut, low_cut = self.compute_dynamic_thresholds(scaled_scores)

    # Checked from the strictest tier down; the first cut-off a score
    # reaches decides its label, and anything below 0.1 falls to tier 1.
    tiers = ((high_cut, 5), (medium_cut, 4), (low_cut, 3), (0.1, 2))

    for entry, scaled in zip(matches, scaled_scores):
        entry['similarity_score'] = scaled
        entry['llm_score'] = next(
            (label for cutoff, label in tiers if scaled >= cutoff),
            1,
        )

    return sorted(matches, key=lambda entry: entry['similarity_score'], reverse=True)
|
| 73 |
|
|
|
|
| 81 |
results_with_scores = []
|
| 82 |
for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
|
| 83 |
doc_id = match['id']
|
| 84 |
+
similarity_score = match['similarity_score']
|
| 85 |
+
llm_score = match['llm_score']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
results_with_scores.append({
|
| 88 |
"Rank": rank,
|