Spaces:

ashwinradhe
/

my-fastapi-app

Sleeping

App Files Files Community

Bhushan4829 committed on Jan 8

Commit

c374fa9

1 Parent(s): 7c15e5a

Updated Code

Browse files

Files changed (2) hide show

requirements.txt +1 -1
semantic_search.py +41 -17

requirements.txt CHANGED Viewed

@@ -2,4 +2,4 @@ fastapi
 sentence_transformers
 pandas
 uvicorn

 sentence_transformers
 pandas
 uvicorn
+numpy

semantic_search.py CHANGED Viewed

@@ -2,6 +2,7 @@ from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from sentence_transformers import SentenceTransformer, util
 import pandas as pd
 # Initialize FastAPI app
 app = FastAPI()
@@ -12,15 +13,14 @@ class QueryRequest(BaseModel):
     results: dict
 class EnhancedSemanticSearchEvaluator:
-    def __init__(self, relevance_threshold=3, top_k=300, similarity_threshold=0.5):
         self.models = {
             "Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
             "Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
             "Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
         }
-        self.relevance_threshold = relevance_threshold
         self.top_k = top_k
-        self.similarity_threshold = similarity_threshold
     def compute_similarity(self, model, query, matches):
         query_embedding = model.encode(query, convert_to_tensor=True)
@@ -30,10 +30,44 @@ class EnhancedSemanticSearchEvaluator:
         scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
         return scores
     def rank_results(self, model, query, matches):
         similarity_scores = self.compute_similarity(model, query, matches)
-        for match, score in zip(matches, similarity_scores):
-            match['similarity_score'] = score
         ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
         return ranked_matches
@@ -47,18 +81,8 @@ class EnhancedSemanticSearchEvaluator:
             results_with_scores = []
             for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
                 doc_id = match['id']
-                similarity_score = match['score']
-                if similarity_score >= 0.7:
-                    llm_score = 5
-                elif similarity_score >= 0.5:
-                    llm_score = 4
-                elif similarity_score >= 0.3:
-                    llm_score = 3
-                elif similarity_score >= 0.1:
-                    llm_score = 2
-                else:
-                    llm_score = 1
                 results_with_scores.append({
                     "Rank": rank,

 from pydantic import BaseModel
 from sentence_transformers import SentenceTransformer, util
 import pandas as pd
+import numpy as np
 # Initialize FastAPI app
 app = FastAPI()
     results: dict
 class EnhancedSemanticSearchEvaluator:
+    def __init__(self, top_k=300, relevance_threshold=3):
+        """
+        Set up the evaluator with its embedding models and settings.
+
+        Args:
+            top_k: Stored as self.top_k; the evaluation loop slices the
+                ranked matches to this many entries.
+            relevance_threshold: Stored as self.relevance_threshold.
+                NOTE(review): its consumer is not visible in this diff.
+        """
+        # Three SentenceTransformer checkpoints, keyed by a display name.
         self.models = {
             "Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
             "Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
             "Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
         }
         self.top_k = top_k
+        self.relevance_threshold = relevance_threshold
     def compute_similarity(self, model, query, matches):
         query_embedding = model.encode(query, convert_to_tensor=True)
         scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
         return scores
+    def normalize_scores(self, similarity_scores):
+        """
+        Rescale similarity scores so that the largest one becomes 1.0.
+
+        Each score is divided by the maximum score, which keeps the ranking
+        order intact. Scores below zero stay negative after scaling, so the
+        result is only confined to 0-1 when every input is non-negative.
+
+        Args:
+            similarity_scores: List of numeric similarity scores.
+
+        Returns:
+            A new list with one scaled score per input score. An empty input
+            yields an empty list.
+        """
+        if not similarity_scores:
+            return []
+        max_score = max(similarity_scores)
+        if max_score <= 0:
+            # No positive scale to divide by: a zero maximum would raise
+            # ZeroDivisionError and a negative one would flip the ranking
+            # order, so the scores are returned unscaled.
+            return list(similarity_scores)
+        return [score / max_score for score in similarity_scores]
+    def compute_dynamic_thresholds(self, normalized_scores):
+        """
+        Compute dynamic thresholds based on the score distribution (percentiles).
+
+        Args:
+            normalized_scores: List of numeric scores.
+
+        Returns:
+            A (high, medium, low) tuple holding the 90th, 70th and 50th
+            percentile of the scores. An empty input yields (0.0, 0.0, 0.0).
+        """
+        if len(normalized_scores) == 0:
+            # np.percentile has no meaningful answer for an empty input, so
+            # return neutral thresholds instead of calling it.
+            return 0.0, 0.0, 0.0
+        # One call computes all three percentiles over the same data.
+        high_threshold, medium_threshold, low_threshold = np.percentile(
+            normalized_scores, [90, 70, 50]
+        )
+        return high_threshold, medium_threshold, low_threshold
     def rank_results(self, model, query, matches):
+        """
+        Score, grade and rank the matches for a query.
+
+        Each match dict is updated in place with a 'similarity_score'
+        (the normalized similarity) and an 'llm_score' from 1 to 5.
+
+        Args:
+            model: Embedding model handed to compute_similarity.
+            query: Query handed to compute_similarity.
+            matches: List of match dicts to score.
+
+        Returns:
+            The matches sorted by 'similarity_score', highest first. An
+            empty input yields an empty list.
+        """
+        if not matches:
+            # Nothing to score: skip the encoder and the percentile
+            # computation, neither of which is meaningful without data.
+            return []
         similarity_scores = self.compute_similarity(model, query, matches)
+        normalized_scores = self.normalize_scores(similarity_scores)
+        high_threshold, medium_threshold, low_threshold = self.compute_dynamic_thresholds(normalized_scores)
+        for match, normalized_score in zip(matches, normalized_scores):
+            match['similarity_score'] = normalized_score
+            # Dynamically assign LLM scores based on thresholds
+            if normalized_score >= high_threshold:
+                match['llm_score'] = 5
+            elif normalized_score >= medium_threshold:
+                match['llm_score'] = 4
+            elif normalized_score >= low_threshold:
+                match['llm_score'] = 3
+            elif normalized_score >= 0.1:  # Lowest tier
+                match['llm_score'] = 2
+            else:
+                match['llm_score'] = 1
+        # Rank results by similarity score
         ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
         return ranked_matches
             results_with_scores = []
             for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
                 doc_id = match['id']
+                similarity_score = match['similarity_score']
+                llm_score = match['llm_score']
                 results_with_scores.append({
                     "Rank": rank,