from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

# Initialize FastAPI app
app = FastAPI()

# Define request model
class QueryRequest(BaseModel):
    query: str
    results: dict
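
# A minimal sketch of the request body this service appears to expect, inferred
# from how the evaluator reads `results` below (the field values are illustrative
# assumptions, not a documented schema):
#
# {
#     "query": "how do transformers work",
#     "results": {
#         "matches": [
#             {"id": "doc-1", "metadata": "Transformers use self-attention ..."},
#             {"id": "doc-2", "metadata": "Recurrent networks process tokens ..."}
#         ]
#     }
# }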

class EnhancedSemanticSearchEvaluator:
    def __init__(self, top_k=300, relevance_threshold=3):
        self.models = {
            "Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
            "Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
            "Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
        }
        self.top_k = top_k
        self.relevance_threshold = relevance_threshold

    def compute_similarity(self, model, query, matches):
        """
        Embed the query and candidate texts, then return cosine similarities.
        Each match's 'metadata' field is expected to hold the text to embed.
        """
        query_embedding = model.encode(query, convert_to_tensor=True)
        match_embeddings = model.encode(
            [match['metadata'] for match in matches], convert_to_tensor=True
        )
        scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
        return scores

    def normalize_scores(self, similarity_scores):
        """
        Normalize similarity scores to a 0-1 range for consistent scaling.
        """
        max_score = max(similarity_scores) if similarity_scores else 1
        normalized_scores = [score / max_score for score in similarity_scores]
        return normalized_scores

    def compute_dynamic_thresholds(self, normalized_scores):
        """
        Compute dynamic thresholds based on the score distribution (percentiles).
        """
        high_threshold = np.percentile(normalized_scores, 90)
        medium_threshold = np.percentile(normalized_scores, 70)
        low_threshold = np.percentile(normalized_scores, 50)
        return high_threshold, medium_threshold, low_threshold
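
    # Illustrative example (not executed by the app) of the percentile thresholds,
    # using numpy's default linear interpolation on a sample of normalized scores:
    #   scores = [0.25, 0.40, 0.55, 0.70, 0.85, 0.92, 1.00]
    #   np.percentile(scores, 90)  -> ~0.95  (high_threshold)
    #   np.percentile(scores, 70)  -> ~0.86  (medium_threshold)
    #   np.percentile(scores, 50)  ->  0.70  (low_threshold)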

    def rank_results(self, model, query, matches):
        """Score matches against the query, bucket them into LLM scores, and rank them."""
        similarity_scores = self.compute_similarity(model, query, matches)
        normalized_scores = self.normalize_scores(similarity_scores)
        high_threshold, medium_threshold, low_threshold = self.compute_dynamic_thresholds(normalized_scores)

        for match, normalized_score in zip(matches, normalized_scores):
            match['similarity_score'] = normalized_score
            # Dynamically assign LLM scores based on thresholds
            if normalized_score >= high_threshold:
                match['llm_score'] = 5
            elif normalized_score >= medium_threshold:
                match['llm_score'] = 4
            elif normalized_score >= low_threshold:
                match['llm_score'] = 3
            elif normalized_score >= 0.1:  # Lowest tier
                match['llm_score'] = 2
            else:
                match['llm_score'] = 1

        # Rank results by similarity score
        ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
        return ranked_matches

    def evaluate_results(self, query, results):
        """Run every model over the results, compute metrics, and report pass/fail per model."""
        all_metrics = {}
        results_status = {}
        for model_name, model in self.models.items():
            ranked_matches = self.rank_results(model, query, results['matches'])
            results_with_scores = []
            for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
                results_with_scores.append({
                    "Rank": rank,
                    "Document ID": match['id'],
                    "Similarity Score": match['similarity_score'],
                    "LLM Score": match['llm_score']
                })
            results_df = pd.DataFrame(results_with_scores)
            results_df['Pass'] = results_df['LLM Score'] >= self.relevance_threshold
            pass_rate = results_df['Pass'].mean()

            # Precision/recall over the top-k slice; reused below for F1.
            precision_at_k = results_df.head(self.top_k)['Pass'].mean()
            recall_at_k = results_df.head(self.top_k)['Pass'].sum() / max(results_df['Pass'].sum(), 1)
            f1_at_k = (
                2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k)
                if (precision_at_k + recall_at_k) > 0 else 0
            )
            metrics = {
                "Pass Rate": pass_rate,
                "Precision@K": precision_at_k,
                "Recall@K": recall_at_k,
                "F1@K": f1_at_k
            }
            # Per-model metrics are collected here; only the pass/fail status is returned.
            all_metrics[model_name] = metrics
            results_status[model_name] = "Test Passed" if pass_rate > 0.5 else "Test Failed"
        return results_status
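
# Worked example of the pass/fail rule above (illustrative numbers only):
# with relevance_threshold=3, a document "passes" when its LLM score is 3, 4, or 5.
# If 7 of the 10 returned documents pass, pass_rate = 7 / 10 = 0.7 > 0.5, so the
# model is reported as "Test Passed"; with 4 of 10, pass_rate = 0.4 and it fails.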

evaluator = EnhancedSemanticSearchEvaluator()

@app.post("/evaluate")  # route path is assumed; adjust to match your deployment
async def evaluate(request: QueryRequest):
    try:
        query = request.query
        results = request.results
        evaluation_result = evaluator.evaluate_results(query, results)
        return evaluation_result
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
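
# A minimal sketch of how the service could be exercised locally, assuming this
# file is saved as app.py and uvicorn is installed (the /evaluate path matches
# the route assumed above):
#
#   uvicorn app:app --host 0.0.0.0 --port 8000
#
#   import requests
#   payload = {
#       "query": "how do transformers work",
#       "results": {"matches": [
#           {"id": "doc-1", "metadata": "Transformers use self-attention ..."},
#           {"id": "doc-2", "metadata": "Recurrent networks process tokens ..."}
#       ]}
#   }
#   print(requests.post("http://127.0.0.1:8000/evaluate", json=payload).json())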