Spaces:
Sleeping
Sleeping
# --- Environment & third-party setup -------------------------------------
# NOTE: import order is deliberate — the environment tweaks below must run
# before the HTTP-using libraries (`datasets`, `litellm`, ...) are imported.
import os, warnings

from dotenv import load_dotenv
from schemas import *  # project request/response models (SearchRequest, ChatRequest, ...)

# NOTE(review): an empty CURL_CA_BUNDLE disables TLS certificate verification
# for curl/requests-based clients — a security risk, presumably a workaround
# for a broken proxy/CA chain; confirm it is still needed.
os.environ["CURL_CA_BUNDLE"] = ""
warnings.filterwarnings("ignore")  # silence all library warnings globally
load_dotenv()  # pull HF_TOKEN / GEMINI etc. from a local .env file

from datasets import load_dataset  # NOTE(review): unused in this chunk — may be used elsewhere in the file
import bm25s
from bm25s.hf import BM25HF
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import litellm
# Pre-built BM25 index over 3GPP specification sections, downloaded from the
# HuggingFace Hub at startup (requires HF_TOKEN in the environment; raises
# KeyError if it is missing).
bm25_index = BM25HF.load_from_hub(
    "OrganizedProgrammers/3GPPBM25IndexSections",
    load_corpus=True,
    token=os.environ["HF_TOKEN"],
)

app = FastAPI(title="RAGnarok",
              description="Speak with the specifications")

# Serve static assets (JS/CSS/images) from ./static under /static.
app.mount("/static", StaticFiles(directory="static"), name="static")

origins = [
    "*",
]

# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# rejected by browsers per the CORS spec (credentialed requests cannot use a
# wildcard origin) — confirm whether credentials are actually required.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
def main_menu():
    """Serve the landing page (templates/index.html).

    NOTE(review): the FastAPI route decorator (e.g. ``@app.get("/")``)
    appears to have been lost in extraction — confirm against the original.
    """
    index_path = os.path.join("templates", "index.html")
    return FileResponse(index_path)
def search_specifications(req: SearchRequest):
    """Rank 3GPP specifications against a keyword query using BM25.

    Scores every section in the corpus, boosts each score by simple
    query/title word overlap (and a spec-id mention bonus), keeps the
    best-scoring section per specification, min-max normalizes the per-spec
    scores to [0, 1], and returns the specs whose normalized score clears
    ``req.threshold`` percent, best first.

    Fix vs. previous revision: removed the ``spec_indices`` dict and the
    ``rank`` counter — both were populated/produced but never read.

    NOTE(review): the FastAPI route decorator (e.g. ``@app.post(...)``)
    appears to have been lost in extraction — confirm against the original.
    """
    keywords = req.keyword
    threshold = req.threshold
    results_out = []

    query_tokens = bm25s.tokenize(keywords)
    # k = corpus size: score *every* section so the per-spec maxima are exact.
    results, scores = bm25_index.retrieve(query_tokens, k=len(bm25_index.corpus))

    def calculate_boosted_score(metadata, score, query):
        """Boost a BM25 score by query/title word overlap and spec-id mention."""
        title = set(metadata['title'].lower().split())
        q = set(query.lower().split())
        # +0.5 if the spec number itself appears verbatim in the query.
        spec_id_presence = 0.5 if metadata['id'].lower() in q else 0
        # +0.5 per query word that also appears in the spec title.
        booster = len(q & title) * 0.5
        return score + spec_id_presence + booster

    # Keep only the best (boosted) section per specification id.
    spec_scores = {}
    spec_details = {}
    for doc, score in zip(results[0], scores[0]):
        spec = doc["metadata"]["id"]
        boosted_score = calculate_boosted_score(doc['metadata'], score, keywords)
        if spec not in spec_scores or boosted_score > spec_scores[spec]:
            spec_scores[spec] = boosted_score
            spec_details[spec] = {
                'original_score': score,
                'boosted_score': boosted_score,
                'doc': doc
            }

    def normalize_scores(scores_dict):
        """Min-max scale the dict's values to [0, 1], preserving keys.

        NOTE(review): with exactly one spec, MinMaxScaler maps its score to
        0.0, so a lone result never clears a non-zero threshold — confirm
        this is the intended behavior.
        """
        if not scores_dict:
            return {}
        scores_array = np.array(list(scores_dict.values())).reshape(-1, 1)
        normalized = MinMaxScaler().fit_transform(scores_array).flatten()
        return dict(zip(scores_dict.keys(), normalized))

    normalized_scores = normalize_scores(spec_scores)
    for spec in spec_details:
        spec_details[spec]["normalized_score"] = normalized_scores[spec]

    # Emit specs best-first; stopping at the first one below the threshold is
    # valid because the list is sorted by normalized score, descending.
    unique_specs = sorted(normalized_scores.keys(),
                          key=lambda x: normalized_scores[x], reverse=True)
    for spec in unique_specs:
        details = spec_details[spec]
        metadata = details['doc']['metadata']
        if details['normalized_score'] < threshold / 100:
            break
        results_out.append({
            'id': metadata['id'],
            'title': metadata['title'],
            'section': metadata['section_title'],
            'content': details['doc']['text'],
            'similarity': int(details['normalized_score'] * 100)
        })
    return SearchResponse(results=results_out)
def questions_the_sources(req: ChatRequest):
    """Forward the chat history to a Gemini model via litellm; return its reply.

    Requires the ``GEMINI`` API key in the environment (raises KeyError if
    absent).

    NOTE(review): the FastAPI route decorator (e.g. ``@app.post(...)``)
    appears to have been lost in extraction — confirm against the original.
    """
    model = req.model
    completion = litellm.completion(
        model=f"gemini/{model}",
        messages=req.messages,
        api_key=os.environ["GEMINI"],
    )
    answer = completion.choices[0].message.content
    return ChatResponse(response=answer)