Spaces:
Sleeping
Sleeping
| from sentence_transformers import SentenceTransformer, util | |
| from collections import Counter | |
| from langchain_openai import ChatOpenAI | |
| from langchain_core.messages import HumanMessage, SystemMessage | |
| from google.cloud import secretmanager | |
class SecondaryModelDependencies:
    """Models and helpers that turn one typed answer into a feature list.

    Construction loads a sentence-embedding model and creates two OpenAI chat
    clients (``gpt-3.5-turbo`` and ``gpt-4-turbo``) using an API key read from
    Google Secret Manager, so instantiating this class performs network I/O.
    """

    # System prompt shared by both reference-answer generators.
    _ANSWER_SYSTEM_PROMPT = (
        "Please answer the following question based solely on your internal "
        "knowledge, without external references. Assume you are the human."
    )

    # Letters considered when comparing key clicks with the final text.
    _ALPHABET = "abcdefghijklmnopqrstuvwxyz"

    def __init__(self):
        """Load the embedding model and build both chat clients."""
        self.text_similarity_model = SentenceTransformer(
            'sentence-transformers/all-mpnet-base-v2')
        api_key = self.access_openai_api_key()
        self.llm_gpt35 = ChatOpenAI(
            api_key=api_key, model="gpt-3.5-turbo")
        self.llm_gpt4 = ChatOpenAI(
            api_key=api_key, model="gpt-4-turbo")

    def access_openai_api_key(self) -> str:
        """Return the OpenAI API key stored in Google Secret Manager.

        Reads version 1 of the ``OPENAI_API_KEY`` secret and decodes its
        payload as UTF-8.
        """
        client = secretmanager.SecretManagerServiceClient()
        name = "projects/steady-climate-416810/secrets/OPENAI_API_KEY/versions/1"
        response = client.access_secret_version(request={"name": name})
        return response.payload.data.decode('UTF-8')

    @staticmethod
    def _normalize_by_length(value: float, text: str) -> float:
        """Divide ``value`` by ``len(text)``, treating empty text as length 1.

        Keeps every per-character feature defined for an empty answer instead
        of raising ``ZeroDivisionError``.
        """
        return value / (len(text) or 1)

    def calculate_features(
        self,
        question: str,
        answer: str,
        probability: float,
        backspace_count: int,
        typing_duration: int,
        letter_click_counts: dict[str, int],
    ) -> list[float]:
        """Build the feature list for one question/answer pair.

        Args:
            question: Question sent to both chat models to obtain reference
                answers.
            answer: The answer text being evaluated.
            probability: Passed through unchanged as the first feature.
            backspace_count: Number of backspaces; divided by the answer length.
            typing_duration: Typing duration; divided by the answer length.
            letter_click_counts: Click count per letter, compared with the
                letter frequencies of ``answer``.

        Returns:
            ``[probability, backspace_count_normalized,
            typing_duration_normalized, letter_discrepancy, cosine_sim_gpt35,
            cosine_sim_gpt4]``.
        """
        # An empty answer is normalized as if it had length 1, matching the
        # guard in calculate_letter_discrepancy.
        backspace_count_normalized = self._normalize_by_length(
            backspace_count, answer)
        typing_duration_normalized = self._normalize_by_length(
            typing_duration, answer)
        letter_discrepancy = self.calculate_letter_discrepancy(
            answer, letter_click_counts)

        gpt35_answer = self.generate_gpt35_answer(question)
        gpt4_answer = self.generate_gpt4_answer(question)
        cosine_sim_gpt35 = self.calculate_similarity_gpt35(
            answer, gpt35_answer)
        cosine_sim_gpt4 = self.calculate_similarity_gpt4(answer, gpt4_answer)

        return [
            probability, backspace_count_normalized, typing_duration_normalized,
            letter_discrepancy, cosine_sim_gpt35, cosine_sim_gpt4
        ]

    def calculate_letter_discrepancy(
        self, text: str, letter_click_counts: dict[str, int]
    ) -> float:
        """Return the length-normalized mean click-to-occurrence ratio.

        For each letter a-z, the click count is divided by the number of times
        the letter occurs in the lower-cased ``text`` plus one. The 26 ratios
        are averaged and the mean is divided by ``len(text)`` (or by 1 when
        ``text`` is empty).
        """
        # Counter yields 0 for letters that never occur in the text.
        text_letter_counts = Counter(text.lower())
        # The +1 keeps the ratio defined for letters absent from the text.
        ratios = [
            letter_click_counts.get(letter, 0) / (text_letter_counts[letter] + 1)
            for letter in self._ALPHABET
        ]
        average_ratio = sum(ratios) / len(ratios)
        return self._normalize_by_length(average_ratio, text)

    def _generate_answer(self, llm, question: str) -> str:
        """Ask ``llm`` the question under the shared system prompt."""
        messages = [
            SystemMessage(content=self._ANSWER_SYSTEM_PROMPT),
            HumanMessage(question)
        ]
        return llm.invoke(messages).content

    def generate_gpt35_answer(self, question: str) -> str:
        """Return the ``gpt-3.5-turbo`` client's answer to ``question``."""
        return self._generate_answer(self.llm_gpt35, question)

    def generate_gpt4_answer(self, question: str) -> str:
        """Return the ``gpt-4-turbo`` client's answer to ``question``."""
        return self._generate_answer(self.llm_gpt4, question)

    def _cosine_similarity(self, first_text: str, second_text: str) -> float:
        """Embed both texts and return their cosine similarity as a float."""
        first_embedding = self.text_similarity_model.encode(
            [first_text], convert_to_tensor=True)
        second_embedding = self.text_similarity_model.encode(
            [second_text], convert_to_tensor=True)
        # Each input is a single-item list, so the score holds one value.
        cosine_scores = util.cos_sim(first_embedding, second_embedding)
        return cosine_scores.item()

    def calculate_similarity_gpt35(self, answer: str, gpt35_answer: str) -> float:
        """Return the cosine similarity of ``answer`` and the GPT-3.5 answer."""
        return self._cosine_similarity(answer, gpt35_answer)

    def calculate_similarity_gpt4(self, answer: str, gpt4_answer: str) -> float:
        """Return the cosine similarity of ``answer`` and the GPT-4 answer."""
        return self._cosine_similarity(answer, gpt4_answer)