Spaces:

ganesh3
/

rag-youtube-assistant

Sleeping

App Files Files Community

ganesh3 committed on Oct 29, 2024

Commit

9da39b7

verified ·

1 Parent(s): e2cbf8c

Update app/evaluation.py

Browse files

Files changed (1) hide show

app/evaluation.py +108 -103

app/evaluation.py CHANGED Viewed

@@ -2,16 +2,46 @@ from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 import pandas as pd
 import json
-import ollama
 import requests
-import sqlite3
 from tqdm import tqdm
 import csv
 class EvaluationSystem:
     def __init__(self, data_processor, database_handler):
         self.data_processor = data_processor
         self.db_handler = database_handler
     def relevance_scoring(self, query, retrieved_docs, top_k=5):
         query_embedding = self.data_processor.embedding_model.encode(query)
@@ -35,44 +65,31 @@ class EvaluationSystem:
             result = cursor.fetchone()
             return result[0] if result[0] is not None else 0
-    def evaluate_rag_performance(self, rag_system, test_queries, reference_answers, index_name):
-        relevance_scores = []
-        similarity_scores = []
-        human_scores = []
-        for query, reference in zip(test_queries, reference_answers):
-            retrieved_docs = rag_system.data_processor.search(query, num_results=5, method='hybrid', index_name=index_name)
-            generated_answer, _ = rag_system.query(query, search_method='hybrid', index_name=index_name)
-            relevance_scores.append(self.relevance_scoring(query, retrieved_docs))
-            similarity_scores.append(self.answer_similarity(generated_answer, reference))
-            human_scores.append(self.human_evaluation(index_name, query))
-        return {
-            "avg_relevance_score": np.mean(relevance_scores),
-            "avg_similarity_score": np.mean(similarity_scores),
-            "avg_human_score": np.mean(human_scores)
-        }
     def llm_as_judge(self, question, generated_answer, prompt_template):
         """Ask the ``phi3.5`` model (through ``ollama``) to grade an answer.

         ``prompt_template`` is filled via ``str.format`` with the keys
         ``question`` and ``answer_llm``; the reply's message content is
         parsed as JSON.

         Returns:
             The parsed JSON value, or ``None`` when the chat call or the
             JSON parsing raises (the error is printed).
         """
         filled_prompt = prompt_template.format(question=question, answer_llm=generated_answer)
         chat_messages = [{"role": "user", "content": filled_prompt}]
         try:
             reply = ollama.chat(model='phi3.5', messages=chat_messages)
             return json.loads(reply['message']['content'])
         except Exception as e:
             print(f"Error in LLM evaluation: {str(e)}")
             return None
     def evaluate_rag(self, rag_system, ground_truth_file, prompt_template=None):
         try:
             ground_truth = pd.read_csv(ground_truth_file)
         except FileNotFoundError:
-            print("Ground truth file not found. Please generate ground truth data first.")
             return None
         evaluations = []
@@ -84,13 +101,13 @@ class EvaluationSystem:
             index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)
             if not index_name:
-                print(f"No index found for video {video_id}. Skipping this question.")
                 continue
             try:
                 answer_llm, _ = rag_system.query(question, search_method='hybrid', index_name=index_name)
             except ValueError as e:
-                print(f"Error querying RAG system: {str(e)}")
                 continue
             if prompt_template:
@@ -114,79 +131,25 @@ class EvaluationSystem:
                 })
         # Save evaluations to CSV
-        csv_path = 'data/evaluation_results.csv'
-        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
-            fieldnames = ['video_id', 'question', 'answer', 'relevance', 'explanation']
-            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-            writer.writeheader()
-            for eval_data in evaluations:
-                writer.writerow(eval_data)
-        print(f"Evaluation results saved to {csv_path}")
-        # Save evaluations to database
-        self.save_evaluations_to_db(evaluations)
         return evaluations
     def save_evaluations_to_db(self, evaluations):
-        with sqlite3.connect(self.db_handler.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-            CREATE TABLE IF NOT EXISTS rag_evaluations (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                video_id TEXT,
-                question TEXT,
-                answer TEXT,
-                relevance TEXT,
-                explanation TEXT
-            )
-            ''')
-            for eval_data in evaluations:
-                cursor.execute('''
-                INSERT INTO rag_evaluations (video_id, question, answer, relevance, explanation)
-                VALUES (?, ?, ?, ?, ?)
-                ''', (eval_data['video_id'], eval_data['question'], eval_data['answer'],
-                      eval_data['relevance'], eval_data['explanation']))
-            conn.commit()
-        print("Evaluation results saved to database")
-    def run_full_evaluation(self, rag_system, ground_truth_file, prompt_template=None):
-        # Load ground truth
-        ground_truth = pd.read_csv(ground_truth_file)
-        # Evaluate RAG
-        rag_evaluations = self.evaluate_rag(rag_system, ground_truth_file, prompt_template)
-        # Evaluate search performance
-        def search_function(query, video_id):
-            index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)
-            if index_name:
-                return rag_system.data_processor.search(query, num_results=10, method='hybrid', index_name=index_name)
-            return []
-        search_performance = self.evaluate_search(ground_truth, search_function)
-        # Optimize search parameters
-        param_ranges = {'content': (0.0, 3.0)}  # Example parameter range
-        def objective_function(params):
-            def parameterized_search(query, video_id):
-                index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)
-                if index_name:
-                    return rag_system.data_processor.search(query, num_results=10, method='hybrid', index_name=index_name, boost_dict=params)
-                return []
-            return self.evaluate_search(ground_truth, parameterized_search)['mrr']
-        best_params, best_score = self.simple_optimize(param_ranges, objective_function)
-        return {
-            "rag_evaluations": rag_evaluations,
-            "search_performance": search_performance,
-            "best_params": best_params,
-            "best_score": best_score
-        }
     def hit_rate(self, relevance_total):
         """Return the fraction of queries with at least one relevant hit.

         Args:
             relevance_total: Sequence with one entry per query; each entry
                 is an iterable of truthy/falsy relevance flags for that
                 query's results.

         Returns:
             Share of entries containing at least one truthy flag, or
             ``0.0`` when ``relevance_total`` is empty (instead of raising
             ``ZeroDivisionError``).
         """
         query_count = len(relevance_total)
         if query_count == 0:
             return 0.0
         return sum(any(line) for line in relevance_total) / query_count
@@ -207,7 +170,7 @@ class EvaluationSystem:
         best_score = float('-inf')
         for _ in range(n_iterations):
             current_params = {param: np.random.uniform(min_val, max_val)
-                              for param, (min_val, max_val) in param_ranges.items()}
             current_score = objective_function(current_params)
             if current_score > best_score:
                 best_score = current_score
@@ -224,4 +187,46 @@ class EvaluationSystem:
         return {
             'hit_rate': self.hit_rate(relevance_total),
             'mrr': self.mrr(relevance_total),
         }

 import numpy as np
 import pandas as pd
 import json
 import requests
 from tqdm import tqdm
 import csv
+import logging
+import sys
+from transformers import pipeline
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    stream=sys.stdout
+)
+logger = logging.getLogger(__name__)
 class EvaluationSystem:
     def __init__(self, data_processor, database_handler):
         """Store collaborators and load the local judge model.

         Args:
             data_processor: Object whose ``embedding_model.encode`` is
                 called by ``relevance_scoring`` in this class.
             database_handler: Object used by this class to resolve index
                 names and to persist evaluation records.
         """
         self.data_processor = data_processor
         self.db_handler = database_handler
         # Initialize the model.
         # FLAN-T5 is an encoder-decoder (seq2seq) checkpoint, so it has to be
         # loaded through the "text2text-generation" task; the causal-LM
         # "text-generation" task cannot load a T5 architecture. The output
         # items still carry the 'generated_text' key.
         self.model = pipeline(
             "text2text-generation",
             model="google/flan-t5-base",
             device=-1  # Use CPU
         )
         logger.info("Initialized evaluation system with flan-t5-base model")
+    def generate_llm_response(self, prompt):
+        """Generate response using Hugging Face model"""
+        try:
+            response = self.model(
+                prompt,
+                max_length=512,
+                min_length=64,
+                num_return_sequences=1
+            )[0]['generated_text']
+            return response
+        except Exception as e:
+            logger.error(f"Error generating response: {str(e)}")
+            return None
     def relevance_scoring(self, query, retrieved_docs, top_k=5):
         query_embedding = self.data_processor.embedding_model.encode(query)
             result = cursor.fetchone()
             return result[0] if result[0] is not None else 0
     def llm_as_judge(self, question, generated_answer, prompt_template):
+        prompt = prompt_template.format(
+            question=question,
+            answer_llm=generated_answer
+        )
         try:
+            response = self.generate_llm_response(prompt)
+            if response:
+                # Try to parse JSON response
+                try:
+                    evaluation = json.loads(response)
+                    return evaluation
+                except json.JSONDecodeError:
+                    logger.error("Failed to parse LLM response as JSON")
+                    return None
+            return None
         except Exception as e:
+            logger.error(f"Error in LLM evaluation: {str(e)}")
             return None
     def evaluate_rag(self, rag_system, ground_truth_file, prompt_template=None):
         try:
             ground_truth = pd.read_csv(ground_truth_file)
         except FileNotFoundError:
+            logger.error("Ground truth file not found. Please generate ground truth data first.")
             return None
         evaluations = []
             index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)
             if not index_name:
+                logger.warning(f"No index found for video {video_id}. Skipping this question.")
                 continue
             try:
                 answer_llm, _ = rag_system.query(question, search_method='hybrid', index_name=index_name)
             except ValueError as e:
+                logger.error(f"Error querying RAG system: {str(e)}")
                 continue
             if prompt_template:
                 })
         # Save evaluations to CSV
+        if evaluations:
+            csv_path = 'data/evaluation_results.csv'
+            with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
+                fieldnames = ['video_id', 'question', 'answer', 'relevance', 'explanation']
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                writer.writeheader()
+                for eval_data in evaluations:
+                    writer.writerow(eval_data)
+            logger.info(f"Evaluation results saved to {csv_path}")
+            # Save evaluations to database
+            self.save_evaluations_to_db(evaluations)
         return evaluations
     def save_evaluations_to_db(self, evaluations):
         """Pass each evaluation record to the database handler for storage.

         Args:
             evaluations: Iterable of evaluation records; each one is given
                 unchanged to ``self.db_handler.save_rag_evaluation``.
         """
         for record in evaluations:
             self.db_handler.save_rag_evaluation(record)

         logger.info("Evaluation results saved to database")
     def hit_rate(self, relevance_total):
         """Return the fraction of queries with at least one relevant hit.

         Args:
             relevance_total: Sequence with one entry per query; each entry
                 is an iterable of truthy/falsy relevance flags for that
                 query's results.

         Returns:
             Share of entries containing at least one truthy flag, or
             ``0.0`` when ``relevance_total`` is empty (instead of raising
             ``ZeroDivisionError``).
         """
         query_count = len(relevance_total)
         if query_count == 0:
             return 0.0
         return sum(any(line) for line in relevance_total) / query_count
         best_score = float('-inf')
         for _ in range(n_iterations):
             current_params = {param: np.random.uniform(min_val, max_val)
+                            for param, (min_val, max_val) in param_ranges.items()}
             current_score = objective_function(current_params)
             if current_score > best_score:
                 best_score = current_score
         return {
             'hit_rate': self.hit_rate(relevance_total),
             'mrr': self.mrr(relevance_total),
+        }
+    def run_full_evaluation(self, rag_system, ground_truth_file, prompt_template=None):
+        # Load ground truth
+        ground_truth = pd.read_csv(ground_truth_file)
+        # Evaluate RAG
+        rag_evaluations = self.evaluate_rag(rag_system, ground_truth_file, prompt_template)
+        # Evaluate search performance
+        def search_function(query, video_id):
+            index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)
+            if index_name:
+                return rag_system.data_processor.search(query, num_results=10, method='hybrid', index_name=index_name)
+            return []
+        search_performance = self.evaluate_search(ground_truth, search_function)
+        # Optimize search parameters
+        param_ranges = {'content': (0.0, 3.0)}  # Example parameter range
+        def objective_function(params):
+            def parameterized_search(query, video_id):
+                index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)
+                if index_name:
+                    return rag_system.data_processor.search(
+                        query,
+                        num_results=10,
+                        method='hybrid',
+                        index_name=index_name,
+                        boost_dict=params
+                    )
+                return []
+            return self.evaluate_search(ground_truth, parameterized_search)['mrr']
+        best_params, best_score = self.simple_optimize(param_ranges, objective_function)
+        return {
+            "rag_evaluations": rag_evaluations,
+            "search_performance": search_performance,
+            "best_params": best_params,
+            "best_score": best_score
         }