Spaces:

Sa-m
/

llm-evaluation-framework

Running

App Files Files Community

Sa-m committed on Aug 15

Commit

96b1e9b

verified ·

1 Parent(s): 79e1931

Create app.py

Browse files

Files changed (1) hide show

app.py +715 -0

app.py ADDED Viewed

	@@ -0,0 +1,715 @@

import asyncio
import logging
import os
import re
import tempfile
import unicodedata
from pathlib import Path
from typing import Dict, List, Tuple

import ftfy
import google.generativeai as genai
import gradio as gr
import nltk
import numpy as np
import pandas as pd
from bert_score import score as bert_score
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, GEval
from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
# --- Module start-up -------------------------------------------------------
# NLTK data packages are fetched at import time; quiet=True suppresses the
# downloader's console output.
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

# Root logging at INFO, plus a module-scoped logger used throughout this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# API keys live in module globals and start unset; process_single_text()
# overwrites them with the values supplied through the UI.
# (In production, prefer environment variables.)
GEMINI_API_KEY = None
CONFIDENT_API_KEY = None
class LLMProvider:
    """Common interface for the LLM back ends defined in this module.

    The base class only stores the model handle; both ``generate`` and
    ``get_model_name`` raise ``NotImplementedError`` until a subclass
    overrides them.
    """

    def __init__(self, model):
        # Back-end specific model object, kept exactly as passed in.
        self.model = model

    def generate(self, prompt: str) -> str:
        """Return the completion for *prompt*; subclasses must override."""
        raise NotImplementedError

    def get_model_name(self) -> str:
        """Return the model identifier; subclasses must override."""
        raise NotImplementedError
class GeminiProvider(LLMProvider):
    """Provider that talks to Gemini through ``google.generativeai``."""

    def __init__(self, model_name="gemini-1.5-flash"):
        # The SDK is configured with whatever key the module-level
        # GEMINI_API_KEY holds at construction time.
        self.model_name = model_name
        genai.configure(api_key=GEMINI_API_KEY)
        self.model = genai.GenerativeModel(model_name)

    def generate(self, prompt: str) -> str:
        """Return the stripped model reply, or an ``"Error: ..."`` string.

        Any exception raised while generating is logged and converted into
        the returned error string instead of propagating.
        """
        try:
            reply = self.model.generate_content(prompt)
            return reply.text.strip()
        except Exception as exc:
            logger.error(f"Error generating content with Gemini: {exc}")
            return f"Error: {str(exc)}"

    def get_model_name(self) -> str:
        """Return the model name this provider was created with."""
        return self.model_name
class GroqProvider(LLMProvider):
    """Stub provider for Groq; no real back end is wired up yet."""

    def __init__(self, model_name="llama3-70b-8192"):
        # Only the name is recorded; a real client would be created here.
        self.model_name = model_name

    def generate(self, prompt: str) -> str:
        """Return a fixed notice, since no Groq client exists."""
        return "Groq implementation not available"

    def get_model_name(self) -> str:
        """Return the model name this provider was created with."""
        return self.model_name
class GeminiLLM(DeepEvalBaseLLM):
    """Adapter that lets DeepEval metrics use a Gemini model as judge.

    Args:
        model: An object exposing ``generate_content(prompt)`` whose result
            has a ``.text`` attribute (in this file, a
            ``genai.GenerativeModel``).
    """

    def __init__(self, model):
        self.model = model

    def load_model(self):
        """Return the wrapped model object."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Return the stripped text of the model's reply to *prompt*."""
        return self.model.generate_content(prompt).text.strip()

    async def a_generate(self, prompt: str) -> str:
        """Async variant of :meth:`generate`.

        ``generate_content`` is a blocking call, so it is run in the event
        loop's default executor rather than directly inside the coroutine,
        where it would stall every other task on the loop.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.generate, prompt)

    def get_model_name(self) -> str:
        """Return the wrapped model's own name when it exposes one.

        Falls back to the previous hard-coded ``"gemini-pro"`` label for
        model objects without a ``model_name`` attribute.
        """
        return getattr(self.model, "model_name", None) or "gemini-pro"
def clean_text(text: str) -> str:
    """
    Clean text by fixing encoding artifacts and normalizing characters.

    Steps, in order: ftfy repair, NFKD normalization, removal of combining
    marks, smart-quote replacement, replacement of remaining non-ASCII runs
    with a space, and whitespace collapsing.

    Args:
        text (str): Input text to clean
    Returns:
        str: Cleaned ASCII text; "" when *text* is empty or not a string
    """
    if not text or not isinstance(text, str):
        return ""
    # Fix common encoding artifacts
    text = ftfy.fix_text(text)
    text = unicodedata.normalize('NFKD', text)
    # NFKD splits accented letters into base letter + combining mark. Drop
    # the marks here; otherwise the non-ASCII pass below turns each mark
    # into a space and breaks words apart ("résumé" -> "re sume").
    text = ''.join(ch for ch in text if not unicodedata.combining(ch))
    # Replace smart quotes with standard ASCII quotes
    text = text.replace('\u201c', '"').replace('\u201d', '"')
    text = text.replace('\u2018', "'").replace('\u2019', "'")
    # Replace any remaining non-ASCII runs with a single space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Normalize whitespace
    return ' '.join(text.split())
def create_prompts() -> Dict[str, str]:
    """
    Build the prompt variants that are compared against each other.

    Each template contains a single ``{input_text}`` placeholder meant to be
    filled with ``str.format``.

    Returns:
        Dict[str, str]: Prompt name -> prompt template, in evaluation order
    """
    narrative_architect = """Role: Strategic Narrative Architect
You are a professional content writer who transforms raw text into engaging, well-structured narratives.
Your goal is to rewrite the following text while preserving all key facts and statistics, but enhancing:
- Structure and flow
- Engagement and readability
- Professional tone
- Strategic storytelling
Guidelines:
1. Maintain all factual information and numerical data
2. Improve sentence structure for better readability
3. Use active voice where appropriate
4. Ensure professional tone suitable for publication
5. Add logical transitions between ideas
6. Keep the length similar to the original
Rewrite the following text:
{input_text}"""

    precision_storyteller = """Role: Precision Storyteller
You are a meticulous editor who ensures factual accuracy and clarity in all content.
Your goal is to rewrite the following text with maximum precision while maintaining:
- Factual accuracy above all
- Clarity and conciseness
- Proper grammar and punctuation
- Consistent terminology
Guidelines:
1. Preserve every fact, statistic, and detail from the original
2. Correct any grammatical errors or awkward phrasing
3. Use precise, unambiguous language
4. Avoid embellishment or subjective interpretation
5. Maintain neutral, professional tone
6. Ensure all claims are supported by the original text
Rewrite the following text:
{input_text}"""

    return {
        "Strategic Narrative Architect": narrative_architect,
        "Precision Storyteller": precision_storyteller,
    }
def evaluate_text(input_text: str, candidate_text: str, reference_text: str,
                  judge_model) -> Dict[str, float]:
    """
    Evaluate the quality of a rewritten text using multiple metrics.

    Every metric is computed in isolation: a failure is logged and replaced
    by that metric's default, without touching scores already computed.
    Overlap metrics (BLEU, ROUGE, METEOR, BERTScore) default to 0.0 and
    LLM-as-judge metrics (AnswerRelevancy, Faithfulness, GEval) to 0.5.

    Args:
        input_text (str): Original raw input text
        candidate_text (str): Generated candidate text
        reference_text (str): Cleaned reference text
        judge_model: Model for LLM-as-judge metrics
    Returns:
        Dict[str, float]: Dictionary of metric scores, always containing the
        seven keys named above
    """
    overlap_metrics = ("BLEU", "ROUGE", "METEOR", "BERTScore")
    judge_metrics = ("AnswerRelevancy", "Faithfulness", "GEval")
    results: Dict[str, float] = {}

    # Whitespace tokenization feeds BLEU and METEOR. If the inputs are not
    # strings nothing can be scored, so report zeros across the board.
    try:
        reference_tokens = reference_text.split()
        candidate_tokens = candidate_text.split()
    except Exception as e:
        logger.error(f"Error in evaluation: {e}")
        return {name: 0.0 for name in overlap_metrics + judge_metrics}

    def record(name, compute, default, label):
        """Store compute() under *name*, or *default* if it raises."""
        try:
            results[name] = compute()
        except Exception as e:
            logger.warning(f"{label} failed: {e}")
            results[name] = default

    def bleu():
        return sentence_bleu([reference_tokens], candidate_tokens,
                             smoothing_function=SmoothingFunction().method4)

    def rouge():
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],
                                          use_stemmer=True)
        scored = scorer.score(reference_text, candidate_text)
        # Average of ROUGE-1, ROUGE-2, and ROUGE-L F1 scores
        return (scored['rouge1'].fmeasure +
                scored['rouge2'].fmeasure +
                scored['rougeL'].fmeasure) / 3

    def meteor():
        return meteor_score([reference_tokens], candidate_tokens)

    def bertscore():
        _, _, f1 = bert_score([candidate_text], [reference_text],
                              lang="en", verbose=False)
        return f1.item()

    record("BLEU", bleu, 0.0, "BLEU calculation")
    record("ROUGE", rouge, 0.0, "ROUGE calculation")
    record("METEOR", meteor, 0.0, "METEOR calculation")
    record("BERTScore", bertscore, 0.0, "BERTScore calculation")

    # LLM-as-judge metrics all share one test case.
    try:
        test_case = LLMTestCase(
            input=input_text,
            actual_output=candidate_text,
            expected_output=reference_text,
            retrieval_context=[reference_text]
        )
    except Exception as e:
        logger.warning(f"LLM-as-judge metrics failed: {e}")
        for name in judge_metrics:
            results[name] = 0.5
        return results

    def judged(build_metric):
        """Return a callable that builds a metric, measures it and yields its score."""
        def run():
            metric = build_metric()
            metric.measure(test_case)
            if metric.score is None:
                # Treat a missing score as a failure so callers never see None.
                raise ValueError("metric returned no score")
            return metric.score
        return run

    def build_geval():
        return GEval(
            name="OverallQuality",
            criteria="Evaluate if the candidate response is accurate, complete, and well-written.",
            # DeepEval documents enum members here, not plain strings.
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
                LLMTestCaseParams.EXPECTED_OUTPUT
            ],
            model=judge_model,
            strict_mode=False
        )

    record("AnswerRelevancy",
           judged(lambda: AnswerRelevancyMetric(model=judge_model)),
           0.5, "LLM-as-judge metric AnswerRelevancy")
    record("Faithfulness",
           judged(lambda: FaithfulnessMetric(model=judge_model)),
           0.5, "LLM-as-judge metric Faithfulness")
    record("GEval", judged(build_geval), 0.5, "LLM-as-judge metric GEval")
    return results
def normalize_score(metric: str, value: float) -> float:
    """
    Rescale a raw metric value onto [0, 1] using a per-metric range.

    Values for unknown metrics, and values that are not ``int``/``float``,
    are handed back unchanged.

    Args:
        metric (str): Name of the metric
        value (float): Raw score value
    Returns:
        float: Score rescaled and clamped to [0, 1] (or *value* untouched,
        as described above)
    """
    # (low, high) treated as the usable span of each metric.
    bounds = {
        "AnswerRelevancy": (0.0, 1.0),
        "Faithfulness": (0.0, 1.0),
        "GEval": (0.0, 1.0),
        "BERTScore": (0.7, 0.95),
        "ROUGE": (0.0, 0.6),
        "BLEU": (0.0, 0.4),
        "METEOR": (0.0, 0.6)
    }
    if metric not in bounds:
        return value
    if not isinstance(value, (int, float)):
        return value

    low, high = bounds[metric]
    if high <= low:
        # Degenerate span: fall back to the midpoint.
        return 0.5

    scaled = (value - low) / (high - low)
    capped = min(scaled, 1.0)
    return max(0.0, capped)
def calculate_weighted_score(scores: Dict[str, float]) -> float:
    """
    Calculate weighted average of normalized scores.

    Only metrics that have a weight and a numeric (``int``/``float``) score
    take part. Missing, ``None`` or otherwise non-numeric scores are skipped
    instead of raising, and the result is divided by the sum of the weights
    actually used, so it stays on the normalized 0-1 scale.

    Args:
        scores (Dict[str, float]): Dictionary of metric scores
    Returns:
        float: Weighted average score, or 0.0 when no weighted metric has a
        usable score
    """
    # Define weights for each metric
    weights = {
        "AnswerRelevancy": 0.10,
        "Faithfulness": 0.10,
        "GEval": 0.025,
        "BERTScore": 0.20,
        "ROUGE": 0.15,
        "BLEU": 0.025,
        "METEOR": 0.15
    }
    total_weight = 0
    weighted_sum = 0
    for metric, weight in weights.items():
        value = scores.get(metric)
        if not isinstance(value, (int, float)):
            # Covers absent metrics as well as None / non-numeric scores.
            continue
        weighted_sum += normalize_score(metric, value) * weight
        total_weight += weight
    return weighted_sum / total_weight if total_weight > 0 else 0.0
def _interpret_weighted_score(weighted_avg: float) -> str:
    """Map a weighted score to its letter-grade interpretation string."""
    grade_bands = (
        (0.85, "Outstanding performance (A) - ready for professional use"),
        (0.70, "Strong performance (B) - good quality with minor improvements"),
        (0.50, "Adequate performance (C) - usable but needs refinement"),
        (0.30, "Weak performance (D) - requires significant revision"),
    )
    for threshold, label in grade_bands:
        if weighted_avg >= threshold:
            return label
    return "Poor performance (F) - likely needs complete rewriting"


def _preview(text: str, limit: int = 500) -> str:
    """Return *text* cut to *limit* characters, with "..." appended if cut."""
    return text[:limit] + "..." if len(text) > limit else text


def process_single_text(input_text: str, gemini_api_key: str,
                       confident_api_key: str, progress=gr.Progress()) -> Tuple[Dict, List[Dict]]:
    """
    Process a single text input and return evaluation results.

    The cleaned input is used as the reference. Each prompt from
    ``create_prompts()`` is sent to Gemini, and the cleaned reply is scored
    with ``evaluate_text``. A prompt whose generation or scoring fails is
    logged and skipped, so the remaining prompts are still reported.

    Args:
        input_text (str): Input text to evaluate
        gemini_api_key (str): Gemini API key
        confident_api_key (str): Confident API key for DeepEval (only stored
            in the module-level ``CONFIDENT_API_KEY`` by this function)
        progress: Gradio progress tracker
    Returns:
        Tuple[Dict, List[Dict]]: Summary results and detailed results for each prompt.
        On failure the summary is ``{"error": <message>}`` and the list is empty.
    """
    global GEMINI_API_KEY, CONFIDENT_API_KEY
    # Set API keys
    GEMINI_API_KEY = gemini_api_key
    CONFIDENT_API_KEY = confident_api_key
    if not input_text or not input_text.strip():
        return {"error": "Please provide valid input text"}, []
    try:
        # Clean the input text to create reference
        progress(0.1, "Cleaning input text...")
        reference_text = clean_text(input_text)
        if not reference_text:
            return {"error": "Could not process the input text"}, []
        # Initialize Gemini model
        progress(0.2, "Initializing Gemini model...")
        try:
            genai.configure(api_key=GEMINI_API_KEY)
            gemini_model = genai.GenerativeModel("gemini-1.5-flash")
            judge = GeminiLLM(gemini_model)
        except Exception as e:
            return {"error": f"Failed to initialize Gemini: {str(e)}"}, []
        # Get prompts
        progress(0.3, "Generating candidate texts...")
        prompts = create_prompts()
        detailed_results = []
        last_error = None
        # Process each prompt; progress moves from 0.3 towards 0.9.
        for index, (prompt_name, prompt_template) in enumerate(prompts.items()):
            progress(0.3 + 0.6 * (index / len(prompts)),
                     f"Processing {prompt_name}...")
            try:
                # Generate candidate
                full_prompt = prompt_template.format(input_text=input_text)
                candidate_text = gemini_model.generate_content(full_prompt).text.strip()
                # Clean candidate text
                cleaned_candidate = clean_text(candidate_text)
                # Evaluate
                scores = evaluate_text(input_text, cleaned_candidate, reference_text, judge)
                # Calculate hybrid scores
                hybrid_avg = np.mean(list(scores.values()))
                weighted_avg = calculate_weighted_score(scores)
            except Exception as e:
                # Keep going: one bad prompt must not discard the others.
                logger.error(f"Error processing prompt {prompt_name}: {e}")
                last_error = e
                continue
            detailed_results.append({
                "Prompt": prompt_name,
                "Original Input": _preview(input_text),
                "Reference Text": _preview(reference_text),
                "Candidate Text": cleaned_candidate,
                "Scores": scores,
                "Hybrid Average": hybrid_avg,
                "Weighted Average": weighted_avg,
                "Interpretation": _interpret_weighted_score(weighted_avg)
            })
        if not detailed_results:
            return {"error": f"Processing failed: {str(last_error)}"}, []
        # Create summary
        best = max(detailed_results, key=lambda r: r["Weighted Average"])
        worst = min(detailed_results, key=lambda r: r["Weighted Average"])
        summary = {
            "Total Prompts Evaluated": len(detailed_results),
            "Best Performing Prompt": best["Prompt"],
            "Highest Weighted Score": best["Weighted Average"],
            "Lowest Weighted Score": worst["Weighted Average"]
        }
        progress(1.0, "Processing complete!")
        return summary, detailed_results
    except Exception as e:
        logger.error(f"Error processing text: {e}")
        return {"error": f"Processing failed: {str(e)}"}, []
def process_uploaded_file(file_path: str, gemini_api_key: str,
                         confident_api_key: str, progress=gr.Progress()) -> Tuple[Dict, List[Dict]]:
    """
    Process an uploaded CSV/Excel file containing texts to evaluate.

    The text column is the first one whose name contains "text", "content"
    or "article" (case-insensitive); if none matches, the first column is
    used. Each non-null cell is evaluated with ``process_single_text``.
    Texts that fail are logged and left out of the results.

    Args:
        file_path (str): Path to uploaded file
        gemini_api_key (str): Gemini API key
        confident_api_key (str): Confident API key for DeepEval
        progress: Gradio progress tracker
    Returns:
        Tuple[Dict, List[Dict]]: Summary results and detailed results.
        On failure the summary is ``{"error": <message>}`` and the list is empty.
    """
    try:
        # Read file based on extension
        file_ext = Path(file_path).suffix.lower()
        if file_ext == '.csv':
            df = pd.read_csv(file_path)
        elif file_ext in ('.xls', '.xlsx'):
            df = pd.read_excel(file_path)
        else:
            return {"error": "Unsupported file format. Please upload CSV or Excel file."}, []
        if df.empty:
            return {"error": "File is empty"}, []
        # Look for text column (case-insensitive). Column labels are passed
        # through str() because they are not guaranteed to be strings.
        keywords = ('text', 'content', 'article')
        text_column = next(
            (col for col in df.columns
             if any(word in str(col).lower() for word in keywords)),
            None,
        )
        if text_column is None:
            # Use first column if no text-like column found
            text_column = df.columns[0]
        texts = df[text_column].dropna().astype(str).tolist()
        if not texts:
            return {"error": "No valid text data found in the file"}, []
        all_results = []
        # Process each text
        for i, text in enumerate(texts):
            progress(i / len(texts), f"Processing text {i+1} of {len(texts)}...")
            summary, details = process_single_text(text, gemini_api_key, confident_api_key)
            if "error" in summary:
                logger.warning(f"Skipping text {i+1} of {len(texts)}: {summary['error']}")
                continue
            all_results.extend(details)
        if not all_results:
            return {"error": "Failed to process any texts"}, []
        # Every text is run through every prompt, so counting occurrences
        # cannot separate prompts. Rank them by mean weighted score instead.
        scores_by_prompt = {}
        for result in all_results:
            scores_by_prompt.setdefault(result["Prompt"], []).append(result["Weighted Average"])
        best_prompt = max(scores_by_prompt,
                          key=lambda name: np.mean(scores_by_prompt[name]))
        # Create overall summary
        overall_summary = {
            "Total Files Processed": len(texts),
            "Total Prompts Evaluated": len(all_results),
            "Average Weighted Score": np.mean([r["Weighted Average"] for r in all_results]),
            "Best Performing Prompt": best_prompt
        }
        progress(1.0, "Batch processing complete!")
        return overall_summary, all_results
    except Exception as e:
        logger.error(f"Error processing file: {e}")
        return {"error": f"File processing failed: {str(e)}"}, []
def create_gradio_interface():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` app with two tabs: single-text evaluation and
    batch file evaluation. Each tab has a summary JSON panel, a results
    table, and a button that reveals the full per-prompt results.

    Returns:
        The ``gr.Blocks`` instance (not yet launched).
    """

    def results_to_rows(detailed):
        """Reduce detailed result dicts to the three columns shown in the tables."""
        return [
            [item["Prompt"], round(item["Weighted Average"], 3), item["Interpretation"]]
            for item in detailed
        ]

    def show_full_results(detailed_results):
        """Fill a full-results panel and make it visible.

        The panels are created hidden, so the update has to switch
        ``visible`` on as well as set the value.
        """
        if detailed_results is None:
            return gr.update(value={"error": "No results to display"}, visible=True)
        return gr.update(value=detailed_results, visible=True)

    with gr.Blocks(title="LLM Evaluation Framework") as demo:
        gr.Markdown("# 📊 LLM Evaluation Framework for Content Rewriting")
        gr.Markdown("Evaluate and compare different prompts for professional content rewriting tasks.")
        with gr.Tabs():
            with gr.Tab("Single Text Evaluation"):
                gr.Markdown("### Evaluate a single piece of text")
                with gr.Row():
                    with gr.Column(scale=2):
                        input_text = gr.Textbox(
                            label="Input Text",
                            placeholder="Paste your text here...",
                            lines=10
                        )
                    with gr.Column(scale=1):
                        gemini_api_key = gr.Textbox(
                            label="Gemini API Key",
                            placeholder="Enter your Gemini API key",
                            type="password"
                        )
                        confident_api_key = gr.Textbox(
                            label="Confident API Key (for DeepEval)",
                            placeholder="Enter your Confident API key",
                            type="password"
                        )
                        evaluate_btn = gr.Button("Evaluate Text", variant="primary")
                gr.Markdown("### Results")
                with gr.Row():
                    with gr.Column():
                        summary_output = gr.JSON(label="Summary Results")
                    with gr.Column():
                        detailed_output = gr.Dataframe(
                            label="Detailed Results",
                            headers=["Prompt", "Weighted Average", "Interpretation"],
                            datatype=["str", "number", "str"]
                        )
                # Hidden outputs for detailed data
                hidden_detailed_results = gr.State()

                # The gr.Progress() default lets Gradio inject a tracker,
                # which is then forwarded to the processing function.
                def update_outputs(text, gemini_key, confident_key, progress=gr.Progress()):
                    if not text or not text.strip():
                        return {"error": "Please enter text"}, None, None
                    summary, detailed = process_single_text(text, gemini_key, confident_key, progress)
                    if "error" in summary:
                        return summary, None, None
                    return summary, results_to_rows(detailed), detailed

                evaluate_btn.click(
                    fn=update_outputs,
                    inputs=[input_text, gemini_api_key, confident_api_key],
                    outputs=[summary_output, detailed_output, hidden_detailed_results]
                )
                # Button to show full candidate texts
                with gr.Row():
                    show_details_btn = gr.Button("Show Full Results with Candidate Texts")
                full_results_output = gr.JSON(label="Full Detailed Results", visible=False)
                show_details_btn.click(
                    fn=show_full_results,
                    inputs=[hidden_detailed_results],
                    outputs=[full_results_output]
                )
            with gr.Tab("Batch File Evaluation"):
                gr.Markdown("### Evaluate multiple texts from a file")
                with gr.Row():
                    with gr.Column():
                        file_input = gr.File(
                            label="Upload CSV or Excel file",
                            file_types=['.csv', '.xls', '.xlsx']
                        )
                    with gr.Column():
                        batch_gemini_key = gr.Textbox(
                            label="Gemini API Key",
                            placeholder="Enter your Gemini API key",
                            type="password"
                        )
                        batch_confident_key = gr.Textbox(
                            label="Confident API Key (for DeepEval)",
                            placeholder="Enter your Confident API key",
                            type="password"
                        )
                        batch_evaluate_btn = gr.Button("Process File", variant="primary")
                gr.Markdown("### Batch Results")
                batch_summary_output = gr.JSON(label="Batch Summary Results")
                batch_detailed_output = gr.Dataframe(
                    label="Detailed Results",
                    headers=["Prompt", "Weighted Average", "Interpretation"],
                    datatype=["str", "number", "str"]
                )
                # Hidden state for batch results
                hidden_batch_results = gr.State()

                def process_file(file, gemini_key, confident_key, progress=gr.Progress()):
                    if file is None:
                        return {"error": "Please upload a file"}, None, None
                    # The upload may arrive as a path string or as a
                    # file-like object carrying the path in ``.name``.
                    file_path = file if isinstance(file, str) else file.name
                    summary, detailed = process_uploaded_file(file_path, gemini_key, confident_key, progress)
                    if "error" in summary:
                        return summary, None, None
                    return summary, results_to_rows(detailed), detailed

                batch_evaluate_btn.click(
                    fn=process_file,
                    inputs=[file_input, batch_gemini_key, batch_confident_key],
                    outputs=[batch_summary_output, batch_detailed_output, hidden_batch_results]
                )
                # Button to show full batch results
                show_batch_details_btn = gr.Button("Show Full Batch Results")
                batch_full_results_output = gr.JSON(label="Full Batch Results", visible=False)
                show_batch_details_btn.click(
                    fn=show_full_results,
                    inputs=[hidden_batch_results],
                    outputs=[batch_full_results_output]
                )
        gr.Markdown("""
        ## How to Use
        1. **Single Text Evaluation**:
           - Enter your text in the input box
           - Provide your API keys
           - Click "Evaluate Text" to see results
        2. **Batch File Evaluation**:
           - Upload a CSV or Excel file with a column containing text
           - Provide your API keys
           - Click "Process File" to evaluate all texts
        ### API Keys
        - **Gemini API Key**: Get from Google AI Studio
        - **Confident API Key**: Get from DeepEval dashboard
        ### Interpreting Results
        - **Weighted Average**: Our primary metric combining all evaluations
        - **Interpretation**: Performance grade based on weighted score
        """)
    return demo
# Entry point when run as a script: build the UI, then serve it with
# Gradio's debug mode switched on.
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch(debug=True)