Sa-m committed on
Commit
f9128f8
·
verified ·
1 Parent(s): 3beb0c2

Update app.py

Files changed (1)
  1. app.py +576 -340
app.py CHANGED
@@ -3,33 +3,71 @@ import pandas as pd
3
  import numpy as np
4
  import re
5
  import unicodedata
6
- from typing import Dict, List, Tuple
7
  import ftfy
8
  import nltk
9
- from bert_score import score as bert_score
10
- from rouge_score import rouge_scorer
 
 
11
  from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
 
 
12
  from nltk.translate.meteor_score import meteor_score
13
- from deepeval.test_case import LLMTestCase
14
- from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, GEval
15
- from deepeval.models import DeepEvalBaseLLM
16
  import google.generativeai as genai
17
  from groq import Groq
18
- import os
19
- from io import StringIO
20
 
21
- # Download required NLTK data
22
  nltk.download('punkt', quiet=True)
23
  nltk.download('wordnet', quiet=True)
24
 
 
 
25
26
 
27
- # Initialize APIs
28
- genai.configure(api_key=GEMINI_API_KEY)
29
- groq_client = Groq(api_key=GROQ_API_KEY)
30
31
  class LLMProvider:
32
- """Abstract base class for LLM providers"""
33
  def __init__(self, model_name: str):
34
  self.model_name = model_name
35
 
@@ -40,404 +78,602 @@ class LLMProvider:
40
  return self.model_name
41
 
42
  class GeminiProvider(LLMProvider):
43
- """Gemini implementation"""
44
- def __init__(self, model_name: str = "gemini-1.5-flash"):
45
  super().__init__(model_name)
46
- self.model = genai.GenerativeModel(model_name)
47
 
48
  def generate(self, prompt: str) -> str:
 
 
 
49
  try:
50
  response = self.model.generate_content(prompt)
51
- return response.text.strip()
52
  except Exception as e:
53
  return f"Error generating with Gemini: {str(e)}"
54
 
55
  class GroqProvider(LLMProvider):
56
- """Groq implementation for LLaMA models"""
57
  def __init__(self, model_name: str = "llama3-70b-8192"):
58
  super().__init__(model_name)
 
59
 
60
  def generate(self, prompt: str) -> str:
 
 
 
61
  try:
62
  chat_completion = groq_client.chat.completions.create(
63
  messages=[
64
  {"role": "user", "content": prompt}
65
  ],
66
  model=self.model_name,
67
- temperature=0.7,
68
- max_tokens=2048
69
  )
70
- return chat_completion.choices[0].message.content.strip()
71
  except Exception as e:
72
  return f"Error generating with Groq: {str(e)}"
73
 
74
- class DeepEvalLLMWrapper(DeepEvalBaseLLM):
75
- """Wrapper for DeepEval to work with our providers"""
76
- def __init__(self, provider: LLMProvider):
77
- self.provider = provider
78
 
79
- def load_model(self):
80
- return self.provider
81
 
82
- def generate(self, prompt: str) -> str:
83
- return self.provider.generate(prompt)
 
 
84
 
85
- def get_model_name(self) -> str:
86
- return self.provider.get_model_name()
 
87
 
88
- def clean_text(text: str) -> str:
89
- """Clean text by fixing encoding and normalizing"""
90
- if not text or not isinstance(text, str):
91
- return ""
92
-
93
- # Fix encoding artifacts
94
- text = ftfy.fix_text(text)
95
- text = unicodedata.normalize('NFKD', text)
96
 
97
- # Fix quotes and other common issues
98
- text = text.replace('â€œ', '"').replace('â€', '"')
99
- text = text.replace('â€“', '-').replace('â€”', '-')
100
- text = text.replace('â€˜', "'").replace('â€™', "'")
101
 
102
- # Remove non-ASCII characters
103
- text = re.sub(r'[^\x00-\x7F]+', ' ', text)
 
 
 
 
104
 
105
- # Normalize whitespace
106
- text = ' '.join(text.split())
 
107
 
108
- return text.strip()
109
-
110
- def evaluate_metrics(input_text: str, candidate_text: str, reference_text: str) -> Dict:
111
- """Run comprehensive evaluation on the generated text"""
112
 
113
- # Clean the texts
114
- cleaned_input = clean_text(input_text)
115
- cleaned_candidate = clean_text(candidate_text)
116
- cleaned_reference = clean_text(reference_text)
117
 
 
118
  results = {}
119
 
120
- # Traditional metrics
121
  try:
122
- # BLEU Score
123
  smooth = SmoothingFunction().method4
124
- bleu_score = sentence_bleu(
125
- [cleaned_reference.split()],
126
  cleaned_candidate.split(),
127
  smoothing_function=smooth
128
  )
129
- results["BLEU"] = bleu_score
130
-
131
- # ROUGE Score
132
- rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
133
- rouge_scores = rouge_scorer_obj.score(cleaned_reference, cleaned_candidate)
134
- rouge_avg = (rouge_scores['rouge1'].fmeasure +
135
- rouge_scores['rouge2'].fmeasure +
136
- rouge_scores['rougeL'].fmeasure) / 3
137
- results["ROUGE"] = rouge_avg
138
-
139
- # METEOR Score
140
- meteor = meteor_score([cleaned_reference.split()], cleaned_candidate.split())
141
- results["METEOR"] = meteor
142
-
143
- # BERT Score
144
- P, R, F1 = bert_score([cleaned_candidate], [cleaned_reference], lang="en", verbose=False)
145
- results["BERTScore"] = F1.item()
146
-
147
  except Exception as e:
148
- results["Error"] = f"Traditional metrics error: {str(e)}"
 
149
 
150
- # LLM-as-judge metrics (using Gemini for consistency)
151
  try:
152
- judge_provider = GeminiProvider("gemini-1.5-flash")
153
- judge_wrapper = DeepEvalLLMWrapper(judge_provider)
154
-
155
- test_case = LLMTestCase(
156
- input=cleaned_input,
157
- actual_output=cleaned_candidate,
158
- expected_output=cleaned_reference
159
  )
160
-
161
- # Answer Relevancy
162
- answer_rel = AnswerRelevancyMetric(model=judge_wrapper)
163
- answer_rel.measure(test_case)
164
- results["AnswerRelevancy"] = answer_rel.score
165
-
166
- # Faithfulness
167
- faith = FaithfulnessMetric(model=judge_wrapper)
168
- faith.measure(test_case)
169
- results["Faithfulness"] = faith.score
170
-
171
- # GEval
172
- geval = GEval(
173
- name="OverallQuality",
174
- criteria="Evaluate if the candidate response is accurate, complete, and well-written.",
175
- evaluation_params=[
176
- "input", "actual_output", "expected_output"
177
- ],
178
- model=judge_wrapper
179
  )
180
- geval.measure(test_case)
181
- results["GEval"] = geval.score
182
-
183
  except Exception as e:
184
- results["LLM_Judge_Error"] = f"LLM-as-judge metrics error: {str(e)}"
185
-
186
- # Normalization and Hybrid Score
187
- normalization_ranges = {
188
- "AnswerRelevancy": (0.0, 1.0),
189
- "Faithfulness": (0.0, 1.0),
190
- "GEval": (0.0, 1.0),
191
- "BERTScore": (0.7, 0.95),
192
- "ROUGE": (0.0, 0.6),
193
- "BLEU": (0.0, 0.4),
194
- "METEOR": (0.0, 0.6)
195
- }
196
 
197
- weights = {
198
- "AnswerRelevancy": 0.10,
199
- "Faithfulness": 0.10,
200
- "GEval": 0.025,
201
- "BERTScore": 0.20,
202
- "ROUGE": 0.15,
203
- "BLEU": 0.025,
204
- "METEOR": 0.15
205
- }
206
 
207
- # Normalize scores
208
- normalized_scores = {}
209
- for metric, value in results.items():
210
- if metric in normalization_ranges and isinstance(value, (int, float)):
211
- min_v, max_v = normalization_ranges[metric]
212
- if max_v > min_v: # Avoid division by zero
213
- norm = max(min((value - min_v) / (max_v - min_v), 1.0), 0.0)
214
- normalized_scores[metric] = norm
215
- else:
216
- normalized_scores[metric] = 0.5
217
- elif isinstance(value, (int, float)):
218
- normalized_scores[metric] = value
219
-
220
- # Calculate weighted average
221
- if normalized_scores:
222
- weighted_sum = sum(normalized_scores.get(m, 0) * w for m, w in weights.items())
223
- total_weight = sum(w for m, w in weights.items() if m in normalized_scores)
224
- results["WeightedAverage"] = weighted_sum / total_weight if total_weight > 0 else 0.0
225
  else:
226
- results["WeightedAverage"] = 0.0
227
 
228
- return results
229
 
230
- def process_single_text(input_text: str, model_choice: str) -> Tuple[str, str, Dict]:
231
- """Process a single text input"""
232
- if not input_text or len(input_text.strip()) < 10:
233
- return "", "", {"Error": "Input text too short"}
234
 
235
- # Choose model
236
- if model_choice == "Gemini":
237
- provider = GeminiProvider("gemini-1.5-flash")
238
- elif model_choice == "LLaMA-3-70b":
239
- provider = GroqProvider("llama3-70b-8192")
240
- else: # LLaMA-3-8b
241
- provider = GroqProvider("llama3-8b-8192")
242
-
243
- # Generate candidate
244
- prompt = f"""Rewrite the following paragraph in a fresh, concise, and professional style while preserving its full meaning and key information:
245
-
246
- {input_text}
247
-
248
- Provide only the rewritten text without any additional commentary."""
249
 
250
- candidate = provider.generate(prompt)
251
 
252
- # Use cleaned input as reference (simulating human-quality standard)
253
- reference = clean_text(input_text)
 
254
 
255
- # Evaluate
256
- scores = evaluate_metrics(input_text, candidate, reference)
257
 
258
- return candidate, reference, scores
259
-
260
- def process_file(file_obj, model_choice: str) -> Tuple[pd.DataFrame, str]:
261
- """Process a CSV file with multiple articles"""
262
- try:
263
- # Read the file
264
- content = file_obj.read().decode('utf-8')
265
- df = pd.read_csv(StringIO(content))
266
-
267
- # Assume first column is the text
268
- text_column = df.columns[0]
269
-
270
- results = []
271
-
272
- for idx, row in df.iterrows():
273
- text = str(row[text_column])
274
- candidate, reference, scores = process_single_text(text, model_choice)
275
 
276
- result_row = {
277
- 'Original_Text': text,
278
- 'Generated_Candidate': candidate,
279
- 'Reference_Text': reference
280
- }
281
- result_row.update(scores)
282
- results.append(result_row)
283
-
284
- results_df = pd.DataFrame(results)
285
- return results_df, "File processed successfully!"
286
-
287
- except Exception as e:
288
- return pd.DataFrame(), f"Error processing file: {str(e)}"
289
-
290
- def create_gradio_interface():
291
- """Create the Gradio interface"""
292
 
293
- with gr.Blocks(title="LLM Evaluation Framework") as demo:
294
- gr.Markdown("# πŸ“Š LLM Evaluation Framework for Professional Content Rewriting")
295
- gr.Markdown("Evaluate and compare LLM-generated content using multiple metrics. Choose between Gemini and LLaMA models.")
296
-
297
- with gr.Tabs():
298
- with gr.Tab("Single Text Processing"):
299
- with gr.Row():
300
- with gr.Column(scale=2):
301
- input_text = gr.Textbox(
302
- label="Input Text",
303
- placeholder="Enter the text you want to rewrite...",
304
- lines=10
305
- )
306
-
307
- model_choice_single = gr.Radio(
308
- ["Gemini", "LLaMA-3-70b", "LLaMA-3-8b"],
309
- label="Choose Model",
310
- value="Gemini"
311
- )
312
-
313
- submit_btn = gr.Button("Generate & Evaluate", variant="primary")
314
 
315
- with gr.Column(scale=3):
316
- gr.Markdown("### Results")
317
-
318
- with gr.Tabs():
319
- with gr.Tab("Generated Text"):
320
- candidate_output = gr.Textbox(
321
- label="Generated Candidate",
322
- lines=10,
323
- show_copy_button=True
324
- )
325
- reference_output = gr.Textbox(
326
- label="Reference Text (Cleaned Input)",
327
- lines=5,
328
- show_copy_button=True
329
- )
330
-
331
- with gr.Tab("Evaluation Scores"):
332
- scores_output = gr.JSON(label="Detailed Scores")
333
-
334
- weighted_avg = gr.Number(
335
- label="Weighted Average Score (0-1)",
336
- precision=4
337
- )
338
-
339
- interpretation = gr.Textbox(
340
- label="Interpretation",
341
- interactive=False
342
- )
343
-
344
- with gr.Tab("Batch Processing (CSV File)"):
345
- with gr.Row():
346
- with gr.Column(scale=1):
347
- file_input = gr.File(
348
- label="Upload CSV File",
349
- file_types=['.csv']
350
- )
351
-
352
- model_choice_file = gr.Radio(
353
- ["Gemini", "LLaMA-3-70b", "LLaMA-3-8b"],
354
- label="Choose Model for Batch Processing",
355
- value="Gemini"
356
- )
357
-
358
- process_file_btn = gr.Button("Process File", variant="primary")
359
 
360
- with gr.Column(scale=2):
361
- gr.Markdown("### Results")
362
- file_results = gr.Dataframe(
363
- label="Evaluation Results",
364
- interactive=False
365
- )
366
- file_status = gr.Textbox(label="Status")
367
-
368
- # Examples
369
- gr.Examples(
370
- examples=[
371
- ["The immune system plays a crucial role in protecting the human body from pathogens such as bacteria, viruses, and other harmful invaders. It is composed of innate and adaptive components that work together to detect and eliminate foreign threats.", "Gemini"],
372
- ["Climate change is one of the most pressing challenges facing humanity today. Rising global temperatures have led to severe weather patterns, including more intense storms, droughts, and heatwaves.", "LLaMA-3-70b"]
373
- ],
374
- inputs=[input_text, model_choice_single],
375
- outputs=[candidate_output, reference_output, scores_output, weighted_avg, interpretation]
376
- )
377
-
378
- # Event handlers
379
- def handle_single_process(text, model):
380
- if not text:
381
- return "", "", {}, 0, "Please enter some text."
382
 
383
- candidate, reference, scores = process_single_text(text, model)
 
384
 
385
- # Get weighted average
386
- weighted_avg_val = scores.get("WeightedAverage", 0)
387
 
388
- # Interpretation
389
- if weighted_avg_val >= 0.85:
390
- interpretation_text = "βœ… Outstanding performance (A) - ready for professional use"
391
- elif weighted_avg_val >= 0.70:
392
- interpretation_text = "βœ… Strong performance (B) - good quality with minor improvements"
393
- elif weighted_avg_val >= 0.50:
394
- interpretation_text = "⚠️ Adequate performance (C) - usable but needs refinement"
395
- elif weighted_avg_val >= 0.30:
396
- interpretation_text = "❌ Weak performance (D) - requires significant revision"
397
  else:
398
- interpretation_text = "❌ Poor performance (F) - likely needs complete rewriting"
399
 
400
- return candidate, reference, scores, weighted_avg_val, interpretation_text
401
-
402
- def handle_file_process(file, model):
403
- if file is None:
404
- return pd.DataFrame(), "Please upload a file."
405
- return process_file(file, model)
406
-
407
- submit_btn.click(
408
- fn=handle_single_process,
409
- inputs=[input_text, model_choice_single],
410
- outputs=[candidate_output, reference_output, scores_output, weighted_avg, interpretation]
411
- )
412
-
413
- process_file_btn.click(
414
- fn=handle_file_process,
415
- inputs=[file_input, model_choice_file],
416
- outputs=[file_results, file_status]
417
- )
418
419
  gr.Markdown("""
420
- ## 📝 How to Use
421
 
422
- 1. **Single Text Processing**: Enter your text and choose a model to generate a professional rewrite.
423
- 2. **Batch Processing**: Upload a CSV file with one article per row in the first column.
424
- 3. **Model Options**:
425
- - **Gemini**: Google's advanced language model
426
- - **LLaMA-3-70b**: Large Meta model (70B parameters)
427
- - **LLaMA-3-8b**: Smaller Meta model (8B parameters)
428
 
429
- ## 📊 Evaluation Metrics
 
 
 
 
430
 
431
- The system evaluates performance using multiple metrics:
432
- - **Traditional**: BLEU, ROUGE, METEOR (n-gram overlap)
433
- - **Semantic**: BERTScore (embedding similarity)
434
- - **LLM-as-Judge**: AnswerRelevancy, Faithfulness, GEval
435
- - **Final Score**: Weighted average of all metrics (0-1 scale)
436
  """)
437
-
438
- return demo
439
 
440
  # Launch the app
441
  if __name__ == "__main__":
442
- app = create_gradio_interface()
443
- app.launch(share=True)
 
 
 
 
3
  import numpy as np
4
  import re
5
  import unicodedata
 
6
  import ftfy
7
  import nltk
8
+ import os
9
+ import json
10
+ import time
11
+ from typing import Dict, Any, List, Tuple, Optional
12
  from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
13
+ from rouge_score import rouge_scorer
14
+ from bert_score import score as bert_score
15
  from nltk.translate.meteor_score import meteor_score
 
 
 
16
  import google.generativeai as genai
17
  from groq import Groq
18
+ from dotenv import load_dotenv
 
19
 
20
+ # Download necessary NLTK resources
21
  nltk.download('punkt', quiet=True)
22
  nltk.download('wordnet', quiet=True)
23
 
24
+ # Load environment variables
25
+ load_dotenv()
26
 
27
+ # Initialize API clients (with graceful fallback if keys missing)
28
+ try:
29
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
30
+ if GEMINI_API_KEY:
31
+ genai.configure(api_key=GEMINI_API_KEY)
32
+ else:
33
+ print("Warning: GEMINI_API_KEY not found in environment variables")
34
+ except Exception as e:
35
+ print(f"Error configuring Gemini: {str(e)}")
36
+ GEMINI_API_KEY = None
37
 
38
+ try:
39
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
40
+ if GROQ_API_KEY:
41
+ groq_client = Groq(api_key=GROQ_API_KEY)
42
+ else:
43
+ print("Warning: GROQ_API_KEY not found in environment variables")
44
+ groq_client = None
45
+ except Exception as e:
46
+ print(f"Error configuring Groq: {str(e)}")
47
+ groq_client = None
48
 
49
+ # Text cleaning function
50
+ def clean_text(text: str) -> str:
51
+ """Clean text by fixing encoding issues and standardizing format"""
52
+ if not isinstance(text, str) or not text.strip():
53
+ return ""
54
+
55
+ text = ftfy.fix_text(text) # Fixes encoding artifacts
56
+ text = unicodedata.normalize('NFKD', text)
57
+ # Replace common smart quotes and dashes
58
+ replacements = {
59
+ 'â€œ': '"', 'â€': '"', 'â€“': '-', 'â€”': '--',
60
+ 'â€¢': '*', 'â€¦': '...', 'Â': ''
61
+ }
62
+ for old, new in replacements.items():
63
+ text = text.replace(old, new)
64
+ # Remove non-ASCII characters
65
+ text = re.sub(r'[^\x00-\x7F]+', '', text)
66
+ # Normalize whitespace
67
+ return ' '.join(text.split())
68
+
69
+ # LLM Provider classes
70
  class LLMProvider:
 
71
  def __init__(self, model_name: str):
72
  self.model_name = model_name
73
 
 
78
  return self.model_name
79
 
80
  class GeminiProvider(LLMProvider):
81
+ def __init__(self, model_name: str = "gemini-1.5-flash-latest"):
 
82
  super().__init__(model_name)
83
+ self.available = bool(GEMINI_API_KEY)
84
+ if self.available:
85
+ try:
86
+ self.model = genai.GenerativeModel(model_name)
87
+ except Exception as e:
88
+ print(f"Error initializing Gemini model: {str(e)}")
89
+ self.available = False
90
 
91
  def generate(self, prompt: str) -> str:
92
+ if not self.available:
93
+ return "Error: Gemini API not configured properly. Check your API key."
94
+
95
  try:
96
  response = self.model.generate_content(prompt)
97
+ return response.text
98
  except Exception as e:
99
  return f"Error generating with Gemini: {str(e)}"
100
 
101
  class GroqProvider(LLMProvider):
 
102
  def __init__(self, model_name: str = "llama3-70b-8192"):
103
  super().__init__(model_name)
104
+ self.available = bool(groq_client)
105
 
106
  def generate(self, prompt: str) -> str:
107
+ if not self.available:
108
+ return "Error: Groq API not configured properly. Check your API key."
109
+
110
  try:
111
  chat_completion = groq_client.chat.completions.create(
112
  messages=[
113
  {"role": "user", "content": prompt}
114
  ],
115
  model=self.model_name,
116
+ temperature=0.3
 
117
  )
118
+ return chat_completion.choices[0].message.content
119
  except Exception as e:
120
  return f"Error generating with Groq: {str(e)}"
121
 
122
+ # Prompt templates
123
+ PROMPT_TEMPLATES = {
124
+ "Strategic Narrative Architect": """Role: Strategic Narrative Architect
125
+ You are a professional content writer with expertise in creating engaging, well-structured narratives.
126
+ Your task is to rewrite the following text in a professional, engaging style while preserving all key facts and information:
127
+
128
+ {text}
129
+
130
+ Instructions:
131
+ 1. Maintain all factual information and key details
132
+ 2. Improve structure and flow for better readability
133
+ 3. Enhance engagement through appropriate storytelling techniques
134
+ 4. Use professional language appropriate for the content domain
135
+ 5. Ensure the output is concise yet comprehensive
136
+
137
+ Rewritten content:""",
138
 
139
+ "Precision Storyteller": """Role: Precision Storyteller
140
+ You are a professional editor focused on accuracy, clarity, and precision.
141
+ Your task is to rewrite the following text with maximum factual accuracy while improving clarity:
142
+
143
+ {text}
144
+
145
+ Instructions:
146
+ 1. Preserve all factual information with absolute precision
147
+ 2. Correct any grammatical errors or awkward phrasing
148
+ 3. Ensure logical flow and coherence
149
+ 4. Use clear, concise language without unnecessary embellishment
150
+ 5. Maintain professional tone appropriate for the content domain
151
+
152
+ Rewritten content:"""
153
+ }
154
+
155
+ # Metric normalization ranges
156
+ NORMALIZATION_RANGES = {
157
+ "AnswerRelevancy": (0.0, 1.0),
158
+ "Faithfulness": (0.0, 1.0),
159
+ "GEval": (0.0, 1.0),
160
+ "BERTScore": (0.7, 0.95),
161
+ "ROUGE": (0.0, 0.6),
162
+ "BLEU": (0.0, 0.4),
163
+ "METEOR": (0.0, 0.6)
164
+ }
165
+
166
+ # Metric weights
167
+ METRIC_WEIGHTS = {
168
+ "AnswerRelevancy": 0.10,
169
+ "Faithfulness": 0.10,
170
+ "GEval": 0.025,
171
+ "BERTScore": 0.20,
172
+ "ROUGE": 0.15,
173
+ "BLEU": 0.025,
174
+ "METEOR": 0.15
175
+ }
176
+
177
+ def normalize_score(metric: str, value: float) -> float:
178
+ """Normalize score to 0-1 scale based on metric's natural range"""
179
+ if metric not in NORMALIZATION_RANGES or not isinstance(value, (int, float)):
180
+ return value
181
 
182
+ min_val, max_val = NORMALIZATION_RANGES[metric]
183
+ # Handle edge cases
184
+ if max_val <= min_val:
185
+ return 0.5 # Default middle value if range is invalid
186
 
187
+ # Normalize and clamp to [0,1]
188
+ normalized = (value - min_val) / (max_val - min_val)
189
+ return max(0.0, min(normalized, 1.0))
190
 
191
+ def calculate_weighted_score(scores: Dict[str, float]) -> float:
192
+ """Calculate weighted average of normalized scores"""
193
+ normalized_scores = {m: normalize_score(m, v) for m, v in scores.items()}
194
+ total_weight = 0
195
+ weighted_sum = 0
 
 
 
196
 
197
+ for metric, weight in METRIC_WEIGHTS.items():
198
+ if metric in normalized_scores:
199
+ weighted_sum += normalized_scores[metric] * weight
200
+ total_weight += weight
201
 
202
+ return weighted_sum / total_weight if total_weight > 0 else 0
203
+
204
+ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template: str) -> Dict[str, Any]:
205
+ """Evaluate a single text using the selected model and prompt"""
206
+ # Create clean reference text
207
+ reference_text = clean_text(raw_input)
208
 
209
+ # Generate candidate using the selected model and prompt
210
+ prompt = prompt_template.replace("{text}", raw_input)
211
+ candidate = model_provider.generate(prompt)
212
 
213
+ # Clean candidate output for consistent evaluation
214
+ cleaned_candidate = clean_text(candidate)
 
 
215
 
216
+ # Initialize evaluation metrics
217
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
 
 
218
 
219
+ # Calculate traditional metrics
220
  results = {}
221
 
222
+ # BLEU Score
223
  try:
 
224
  smooth = SmoothingFunction().method4
225
+ bleu = sentence_bleu(
226
+ [reference_text.split()],
227
  cleaned_candidate.split(),
228
  smoothing_function=smooth
229
  )
230
+ results["BLEU"] = bleu
231
  except Exception as e:
232
+ print(f"BLEU error: {str(e)}")
233
+ results["BLEU"] = 0.0
234
 
235
+ # ROUGE Score
236
  try:
237
+ rouge_scores = scorer.score(reference_text, cleaned_candidate)
238
+ rouge = (rouge_scores['rouge1'].fmeasure +
239
+ rouge_scores['rouge2'].fmeasure +
240
+ rouge_scores['rougeL'].fmeasure) / 3
241
+ results["ROUGE"] = rouge
242
+ except Exception as e:
243
+ print(f"ROUGE error: {str(e)}")
244
+ results["ROUGE"] = 0.0
245
+
246
+ # METEOR Score
247
+ try:
248
+ meteor = meteor_score(
249
+ [reference_text.split()],
250
+ cleaned_candidate.split()
251
  )
252
+ results["METEOR"] = meteor
253
+ except Exception as e:
254
+ print(f"METEOR error: {str(e)}")
255
+ results["METEOR"] = 0.0
256
+
257
+ # BERTScore
258
+ try:
259
+ P, R, F1 = bert_score(
260
+ [cleaned_candidate],
261
+ [reference_text],
262
+ lang="en",
263
+ verbose=False
264
  )
265
+ results["BERTScore"] = F1.item()
 
 
266
  except Exception as e:
267
+ print(f"BERTScore error: {str(e)}")
268
+ results["BERTScore"] = 0.7 # Default low value
269
 
270
+ # LLM-as-judge metrics - simplified implementation since DeepEval might not be available
271
+ try:
272
+ # Use Gemini as judge if available
273
+ if GEMINI_API_KEY:
274
+ judge_model = GeminiProvider("gemini-1.5-flash-latest")
275
+
276
+ # Answer Relevancy
277
+ relevancy_prompt = f"""
278
+ On a scale of 0.0 to 1.0, how relevant is the following candidate text to the input?
279
+
280
+ Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
281
+ Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
282
+
283
+ Provide only a single number between 0.0 and 1.0 with no explanation.
284
+ """
285
+ relevancy_response = judge_model.generate(relevancy_prompt)
286
+ try:
287
+ relevancy_score = float(relevancy_response.strip())
288
+ results["AnswerRelevancy"] = max(0.0, min(1.0, relevancy_score))
289
+ except:
290
+ results["AnswerRelevancy"] = 0.5
291
+
292
+ # Faithfulness
293
+ faithfulness_prompt = f"""
294
+ On a scale of 0.0 to 1.0, how faithful is the candidate text to the original input in terms of factual accuracy?
295
+
296
+ Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
297
+ Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
298
+
299
+ Provide only a single number between 0.0 and 1.0 with no explanation.
300
+ """
301
+ faithfulness_response = judge_model.generate(faithfulness_prompt)
302
+ try:
303
+ faithfulness_score = float(faithfulness_response.strip())
304
+ results["Faithfulness"] = max(0.0, min(1.0, faithfulness_score))
305
+ except:
306
+ results["Faithfulness"] = 0.5
307
+
308
+ # GEval
309
+ geval_prompt = f"""
310
+ On a scale of 0.0 to 1.0, evaluate the overall quality of the candidate text.
311
+ Consider accuracy, completeness, fluency, and professionalism.
312
+
313
+ Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
314
+ Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
315
+
316
+ Provide only a single number between 0.0 and 1.0 with no explanation.
317
+ """
318
+ geval_response = judge_model.generate(geval_prompt)
319
+ try:
320
+ geval_score = float(geval_response.strip())
321
+ results["GEval"] = max(0.0, min(1.0, geval_score))
322
+ except:
323
+ results["GEval"] = 0.5
324
+ else:
325
+ # Default values if no judge model available
326
+ results["AnswerRelevancy"] = 0.5
327
+ results["Faithfulness"] = 0.5
328
+ results["GEval"] = 0.5
329
+ except Exception as e:
330
+ print(f"LLM-as-judge error: {str(e)}")
331
+ # Default values if DeepEval fails
332
+ results["AnswerRelevancy"] = 0.5
333
+ results["Faithfulness"] = 0.5
334
+ results["GEval"] = 0.5
335
 
336
+ # Calculate normalized and weighted scores
337
+ normalized_scores = {m: normalize_score(m, v) for m, v in results.items()}
338
+ weighted_score = calculate_weighted_score(results)
339
+
340
+ # Determine interpretation
341
+ if weighted_score >= 0.85:
342
+ interpretation = "Outstanding performance (A) - ready for professional use"
343
+ elif weighted_score >= 0.70:
344
+ interpretation = "Strong performance (B) - good quality with minor improvements"
345
+ elif weighted_score >= 0.50:
346
+ interpretation = "Adequate performance (C) - usable but needs refinement"
347
+ elif weighted_score >= 0.30:
348
+ interpretation = "Weak performance (D) - requires significant revision"
349
  else:
350
+ interpretation = "Poor performance (F) - likely needs complete rewriting"
351
 
352
+ return {
353
+ "candidate": cleaned_candidate,
354
+ "metrics": results,
355
+ "normalized": normalized_scores,
356
+ "weighted_score": weighted_score,
357
+ "interpretation": interpretation
358
+ }
359
 
360
+ def process_input(input_text: str, file_upload, model_choice: str, prompt_choice: str) -> Tuple[str, List[List[str]], str]:
361
+ """Process either input text or uploaded file"""
362
+ if input_text and file_upload:
363
+ return "Please use either text input or file upload, not both.", [], ""
364
 
365
+ if not input_text and not file_upload:
366
+ return "Please provide input text or upload a file.", [], ""
367
 
368
+ # Determine model provider
369
+ if model_choice == "Gemini":
370
+ model_provider = GeminiProvider("gemini-1.5-flash-latest")
371
+ elif model_choice == "Llama-3-70b":
372
+ model_provider = GroqProvider("llama3-70b-8192")
373
+ else: # Llama-3-8b
374
+ model_provider = GroqProvider("llama3-8b-8192")
375
 
376
+ # Check if model is available
377
+ if not model_provider.available:
378
+ return f"Error: {model_choice} is not properly configured. Check your API key.", [], ""
379
 
380
+ # Get prompt template
381
+ prompt_template = PROMPT_TEMPLATES[prompt_choice]
382
 
383
+ # Process single text input
384
+ if input_text:
385
+ with gr.Progress() as progress:
386
+ progress(0.1, desc="Starting evaluation...")
387
+ time.sleep(0.2)
388
 
389
+ progress(0.3, desc="Generating rewritten content...")
390
+ time.sleep(0.2)
391
+
392
+ progress(0.6, desc="Calculating metrics...")
393
+ result = evaluate_text(input_text, model_provider, prompt_template)
394
+
395
+ progress(0.9, desc="Finalizing results...")
396
+ time.sleep(0.2)
397
+
398
+ # Format metrics for display
399
+ metrics_table = [
400
+ ["Metric", "Raw Score", "Normalized"],
401
+ ["AnswerRelevancy", f"{result['metrics']['AnswerRelevancy']:.4f}", f"{result['normalized']['AnswerRelevancy']:.4f}"],
402
+ ["Faithfulness", f"{result['metrics']['Faithfulness']:.4f}", f"{result['normalized']['Faithfulness']:.4f}"],
403
+ ["GEval", f"{result['metrics']['GEval']:.4f}", f"{result['normalized']['GEval']:.4f}"],
404
+ ["BERTScore", f"{result['metrics']['BERTScore']:.4f}", f"{result['normalized']['BERTScore']:.4f}"],
405
+ ["ROUGE", f"{result['metrics']['ROUGE']:.4f}", f"{result['normalized']['ROUGE']:.4f}"],
406
+ ["BLEU", f"{result['metrics']['BLEU']:.4f}", f"{result['normalized']['BLEU']:.4f}"],
407
+ ["METEOR", f"{result['metrics']['METEOR']:.4f}", f"{result['normalized']['METEOR']:.4f}"],
408
+ ["Weighted Score", f"{result['weighted_score']:.4f}", "N/A"]
409
+ ]
410
+
411
+ return (
412
+ result["candidate"],
413
+ metrics_table,
414
+ f"Hybrid Score: {result['weighted_score']:.4f} - {result['interpretation']}"
415
+ )
416
 
417
+ # Process file upload
418
+ if file_upload:
419
+ with gr.Progress() as progress:
420
+ progress(0.1, desc="Reading file...")
421
+ time.sleep(0.2)
422
+
423
+ # Read the file (assuming CSV with one column of text)
424
+ try:
425
+ df = pd.read_csv(file_upload.name)
426
+ progress(0.3, desc="Processing entries...")
427
+ time.sleep(0.2)
428
+ except Exception as e:
429
+ return f"Error reading file: {str(e)}", [], ""
430
+
431
+ # Assuming the first column contains the text
432
+ text_column = df.columns[0]
433
+ results = []
434
+ detailed_results = []
435
+
436
+ # Process each entry with progress updates
437
+ for i, row in df.iterrows():
438
+ progress((i + 1) / len(df) * 0.6 + 0.3, desc=f"Processing entry {i+1}/{len(df)}")
439
+ text = str(row[text_column])
440
+
441
+ try:
442
+ result = evaluate_text(text, model_provider, prompt_template)
443
 
444
+ # Add to results
445
+ results.append(result["weighted_score"])
 
446
 
447
+ # Store detailed results
448
+ detailed_results.append({
449
+ "input_preview": text[:100] + "..." if len(text) > 100 else text,
450
+ "weighted_score": result["weighted_score"],
451
+ "interpretation": result["interpretation"],
452
+ "candidate": result["candidate"]
453
+ })
454
+ except Exception as e:
455
+ print(f"Error processing entry {i}: {str(e)}")
456
+ results.append(0.0)
457
+ detailed_results.append({
458
+ "input_preview": text[:100] + "..." if len(text) > 100 else text,
459
+ "weighted_score": 0.0,
460
+ "interpretation": "Error processing this entry",
461
+ "candidate": ""
462
+ })
463
 
464
+ progress(0.9, desc="Generating summary...")
465
+ time.sleep(0.2)
466
 
467
+ # Create results dataframe
468
+ results_df = pd.DataFrame(detailed_results)
469
 
470
+ # Generate summary statistics
471
+ valid_scores = [s for s in results if s > 0]
472
+ if valid_scores:
473
+ avg_score = sum(valid_scores) / len(valid_scores)
474
+ min_score = min(valid_scores)
475
+ max_score = max(valid_scores)
476
+
477
+ if avg_score >= 0.85:
478
+ summary = "Excellent performance across inputs"
479
+ elif avg_score >= 0.70:
480
+ summary = "Good performance with room for minor improvements"
481
+ elif avg_score >= 0.50:
482
+ summary = "Adequate performance but needs refinement"
483
+ else:
484
+ summary = "Significant improvements needed"
485
+
486
+ # Format summary
487
+ summary_text = (
488
+ f"Processed {len(results)} entries ({len(valid_scores)} successful)\n"
489
+ f"Average Hybrid Score: {avg_score:.4f}\n"
490
+ f"Range: {min_score:.4f} - {max_score:.4f}\n\n"
491
+ f"{summary}"
492
+ )
493
+
494
+ # Create metrics table for summary
495
+ metrics_table = [
496
+ ["Metric", "Value"],
497
+ ["Entries Processed", f"{len(results)}"],
498
+ ["Successful Entries", f"{len(valid_scores)}"],
499
+ ["Average Score", f"{avg_score:.4f}"],
500
+ ["Best Score", f"{max_score:.4f}"],
501
+ ["Worst Score", f"{min_score:.4f}"],
502
+ ["Overall Assessment", summary]
503
+ ]
504
+
505
+ return (
506
+ "Batch processing complete. Use the 'Show Details' button to see individual results.",
507
+ metrics_table,
508
+ summary_text
509
+ )
510
  else:
511
+ return (
512
+ "No successful evaluations. Check your API configuration and input data.",
513
+ [["Error", "All evaluations failed"]],
514
+ "Error: No successful evaluations. Check your API configuration and input data."
515
+ )
516
+
517
+ def show_detailed_results(input_text, file_upload, model_choice, prompt_choice):
518
+ """Show detailed results for batch processing"""
519
+ if not file_upload:
520
+ return "No file uploaded for batch processing."
521
+
522
+ # Read the file
523
+ df = pd.read_csv(file_upload.name)
524
+ text_column = df.columns[0]
525
+
526
+ # Determine model provider
527
+ if model_choice == "Gemini":
528
+ model_provider = GeminiProvider("gemini-1.5-flash-latest")
529
+ elif model_choice == "Llama-3-70b":
530
+ model_provider = GroqProvider("llama3-70b-8192")
531
+ else: # Llama-3-8b
532
+ model_provider = GroqProvider("llama3-8b-8192")
533
+
534
+ # Get prompt template
535
+ prompt_template = PROMPT_TEMPLATES[prompt_choice]
536
+
537
+ # Process each entry
538
+ results = []
539
+ for _, row in df.iterrows():
540
+ text = str(row[text_column])
541
+ try:
542
+ result = evaluate_text(text, model_provider, prompt_template)
543
+ results.append({
544
+ "Input Preview": text[:100] + "..." if len(text) > 100 else text,
545
+ "Weighted Score": f"{result['weighted_score']:.4f}",
546
+ "Interpretation": result['interpretation'],
547
+ "Candidate Text": result['candidate']
548
+ })
549
+ except:
550
+ results.append({
551
+ "Input Preview": text[:100] + "..." if len(text) > 100 else text,
552
+ "Weighted Score": "Error",
553
+ "Interpretation": "Processing error",
554
+ "Candidate Text": ""
555
+ })
556
+
557
+ return gr.Dataframe(value=pd.DataFrame(results))
558
+
559
+ # Create Gradio interface
560
+ with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo:
561
+ gr.Markdown("# πŸ“Š LLM Evaluation Framework for Professional Content Rewriting")
562
+ gr.Markdown("Evaluate the quality of LLM-generated content using multiple metrics with proper normalization.")
563
+
564
+ with gr.Row():
565
+ with gr.Column(scale=1):
566
+ gr.Markdown("### πŸ“₯ Input Options")
567
+ input_text = gr.Textbox(
568
+ label="Input Text",
569
+ lines=10,
570
+ placeholder="Enter text to evaluate...",
571
+ elem_id="input-text"
572
+ )
573
+ gr.Markdown("or")
574
+ file_upload = gr.File(
575
+ label="Upload CSV file (single column of text)",
576
+ file_types=[".csv", ".txt"],
577
+ elem_id="file-upload"
578
+ )
579
 
580
+ gr.Markdown("### βš™οΈ Configuration")
581
+ model_choice = gr.Radio(
582
+ ["Gemini", "Llama-3-70b", "Llama-3-8b"],
583
+ label="Select Model",
584
+ value="Gemini",
585
+ elem_id="model-choice"
586
+ )
587
+
588
+ prompt_choice = gr.Radio(
589
+ ["Strategic Narrative Architect", "Precision Storyteller"],
590
+ label="Select Prompt Template",
591
+ value="Strategic Narrative Architect",
592
+ elem_id="prompt-choice"
593
+ )
594
+
595
+ submit_btn = gr.Button("Evaluate", variant="primary", size="lg", elem_id="submit-btn")
 
 
596
 
597
+ with gr.Column(scale=2):
598
+ gr.Markdown("### ✍️ Rewritten Content")
599
+ candidate_output = gr.Textbox(
600
+ label="Rewritten Content",
601
+ lines=15,
602
+ elem_id="candidate-output"
603
+ )
604
+
605
+ gr.Markdown("### πŸ“ˆ Evaluation Metrics")
606
+ metrics_output = gr.Dataframe(
607
+ label="Evaluation Metrics",
608
+ interactive=False,
609
+ elem_id="metrics-output"
610
+ )
611
+
612
+ gr.Markdown("### πŸ“Œ Overall Assessment")
613
+ summary_output = gr.Textbox(
614
+ label="Summary",
615
+ elem_id="summary-output"
616
+ )
617
+
618
+ detailed_results_btn = gr.Button("Show Detailed Results (Batch)", visible=False)
619
+ detailed_results = gr.Dataframe(visible=False)
620
+
621
+ # Update visibility of detailed results button
622
+ def update_detailed_results_visibility(file_upload, summary):
623
+ has_file = file_upload is not None
624
+ has_batch_results = "Processed" in summary and "entries" in summary
625
+ return gr.update(visible=has_file and has_batch_results)
626
+
627
+ # Event handlers
628
+ submit_btn.click(
629
+ fn=process_input,
630
+ inputs=[input_text, file_upload, model_choice, prompt_choice],
631
+ outputs=[candidate_output, metrics_output, summary_output]
632
+ ).then(
633
+ fn=update_detailed_results_visibility,
634
+ inputs=[file_upload, summary_output],
635
+ outputs=detailed_results_btn
636
+ )
637
+
638
+ detailed_results_btn.click(
639
+ fn=show_detailed_results,
640
+ inputs=[input_text, file_upload, model_choice, prompt_choice],
641
+ outputs=detailed_results
642
+ ).then(
643
+ fn=lambda: gr.update(visible=True),
644
+ outputs=detailed_results
645
+ )
646
+
647
+ # Add interpretation guide in an accordion
648
+ with gr.Accordion("πŸ“š Interpretation Guide", open=False):
649
  gr.Markdown("""
650
+ ### Hybrid Score Interpretation
651
 
652
+ The Hybrid Score combines multiple evaluation metrics into a single score with proper normalization:
653
 
654
+ - **0.85+**: Outstanding performance (A) - ready for professional use
655
+ - **0.70-0.85**: Strong performance (B) - good quality with minor improvements
656
+ - **0.50-0.70**: Adequate performance (C) - usable but needs refinement
657
+ - **0.30-0.50**: Weak performance (D) - requires significant revision
658
+ - **<0.30**: Poor performance (F) - likely needs complete rewriting
659
 
660
+ ### Key Metrics Explained
661
+
662
+ | Metric | What It Measures | Why It Matters |
663
+ |--------|------------------|----------------|
664
+ | **AnswerRelevancy** | Is output on-topic with input? | Does the output stay on topic despite messy input? |
665
+ | **Faithfulness** | Are ALL facts preserved correctly? | Does it maintain accuracy when input has encoding errors? |
666
+ | **GEval** | Overall quality assessment by another AI | How professional does the output appear? |
667
+ | **BERTScore** | Semantic similarity to reference | How well does it capture the meaning of cleaned text? |
668
+ | **ROUGE** | Content overlap with reference | How much key information is preserved? |
669
+ | **BLEU** | Phrasing precision | How closely does wording match human-quality standard? |
670
+ | **METEOR** | Linguistic quality with synonyms | How natural does the cleaned output read? |
671
  """)
 
 
672
 
673
  # Launch the app
674
  if __name__ == "__main__":
675
+ demo.launch(
676
+ server_name="0.0.0.0",
677
+ server_port=7860,
678
+ share=True
679
+ )
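For reference, the hybrid score introduced in this commit reduces to: rescale each raw metric by its expected range, clamp to [0, 1], then take the weighted average over whichever metrics are present. Below is a minimal standalone sketch of that arithmetic; the ranges and weights mirror NORMALIZATION_RANGES and METRIC_WEIGHTS from the diff, while the function names (normalize, hybrid_score) and the sample metric values are illustrative assumptions, not part of the committed code.

from typing import Dict

# Ranges and weights mirrored from NORMALIZATION_RANGES / METRIC_WEIGHTS in the diff above.
RANGES = {
    "AnswerRelevancy": (0.0, 1.0), "Faithfulness": (0.0, 1.0), "GEval": (0.0, 1.0),
    "BERTScore": (0.7, 0.95), "ROUGE": (0.0, 0.6), "BLEU": (0.0, 0.4), "METEOR": (0.0, 0.6),
}
WEIGHTS = {
    "AnswerRelevancy": 0.10, "Faithfulness": 0.10, "GEval": 0.025,
    "BERTScore": 0.20, "ROUGE": 0.15, "BLEU": 0.025, "METEOR": 0.15,
}

def normalize(metric: str, value: float) -> float:
    # Rescale a raw score into [0, 1] using the metric's expected range, clamping outliers.
    lo, hi = RANGES[metric]
    return max(0.0, min((value - lo) / (hi - lo), 1.0))

def hybrid_score(raw: Dict[str, float]) -> float:
    # Weighted average over the metrics that are actually present.
    present = {m: normalize(m, v) for m, v in raw.items() if m in WEIGHTS}
    total = sum(WEIGHTS[m] for m in present)
    return sum(WEIGHTS[m] * s for m, s in present.items()) / total if total else 0.0

# Illustrative raw scores only: BERTScore 0.88 normalizes to (0.88 - 0.7) / 0.25 = 0.72.
sample = {"BLEU": 0.12, "ROUGE": 0.35, "METEOR": 0.30, "BERTScore": 0.88,
          "AnswerRelevancy": 0.80, "Faithfulness": 0.90, "GEval": 0.70}
print(f"Hybrid score: {hybrid_score(sample):.4f}")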