Update app.py
app.py
CHANGED
@@ -200,17 +200,11 @@ def calculate_weighted_score(scores: Dict[str, float]) -> float:

     return weighted_sum / total_weight if total_weight > 0 else 0

-def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template: str) -> Dict[str, Any]:
-    """Evaluate a
-    #
-    reference_text = clean_text(raw_input)
-
-    # Generate candidate using the selected model and prompt
-    prompt = prompt_template.replace("{text}", raw_input)
-    candidate = model_provider.generate(prompt)
-
-    # Clean candidate output for consistent evaluation
-    cleaned_candidate = clean_text(candidate)
+def evaluate_text(reference_text: str, candidate_text: str) -> Dict[str, Any]:
+    """Evaluate a candidate text against a reference text"""
+    # Clean both texts for consistent evaluation
+    reference_text = clean_text(reference_text)
+    candidate_text = clean_text(candidate_text)

     # Initialize evaluation metrics
     scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
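Note: both the removed and the added version of evaluate_text lean on a clean_text() helper defined earlier in app.py, outside this diff. As a rough illustration of the kind of normalization the n-gram metrics below benefit from, a minimal stand-in could look like the sketch that follows; this is an assumption about its behavior, not the Space's actual implementation.

import re

def clean_text_sketch(text: str) -> str:
    """Hypothetical stand-in for app.py's clean_text(): trim the text,
    collapse whitespace runs, and drop characters that mostly add noise
    to n-gram overlap metrics."""
    text = text.strip()
    text = re.sub(r"\s+", " ", text)              # collapse whitespace runs
    text = re.sub(r"[^\w\s.,;:!?()-]", "", text)  # keep only basic punctuation
    return text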
@@ -223,7 +217,7 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         smooth = SmoothingFunction().method4
         bleu = sentence_bleu(
             [reference_text.split()],
-            cleaned_candidate.split(),
+            candidate_text.split(),
             smoothing_function=smooth
         )
         results["BLEU"] = bleu
@@ -233,7 +227,7 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:

     # ROUGE Score
     try:
-        rouge_scores = scorer.score(reference_text, cleaned_candidate)
+        rouge_scores = scorer.score(reference_text, candidate_text)
         rouge = (rouge_scores['rouge1'].fmeasure +
                  rouge_scores['rouge2'].fmeasure +
                  rouge_scores['rougeL'].fmeasure) / 3
@@ -246,7 +240,7 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
     try:
         meteor = meteor_score(
             [reference_text.split()],
-            cleaned_candidate.split()
+            candidate_text.split()
         )
         results["METEOR"] = meteor
     except Exception as e:
@@ -256,7 +250,7 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
     # BERTScore
     try:
         P, R, F1 = bert_score(
-            [cleaned_candidate],
+            [candidate_text],
             [reference_text],
             lang="en",
             verbose=False
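Note: the four reference-based metrics used above expect their arguments in different orders, which is easy to get backwards. The standalone sketch below mirrors the calls in this function (it assumes the rouge-score, nltk, and bert-score packages that app.py already relies on; NLTK's METEOR additionally needs the WordNet data downloaded).

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from bert_score import score as bert_score

reference = "the quick brown fox jumps over the lazy dog"
candidate = "a quick brown fox jumped over the lazy dog"

# ROUGE: score(target, prediction)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge = scorer.score(reference, candidate)

# BLEU: list of tokenized references first, then the tokenized hypothesis
smooth = SmoothingFunction().method4
bleu = sentence_bleu([reference.split()], candidate.split(), smoothing_function=smooth)

# METEOR: list of tokenized references first, then the tokenized hypothesis
meteor = meteor_score([reference.split()], candidate.split())

# BERTScore: candidate list first, reference list second; returns (P, R, F1) tensors
P, R, F1 = bert_score([candidate], [reference], lang="en", verbose=False)

print(f"BLEU={bleu:.3f} METEOR={meteor:.3f} ROUGE-L={rouge['rougeL'].fmeasure:.3f} BERTScore-F1={F1.mean().item():.3f}")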
@@ -276,8 +270,8 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         relevancy_prompt = f"""
         On a scale of 0.0 to 1.0, how relevant is the following candidate text to the input?

-        Input: {
-        Candidate: {
+        Input: {reference_text[:500]}{'...' if len(reference_text) > 500 else ''}
+        Candidate: {candidate_text[:500]}{'...' if len(candidate_text) > 500 else ''}

         Provide only a single number between 0.0 and 1.0 with no explanation.
         """
@@ -292,8 +286,8 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         faithfulness_prompt = f"""
         On a scale of 0.0 to 1.0, how faithful is the candidate text to the original input in terms of factual accuracy?

-        Input: {
-        Candidate: {
+        Input: {reference_text[:500]}{'...' if len(reference_text) > 500 else ''}
+        Candidate: {candidate_text[:500]}{'...' if len(candidate_text) > 500 else ''}

         Provide only a single number between 0.0 and 1.0 with no explanation.
         """
@@ -309,8 +303,8 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         On a scale of 0.0 to 1.0, evaluate the overall quality of the candidate text.
         Consider accuracy, completeness, fluency, and professionalism.

-        Input: {
-        Candidate: {
+        Input: {reference_text[:500]}{'...' if len(reference_text) > 500 else ''}
+        Candidate: {candidate_text[:500]}{'...' if len(candidate_text) > 500 else ''}

         Provide only a single number between 0.0 and 1.0 with no explanation.
         """
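Note: all three judge prompts ask the model for a bare number between 0.0 and 1.0, but LLM replies do not always comply. app.py presumably parses the reply after model_provider.generate() somewhere outside these hunks; a defensive parser along the following lines is one way to do it. The helper name and behavior are illustrative assumptions, not code from the Space.

import re

def parse_judge_score(reply: str, default: float = 0.0) -> float:
    """Hypothetical helper: extract the first number from a judge reply
    and clamp it to [0.0, 1.0]; fall back to a default if nothing parses."""
    match = re.search(r"\d+(?:\.\d+)?", reply)
    if not match:
        return default
    try:
        value = float(match.group())
    except ValueError:
        return default
    return max(0.0, min(1.0, value))

# e.g. parse_judge_score("Score: 0.85") -> 0.85; parse_judge_score("I cannot rate this") -> 0.0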
@@ -349,20 +343,17 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         interpretation = "Poor performance (F) - likely needs complete rewriting"

     return {
-        "candidate": cleaned_candidate,
+        "candidate": candidate_text,
         "metrics": results,
         "normalized": normalized_scores,
         "weighted_score": weighted_score,
         "interpretation": interpretation
     }

-def process_input(input_text: str, file_upload, model_choice: str, prompt_choice: str, progress=gr.Progress()):
-    """Process
-    if
-        return "
-
-    if not input_text and not file_upload:
-        return "Please provide input text or upload a file.", [], ""
+def process_input(input_mode: str, reference_text: str, candidate_text: str, model_choice: str, prompt_choice: str, progress=gr.Progress()) -> Tuple[str, str, List[List[str]], str]:
+    """Process input based on selected mode"""
+    if not reference_text:
+        return "", "", [], "Please provide reference text."

     # Determine model provider
     if model_choice == "Gemini":
@@ -374,199 +365,63 @@ def process_input(input_text: str, file_upload, model_choice: str, prompt_choice

     # Check if model is available
     if not model_provider.available:
-        return f"Error: {model_choice} is not properly configured. Check your API key."
-
-    # Get prompt template
-    prompt_template = PROMPT_TEMPLATES[prompt_choice]
+        return "", "", [], f"Error: {model_choice} is not properly configured. Check your API key."

-    # Process
-    if
+    # Process based on input mode
+    if input_mode == "Reference Only (Generate Candidate)":
         progress(0.1, desc="Starting evaluation...")
-        time.sleep(0.
-
-        progress(0.3, desc="Generating rewritten content...")
-        time.sleep(0.2)
-
-        progress(0.6, desc="Calculating metrics...")
-        result = evaluate_text(input_text, model_provider, prompt_template)
+        time.sleep(0.1)

-        progress(0.
-        time.sleep(0.
+        progress(0.3, desc="Generating rewritten content using prompt...")
+        time.sleep(0.1)

-        #
-
-            ["Metric", "Raw Score", "Normalized"],
-            ["AnswerRelevancy", f"{result['metrics']['AnswerRelevancy']:.4f}", f"{result['normalized']['AnswerRelevancy']:.4f}"],
-            ["Faithfulness", f"{result['metrics']['Faithfulness']:.4f}", f"{result['normalized']['Faithfulness']:.4f}"],
-            ["GEval", f"{result['metrics']['GEval']:.4f}", f"{result['normalized']['GEval']:.4f}"],
-            ["BERTScore", f"{result['metrics']['BERTScore']:.4f}", f"{result['normalized']['BERTScore']:.4f}"],
-            ["ROUGE", f"{result['metrics']['ROUGE']:.4f}", f"{result['normalized']['ROUGE']:.4f}"],
-            ["BLEU", f"{result['metrics']['BLEU']:.4f}", f"{result['normalized']['BLEU']:.4f}"],
-            ["METEOR", f"{result['metrics']['METEOR']:.4f}", f"{result['normalized']['METEOR']:.4f}"],
-            ["Weighted Score", f"{result['weighted_score']:.4f}", "N/A"]
-        ]
+        # Get prompt template
+        prompt_template = PROMPT_TEMPLATES[prompt_choice]

-
-
-
-
-        )
-
-    # Process file upload
-    if file_upload:
-        progress(0.1, desc="Reading file...")
-        time.sleep(0.2)
+        # Generate candidate using the selected model and prompt
+        prompt = prompt_template.replace("{text}", reference_text)
+        candidate = model_provider.generate(prompt)
+        cleaned_candidate = clean_text(candidate)

-
-
-            df = pd.read_csv(file_upload.name)
-            progress(0.3, desc="Processing entries...")
-            time.sleep(0.2)
-        except Exception as e:
-            return f"Error reading file: {str(e)}", [], ""
+        progress(0.6, desc="Calculating metrics...")
+        time.sleep(0.1)

-        #
-
-        results = []
-        detailed_results = []
+        # Evaluate the generated candidate
+        result = evaluate_text(reference_text, cleaned_candidate)

-
-
-            progress((i + 1) / len(df) * 0.6 + 0.3, desc=f"Processing entry {i+1}/{len(df)}")
-            text = str(row[text_column])
-
-            try:
-                result = evaluate_text(text, model_provider, prompt_template)
-
-                # Add to results
-                results.append(result["weighted_score"])
-
-                # Store detailed results
-                detailed_results.append({
-                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
-                    "weighted_score": result["weighted_score"],
-                    "interpretation": result["interpretation"],
-                    "candidate": result["candidate"]
-                })
-            except Exception as e:
-                print(f"Error processing entry {i}: {str(e)}")
-                results.append(0.0)
-                detailed_results.append({
-                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
-                    "weighted_score": 0.0,
-                    "interpretation": "Error processing this entry",
-                    "candidate": ""
-                })
+        progress(0.9, desc="Finalizing results...")
+        time.sleep(0.1)

-
+    else: # "Both Reference and Candidate"
+        progress(0.3, desc="Calculating metrics...")
         time.sleep(0.2)

-        #
-
+        # Evaluate the provided candidate
+        result = evaluate_text(reference_text, candidate_text)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Create metrics table for summary
-            metrics_table = [
-                ["Metric", "Value"],
-                ["Entries Processed", f"{len(results)}"],
-                ["Successful Entries", f"{len(valid_scores)}"],
-                ["Average Score", f"{avg_score:.4f}"],
-                ["Best Score", f"{max_score:.4f}"],
-                ["Worst Score", f"{min_score:.4f}"],
-                ["Overall Assessment", summary]
-            ]
-
-            return (
-                "Batch processing complete. Use the 'Show Details' button to see individual results.",
-                metrics_table,
-                summary_text
-            )
-        else:
-            return (
-                "No successful evaluations. Check your API configuration and input data.",
-                [["Error", "All evaluations failed"]],
-                "Error: No successful evaluations. Check your API configuration and input data."
-            )
-
-def show_detailed_results(input_text, file_upload, model_choice, prompt_choice, progress=gr.Progress()):
-    """Show detailed results for batch processing"""
-    if not file_upload:
-        return "No file uploaded for batch processing."
-
-    progress(0.1, desc="Reading file...")
-    time.sleep(0.1)
-
-    # Read the file
-    df = pd.read_csv(file_upload.name)
-    text_column = df.columns[0]
-
-    progress(0.3, desc="Determining model provider...")
-    time.sleep(0.1)
-
-    # Determine model provider
-    if model_choice == "Gemini":
-        model_provider = GeminiProvider("gemini-1.5-flash-latest")
-    elif model_choice == "Llama-3-70b":
-        model_provider = GroqProvider("llama3-70b-8192")
-    else: # Llama-3-8b
-        model_provider = GroqProvider("llama3-8b-8192")
-
-    progress(0.5, desc="Getting prompt template...")
-    time.sleep(0.1)
-
-    # Get prompt template
-    prompt_template = PROMPT_TEMPLATES[prompt_choice]
-
-    progress(0.7, desc="Processing entries...")
-    time.sleep(0.1)
-
-    # Process each entry
-    results = []
-    for i, row in enumerate(df.iterrows()):
-        _, row = row # Unpack the tuple
-        text = str(row[text_column])
-        try:
-            result = evaluate_text(text, model_provider, prompt_template)
-            results.append({
-                "Input Preview": text[:100] + "..." if len(text) > 100 else text,
-                "Weighted Score": f"{result['weighted_score']:.4f}",
-                "Interpretation": result['interpretation'],
-                "Candidate Text": result['candidate']
-            })
-        except:
-            results.append({
-                "Input Preview": text[:100] + "..." if len(text) > 100 else text,
-                "Weighted Score": "Error",
-                "Interpretation": "Processing error",
-                "Candidate Text": ""
-            })
-        progress(0.7 + (i + 1) / len(df) * 0.3, desc=f"Processing entry {i+1}/{len(df)}")
-
-    progress(1.0, desc="Completed!")
-    return gr.Dataframe(value=pd.DataFrame(results))
+        progress(0.8, desc="Finalizing results...")
+        time.sleep(0.1)
+        cleaned_candidate = clean_text(candidate_text)
+
+    # Format metrics for display
+    metrics_table = [
+        ["Metric", "Raw Score", "Normalized"],
+        ["AnswerRelevancy", f"{result['metrics']['AnswerRelevancy']:.4f}", f"{result['normalized']['AnswerRelevancy']:.4f}"],
+        ["Faithfulness", f"{result['metrics']['Faithfulness']:.4f}", f"{result['normalized']['Faithfulness']:.4f}"],
+        ["GEval", f"{result['metrics']['GEval']:.4f}", f"{result['normalized']['GEval']:.4f}"],
+        ["BERTScore", f"{result['metrics']['BERTScore']:.4f}", f"{result['normalized']['BERTScore']:.4f}"],
+        ["ROUGE", f"{result['metrics']['ROUGE']:.4f}", f"{result['normalized']['ROUGE']:.4f}"],
+        ["BLEU", f"{result['metrics']['BLEU']:.4f}", f"{result['normalized']['BLEU']:.4f}"],
+        ["METEOR", f"{result['metrics']['METEOR']:.4f}", f"{result['normalized']['METEOR']:.4f}"],
+        ["Weighted Score", f"{result['weighted_score']:.4f}", "N/A"]
+    ]
+
+    return (
+        reference_text,
+        result["candidate"],
+        metrics_table,
+        f"Hybrid Score: {result['weighted_score']:.4f} - {result['interpretation']}"
+    )

 # Create Gradio interface
 with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo:
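Note: in the "Reference Only" branch above, the candidate is produced by substituting the reference text into the chosen template with a plain str.replace on the literal token "{text}", not with str.format, so any other braces in a template pass through untouched. A small illustration (the template string here is made up; the real PROMPT_TEMPLATES entries live elsewhere in app.py):

# Hypothetical template in the style of PROMPT_TEMPLATES; the Space's actual templates are not shown in this diff.
template = "Rewrite the following text in a concise, professional tone. Return only the rewritten text.\n\n{text}"

reference_text = "pls fix the report asap, it's full of errors"
prompt = template.replace("{text}", reference_text)  # literal token swap, no str.format
print(prompt)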
@@ -576,17 +431,38 @@ with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 📥 Input Options")
-
-
+
+            input_mode = gr.Radio(
+                ["Reference Only (Generate Candidate)", "Both Reference and Candidate"],
+                label="Input Mode",
+                value="Reference Only (Generate Candidate)",
+                elem_id="input-mode"
+            )
+
+            reference_text = gr.Textbox(
+                label="Reference Text",
                 lines=10,
-                placeholder="Enter text to evaluate...",
-                elem_id="
+                placeholder="Enter reference text to evaluate against...",
+                elem_id="reference-text"
             )
-
-
-
-
-
+
+            # Conditionally show candidate text box
+            with gr.Group(visible=False) as candidate_group:
+                candidate_text = gr.Textbox(
+                    label="Candidate Text",
+                    lines=10,
+                    placeholder="Enter candidate text to evaluate...",
+                    elem_id="candidate-text"
+                )
+
+            # Update visibility of candidate text box based on input mode
+            def update_candidate_visibility(mode):
+                return gr.update(visible=(mode == "Both Reference and Candidate"))
+
+            input_mode.change(
+                fn=update_candidate_visibility,
+                inputs=input_mode,
+                outputs=candidate_group
             )

             gr.Markdown("### ⚙️ Configuration")
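Note: the new input-mode control uses a standard Gradio pattern: wrap the optional fields in a gr.Group(visible=False) and flip its visibility from a .change handler that returns gr.update. A minimal self-contained sketch of just that pattern (assuming a recent Gradio release where gr.update(visible=...) is accepted as an event output):

import gradio as gr

with gr.Blocks() as demo:
    mode = gr.Radio(
        ["Reference Only (Generate Candidate)", "Both Reference and Candidate"],
        value="Reference Only (Generate Candidate)",
        label="Input Mode",
    )
    with gr.Group(visible=False) as optional_group:
        gr.Textbox(label="Candidate Text", lines=4)

    def toggle(selected_mode):
        # Show the group only when the user will paste their own candidate
        return gr.update(visible=(selected_mode == "Both Reference and Candidate"))

    mode.change(fn=toggle, inputs=mode, outputs=optional_group)

if __name__ == "__main__":
    demo.launch()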
@@ -607,12 +483,22 @@ with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo
             submit_btn = gr.Button("Evaluate", variant="primary", size="lg", elem_id="submit-btn")

         with gr.Column(scale=2):
-            gr.Markdown("###
-
-
-
-
-
+            gr.Markdown("### 📄 Text Comparison")
+
+            with gr.Tabs():
+                with gr.TabItem("Reference Text"):
+                    reference_output = gr.Textbox(
+                        label="Reference Text",
+                        lines=8,
+                        elem_id="reference-output"
+                    )
+
+                with gr.TabItem("Candidate Text"):
+                    candidate_output = gr.Textbox(
+                        label="Candidate Text",
+                        lines=8,
+                        elem_id="candidate-output"
+                    )

             gr.Markdown("### 📊 Evaluation Metrics")
             metrics_output = gr.Dataframe(
@@ -626,34 +512,12 @@ with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo
                 label="Summary",
                 elem_id="summary-output"
             )
-
-            detailed_results_btn = gr.Button("Show Detailed Results (Batch)", visible=False)
-            detailed_results = gr.Dataframe(visible=False)
-
-    # Update visibility of detailed results button
-    def update_detailed_results_visibility(file_upload, summary):
-        has_file = file_upload is not None
-        has_batch_results = "Processed" in summary and "entries" in summary
-        return gr.update(visible=has_file and has_batch_results)

     # Event handlers
     submit_btn.click(
         fn=process_input,
-        inputs=[input_text, file_upload, model_choice, prompt_choice],
-        outputs=[candidate_output, metrics_output, summary_output]
-    ).then(
-        fn=update_detailed_results_visibility,
-        inputs=[file_upload, summary_output],
-        outputs=detailed_results_btn
-    )
-
-    detailed_results_btn.click(
-        fn=show_detailed_results,
-        inputs=[input_text, file_upload, model_choice, prompt_choice],
-        outputs=detailed_results
-    ).then(
-        fn=lambda: gr.update(visible=True),
-        outputs=detailed_results
+        inputs=[input_mode, reference_text, candidate_text, model_choice, prompt_choice],
+        outputs=[reference_output, candidate_output, metrics_output, summary_output]
     )

     # Add interpretation guide in an accordion
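Note: this wiring only works because process_input's four-item return (reference text, candidate, metrics table, summary) lines up positionally with the outputs list; Gradio matches returned values to output components by order, not by name. A reduced sketch of that contract, with a stand-in handler instead of the real process_input:

import gradio as gr

def handler(reference: str):
    # Stand-in for process_input: one return value per output component, in the same order
    metrics_table = [["Metric", "Raw Score", "Normalized"]]
    return reference, "candidate text would go here", metrics_table, "summary would go here"

with gr.Blocks() as demo:
    reference_text = gr.Textbox(label="Reference Text")
    submit_btn = gr.Button("Evaluate")
    reference_output = gr.Textbox(label="Reference Text")
    candidate_output = gr.Textbox(label="Candidate Text")
    metrics_output = gr.Dataframe()
    summary_output = gr.Textbox(label="Summary")

    submit_btn.click(
        fn=handler,
        inputs=[reference_text],
        outputs=[reference_output, candidate_output, metrics_output, summary_output],
    )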