Update app.py
app.py CHANGED
@@ -357,8 +357,8 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         "interpretation": interpretation
     }
 
-def process_input(input_text: str, file_upload, model_choice: str, prompt_choice: str) -> Tuple[str, List[List[str]], str]:
-    """Process either input text or uploaded file"""
+def process_input(input_text: str, file_upload, model_choice: str, prompt_choice: str, progress=gr.Progress()) -> Tuple[str, List[List[str]], str]:
+    """Process either input text or uploaded file with progress tracking"""
     if input_text and file_upload:
         return "Please use either text input or file upload, not both.", [], ""
 
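The substantive change in this first hunk is the new `progress=gr.Progress()` default parameter. Gradio treats such a parameter as an injection point: the framework passes a progress tracker into the handler at call time, and calling the tracker with a fraction between 0 and 1 updates the bar shown in the UI. A minimal, self-contained sketch of the pattern (the handler and labels are illustrative, not the app's real ones):

```python
import time

import gradio as gr

def evaluate(text: str, progress=gr.Progress()) -> str:
    # Gradio injects a tracker for any parameter whose default is gr.Progress();
    # calling it with a float in [0, 1] moves the bar in the browser.
    progress(0.2, desc="Scoring...")
    time.sleep(0.5)  # stand-in for real work
    progress(1.0, desc="Done")
    return text.upper()

demo = gr.Interface(fn=evaluate, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.queue().launch()  # queuing lets progress updates stream to the client
```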
@@ -382,147 +382,151 @@ def process_input(input_text: str, file_upload, model_choice: str, prompt_choice
 
     # Process single text input
     if input_text:
-        …
-        )
+        progress(0.1, desc="Starting evaluation...")
+        time.sleep(0.2)
+
+        progress(0.3, desc="Generating rewritten content...")
+        time.sleep(0.2)
+
+        progress(0.6, desc="Calculating metrics...")
+        result = evaluate_text(input_text, model_provider, prompt_template)
+
+        progress(0.9, desc="Finalizing results...")
+        time.sleep(0.2)
+
+        # Format metrics for display
+        metrics_table = [
+            ["Metric", "Raw Score", "Normalized"],
+            ["AnswerRelevancy", f"{result['metrics']['AnswerRelevancy']:.4f}", f"{result['normalized']['AnswerRelevancy']:.4f}"],
+            ["Faithfulness", f"{result['metrics']['Faithfulness']:.4f}", f"{result['normalized']['Faithfulness']:.4f}"],
+            ["GEval", f"{result['metrics']['GEval']:.4f}", f"{result['normalized']['GEval']:.4f}"],
+            ["BERTScore", f"{result['metrics']['BERTScore']:.4f}", f"{result['normalized']['BERTScore']:.4f}"],
+            ["ROUGE", f"{result['metrics']['ROUGE']:.4f}", f"{result['normalized']['ROUGE']:.4f}"],
+            ["BLEU", f"{result['metrics']['BLEU']:.4f}", f"{result['normalized']['BLEU']:.4f}"],
+            ["METEOR", f"{result['metrics']['METEOR']:.4f}", f"{result['normalized']['METEOR']:.4f}"],
+            ["Weighted Score", f"{result['weighted_score']:.4f}", "N/A"]
+        ]
+
+        return (
+            result["candidate"],
+            metrics_table,
+            f"Hybrid Score: {result['weighted_score']:.4f} - {result['interpretation']}"
+        )
 
     # Process file upload
     if file_upload:
-        …
-        time.sleep(0.2)
-        # Read the file (assuming CSV with one column of text)
-        try:
-            …
-        except Exception as e:
-            …
-                # Add to results
-                results.append(result["weighted_score"])
-
-                # Store detailed results
-                detailed_results.append({
-                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
-                    "weighted_score": result["weighted_score"],
-                    "interpretation": result["interpretation"],
-                    "candidate": result["candidate"]
-                })
-            except Exception as e:
-                print(f"Error processing entry {i}: {str(e)}")
-                results.append(0.0)
-                detailed_results.append({
-                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
-                    "weighted_score": 0.0,
-                    "interpretation": "Error processing this entry",
-                    "candidate": ""
-                })
-
-        # Create
-        …
-            elif avg_score >= 0.50:
-                summary = "Adequate performance but needs refinement"
-            else:
-                summary = "Significant improvements needed"
-
-            # Format summary
-            summary_text = (
-                f"Processed {len(results)} entries ({len(valid_scores)} successful)\n"
-                f"Average Hybrid Score: {avg_score:.4f}\n"
-                f"Range: {min_score:.4f} - {max_score:.4f}\n\n"
-                f"{summary}"
-            )
-
-            # Create metrics table for summary
-            metrics_table = [
-                ["Metric", "Value"],
-                ["Entries Processed", f"{len(results)}"],
-                ["Successful Entries", f"{len(valid_scores)}"],
-                ["Average Score", f"{avg_score:.4f}"],
-                ["Best Score", f"{max_score:.4f}"],
-                ["Worst Score", f"{min_score:.4f}"],
-                ["Overall Assessment", summary]
-            ]
-
-            return (
-                "Batch processing complete. Use the 'Show Details' button to see individual results.",
-                metrics_table,
-                summary_text
-            )
-        else:
-            return (
-                "No successful evaluations. Check your API configuration and input data.",
-                [["Error", "All evaluations failed"]],
-                "Error: No successful evaluations. Check your API configuration and input data."
-            )
+        progress(0.1, desc="Reading file...")
+        time.sleep(0.2)
+
+        # Read the file (assuming CSV with one column of text)
+        try:
+            df = pd.read_csv(file_upload.name)
+            progress(0.3, desc="Processing entries...")
+            time.sleep(0.2)
+        except Exception as e:
+            return f"Error reading file: {str(e)}", [], ""
+
+        # Assuming the first column contains the text
+        text_column = df.columns[0]
+        results = []
+        detailed_results = []
+
+        # Process each entry with progress updates
+        for i, row in df.iterrows():
+            progress((i + 1) / len(df) * 0.6 + 0.3, desc=f"Processing entry {i+1}/{len(df)}")
+            text = str(row[text_column])
+
+            try:
+                result = evaluate_text(text, model_provider, prompt_template)
+
+                # Add to results
+                results.append(result["weighted_score"])
+
+                # Store detailed results
+                detailed_results.append({
+                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
+                    "weighted_score": result["weighted_score"],
+                    "interpretation": result["interpretation"],
+                    "candidate": result["candidate"]
+                })
+            except Exception as e:
+                print(f"Error processing entry {i}: {str(e)}")
+                results.append(0.0)
+                detailed_results.append({
+                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
+                    "weighted_score": 0.0,
+                    "interpretation": "Error processing this entry",
+                    "candidate": ""
+                })
+
+        progress(0.9, desc="Generating summary...")
+        time.sleep(0.2)
+
+        # Create results dataframe
+        results_df = pd.DataFrame(detailed_results)
+
+        # Generate summary statistics
+        valid_scores = [s for s in results if s > 0]
+        if valid_scores:
+            avg_score = sum(valid_scores) / len(valid_scores)
+            min_score = min(valid_scores)
+            max_score = max(valid_scores)
+
+            if avg_score >= 0.85:
+                summary = "Excellent performance across inputs"
+            elif avg_score >= 0.70:
+                summary = "Good performance with room for minor improvements"
+            elif avg_score >= 0.50:
+                summary = "Adequate performance but needs refinement"
+            else:
+                summary = "Significant improvements needed"
+
+            # Format summary
+            summary_text = (
+                f"Processed {len(results)} entries ({len(valid_scores)} successful)\n"
+                f"Average Hybrid Score: {avg_score:.4f}\n"
+                f"Range: {min_score:.4f} - {max_score:.4f}\n\n"
+                f"{summary}"
+            )
+
+            # Create metrics table for summary
+            metrics_table = [
+                ["Metric", "Value"],
+                ["Entries Processed", f"{len(results)}"],
+                ["Successful Entries", f"{len(valid_scores)}"],
+                ["Average Score", f"{avg_score:.4f}"],
+                ["Best Score", f"{max_score:.4f}"],
+                ["Worst Score", f"{min_score:.4f}"],
+                ["Overall Assessment", summary]
+            ]
+
+            return (
+                "Batch processing complete. Use the 'Show Details' button to see individual results.",
+                metrics_table,
+                summary_text
+            )
+        else:
+            return (
+                "No successful evaluations. Check your API configuration and input data.",
+                [["Error", "All evaluations failed"]],
+                "Error: No successful evaluations. Check your API configuration and input data."
+            )
 
-def show_detailed_results(input_text, file_upload, model_choice, prompt_choice):
+def show_detailed_results(input_text, file_upload, model_choice, prompt_choice, progress=gr.Progress()):
     """Show detailed results for batch processing"""
     if not file_upload:
         return "No file uploaded for batch processing."
 
+    progress(0.1, desc="Reading file...")
+    time.sleep(0.1)
+
     # Read the file
     df = pd.read_csv(file_upload.name)
     text_column = df.columns[0]
 
+    progress(0.3, desc="Determining model provider...")
+    time.sleep(0.1)
+
     # Determine model provider
     if model_choice == "Gemini":
         model_provider = GeminiProvider("gemini-1.5-flash-latest")
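In the batch branch above, `(i + 1) / len(df) * 0.6 + 0.3` maps per-row progress onto the 0.3 to 0.9 band of the bar, reserving the ends for file reading and summary generation. Gradio's tracker can also wrap an iterable via `progress.tqdm(...)`, which replaces that manual arithmetic; a sketch under that assumption (illustrative handler, not the app's code):

```python
import gradio as gr
import pandas as pd

def score_rows(file_upload, progress=gr.Progress()):
    df = pd.read_csv(file_upload.name)
    scores = []
    # progress.tqdm advances the bar once per yielded item, tqdm-style.
    for _, row in progress.tqdm(df.iterrows(), total=len(df), desc="Processing entries"):
        scores.append(len(str(row.iloc[0])))  # stand-in for evaluate_text
    return f"Processed {len(scores)} rows"
```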
@@ -531,12 +535,19 @@ def show_detailed_results(input_text, file_upload, model_choice, prompt_choice):
     else: # Llama-3-8b
         model_provider = GroqProvider("llama3-8b-8192")
 
+    progress(0.5, desc="Getting prompt template...")
+    time.sleep(0.1)
+
     # Get prompt template
     prompt_template = PROMPT_TEMPLATES[prompt_choice]
 
+    progress(0.7, desc="Processing entries...")
+    time.sleep(0.1)
+
     # Process each entry
     results = []
-    for …
+    for i, row in enumerate(df.iterrows()):
+        _, row = row  # Unpack the tuple
         text = str(row[text_column])
         try:
             result = evaluate_text(text, model_provider, prompt_template)
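The loop rewritten in this hunk unpacks the `(index, Series)` pair from `df.iterrows()` in a separate `_, row = row` step. Nested tuple unpacking gives the same 0-based counter `i` for the progress messages in one line (plain pandas, independent of the app):

```python
import pandas as pd

df = pd.DataFrame({"text": ["first entry", "second entry"]})

# enumerate(df.iterrows()) yields (i, (label, row_series));
# unpacking the inner pair inline avoids rebinding row afterwards.
for i, (_, row) in enumerate(df.iterrows()):
    print(i, str(row["text"]))
```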
@@ -553,7 +564,9 @@ def show_detailed_results(input_text, file_upload, model_choice, prompt_choice):
                 "Interpretation": "Processing error",
                 "Candidate Text": ""
             })
+        progress(0.7 + (i + 1) / len(df) * 0.3, desc=f"Processing entry {i+1}/{len(df)}")
 
+    progress(1.0, desc="Completed!")
     return gr.Dataframe(value=pd.DataFrame(results))
 
 # Create Gradio interface
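Throughout the diff the UI reports a single hybrid score, but the diff never shows how `evaluate_text` combines the seven metrics. The weights below are therefore hypothetical; this sketch only illustrates the weighted-average shape such a score typically has:

```python
# Hypothetical weights; the app's real ones are defined elsewhere in app.py
# and are not visible in this diff.
WEIGHTS = {
    "AnswerRelevancy": 0.20, "Faithfulness": 0.20, "GEval": 0.20,
    "BERTScore": 0.15, "METEOR": 0.10, "ROUGE": 0.10, "BLEU": 0.05,
}

def weighted_score(normalized: dict) -> float:
    """Combine normalized metric scores (each in [0, 1]) into one hybrid score."""
    return sum(WEIGHTS[name] * normalized[name] for name in WEIGHTS)

print(f"{weighted_score({name: 0.8 for name in WEIGHTS}):.4f}")  # -> 0.8000
```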
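The diff ends just before the interface definition. For orientation, a minimal Blocks wiring that would exercise both handlers; the component and button names are illustrative, and `PROMPT_TEMPLATES`, `process_input`, and `show_detailed_results` are the functions defined above:

```python
import gradio as gr

with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Input text")
    file_upload = gr.File(label="CSV upload")
    model_choice = gr.Dropdown(["Gemini", "Llama-3-8b"], label="Model")
    prompt_choice = gr.Dropdown(list(PROMPT_TEMPLATES), label="Prompt")
    output = gr.Textbox(label="Candidate / status")
    metrics = gr.Dataframe(label="Metrics")
    score = gr.Textbox(label="Hybrid score")

    gr.Button("Evaluate").click(
        process_input,
        inputs=[input_text, file_upload, model_choice, prompt_choice],
        outputs=[output, metrics, score],
    )
    gr.Button("Show Details").click(
        show_detailed_results,
        inputs=[input_text, file_upload, model_choice, prompt_choice],
        outputs=metrics,
    )

demo.queue().launch()  # gr.Progress parameters are injected, not listed as inputs
```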
|