import gradio as gr
import pandas as pd
import numpy as np
import re
import unicodedata
import ftfy
import nltk
import os
import json
import time
from typing import Dict, Any, List, Tuple, Optional
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
import google.generativeai as genai
from groq import Groq
from dotenv import load_dotenv
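# Rough mapping from the imports above to PyPI packages (an assumed, untested
# reference list only -- pin versions to match your own deployment):
#   gradio, pandas, numpy, ftfy, nltk, rouge-score, bert-score,
#   google-generativeai, groq, python-dotenv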
# Download necessary NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

# Load environment variables
load_dotenv()
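# Illustrative .env contents read by load_dotenv() (placeholder values, not real keys):
#   GEMINI_API_KEY=your-gemini-api-key
#   GROQ_API_KEY=your-groq-api-key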
# Initialize API clients (with graceful fallback if keys are missing)
try:
    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
    if GEMINI_API_KEY:
        genai.configure(api_key=GEMINI_API_KEY)
    else:
        print("Warning: GEMINI_API_KEY not found in environment variables")
except Exception as e:
    print(f"Error configuring Gemini: {str(e)}")
    GEMINI_API_KEY = None

try:
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
    if GROQ_API_KEY:
        groq_client = Groq(api_key=GROQ_API_KEY)
    else:
        print("Warning: GROQ_API_KEY not found in environment variables")
        groq_client = None
except Exception as e:
    print(f"Error configuring Groq: {str(e)}")
    groq_client = None
# Text cleaning function
def clean_text(text: str) -> str:
    """Clean text by fixing encoding issues and standardizing the format."""
    if not isinstance(text, str) or not text.strip():
        return ""
    text = ftfy.fix_text(text)  # Fixes encoding artifacts (mojibake)
    text = unicodedata.normalize('NFKD', text)
    # Replace common smart quotes, dashes and similar punctuation with ASCII equivalents
    replacements = {
        '\u201c': '"', '\u201d': '"', '\u2013': '-', '\u2014': '--',
        '\u2022': '*', '\u2026': '...', '\u00c2': ''
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    # Remove any remaining non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Normalize whitespace
    return ' '.join(text.split())
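# Illustrative example of the cleaning pipeline above (the exact output depends
# on the replacement table, so treat this as a sketch rather than a guarantee):
#   clean_text('The  \u201csmart\u201d   quotes')  ->  'The "smart" quotes'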
# LLM provider classes
class LLMProvider:
    def __init__(self, model_name: str):
        self.model_name = model_name

    def generate(self, prompt: str) -> str:
        raise NotImplementedError

    def get_model_name(self) -> str:
        return self.model_name


class GeminiProvider(LLMProvider):
    def __init__(self, model_name: str = "gemini-1.5-flash-latest"):
        super().__init__(model_name)
        self.available = bool(GEMINI_API_KEY)
        if self.available:
            try:
                self.model = genai.GenerativeModel(model_name)
            except Exception as e:
                print(f"Error initializing Gemini model: {str(e)}")
                self.available = False

    def generate(self, prompt: str) -> str:
        if not self.available:
            return "Error: Gemini API not configured properly. Check your API key."
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Error generating with Gemini: {str(e)}"


class GroqProvider(LLMProvider):
    def __init__(self, model_name: str = "llama3-70b-8192"):
        super().__init__(model_name)
        self.available = bool(groq_client)

    def generate(self, prompt: str) -> str:
        if not self.available:
            return "Error: Groq API not configured properly. Check your API key."
        try:
            chat_completion = groq_client.chat.completions.create(
                messages=[
                    {"role": "user", "content": prompt}
                ],
                model=self.model_name,
                temperature=0.3
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            return f"Error generating with Groq: {str(e)}"
# Prompt templates
PROMPT_TEMPLATES = {
    "Strategic Narrative Architect": """Role: Strategic Narrative Architect
You are a professional content writer with expertise in creating engaging, well-structured narratives.
Your task is to rewrite the following text in a professional, engaging style while preserving all key facts and information:
{text}
Instructions:
1. Maintain all factual information and key details
2. Improve structure and flow for better readability
3. Enhance engagement through appropriate storytelling techniques
4. Use professional language appropriate for the content domain
5. Ensure the output is concise yet comprehensive
6. Begin directly with the content - do NOT include introductory phrases like "Here's a rewritten version" or "Rewritten content"
7. Write as if this is the final published version, not as a response to a rewrite request
Output:""",
    "Precision Storyteller": """Role: Precision Storyteller
You are a professional editor focused on accuracy, clarity, and precision.
Your task is to rewrite the following text with maximum factual accuracy while improving clarity:
{text}
Instructions:
1. Preserve all factual information with absolute precision
2. Correct any grammatical errors or awkward phrasing
3. Ensure logical flow and coherence
4. Use clear, concise language without unnecessary embellishment
5. Maintain professional tone appropriate for the content domain
6. Begin directly with the content - do NOT include introductory phrases like "Here's a rewritten version" or "Rewritten content"
7. Write as if this is the final published version, not as a response to a rewrite request
Output:"""
}
# Metric normalization ranges (min, max)
NORMALIZATION_RANGES = {
    "AnswerRelevancy": (0.0, 1.0),
    "Faithfulness": (0.0, 1.0),
    "GEval": (0.0, 1.0),
    "BERTScore": (0.7, 0.95),
    "ROUGE": (0.0, 0.6),
    "BLEU": (0.0, 0.4),
    "METEOR": (0.0, 0.6)
}

# Metric weights
METRIC_WEIGHTS = {
    "AnswerRelevancy": 0.10,
    "Faithfulness": 0.10,
    "GEval": 0.025,
    "BERTScore": 0.20,
    "ROUGE": 0.15,
    "BLEU": 0.025,
    "METEOR": 0.15
}
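# Note: these weights sum to 0.75 rather than 1.0. calculate_weighted_score()
# below divides by the total weight of the metrics actually present, so the
# final hybrid score still lands on a 0-1 scale.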
def normalize_score(metric: str, value: float) -> float:
    """Normalize a score to a 0-1 scale based on the metric's natural range."""
    if metric not in NORMALIZATION_RANGES or not isinstance(value, (int, float)):
        return value
    min_val, max_val = NORMALIZATION_RANGES[metric]
    # Handle edge cases
    if max_val <= min_val:
        return 0.5  # Default middle value if the range is invalid
    # Normalize and clamp to [0, 1]
    normalized = (value - min_val) / (max_val - min_val)
    return max(0.0, min(normalized, 1.0))
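# Worked example: BERTScore is normalized over its (0.7, 0.95) range, so
#   normalize_score("BERTScore", 0.85) -> (0.85 - 0.7) / (0.95 - 0.7) ~= 0.6
# and any value outside the range is clamped to [0.0, 1.0].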
def calculate_weighted_score(scores: Dict[str, float]) -> float:
    """Calculate the weighted average of normalized scores."""
    normalized_scores = {m: normalize_score(m, v) for m, v in scores.items()}
    total_weight = 0
    weighted_sum = 0
    for metric, weight in METRIC_WEIGHTS.items():
        if metric in normalized_scores:
            weighted_sum += normalized_scores[metric] * weight
            total_weight += weight
    return weighted_sum / total_weight if total_weight > 0 else 0
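# Worked example with a partial score dict (only metrics that are present count):
#   calculate_weighted_score({"BERTScore": 0.85, "ROUGE": 0.3})
#   -> (0.6 * 0.20 + 0.5 * 0.15) / (0.20 + 0.15) ~= 0.557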
def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template: str) -> Dict[str, Any]:
    """Evaluate a single text using the selected model and prompt."""
    # Create a clean reference text
    reference_text = clean_text(raw_input)

    # Generate a candidate using the selected model and prompt
    prompt = prompt_template.replace("{text}", raw_input)
    candidate = model_provider.generate(prompt)

    # Clean the candidate output for consistent evaluation
    cleaned_candidate = clean_text(candidate)

    # Initialize evaluation metrics
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Calculate traditional metrics
    results = {}

    # BLEU score
    try:
        smooth = SmoothingFunction().method4
        bleu = sentence_bleu(
            [reference_text.split()],
            cleaned_candidate.split(),
            smoothing_function=smooth
        )
        results["BLEU"] = bleu
    except Exception as e:
        print(f"BLEU error: {str(e)}")
        results["BLEU"] = 0.0

    # ROUGE score
    try:
        rouge_scores = scorer.score(reference_text, cleaned_candidate)
        rouge = (rouge_scores['rouge1'].fmeasure +
                 rouge_scores['rouge2'].fmeasure +
                 rouge_scores['rougeL'].fmeasure) / 3
        results["ROUGE"] = rouge
    except Exception as e:
        print(f"ROUGE error: {str(e)}")
        results["ROUGE"] = 0.0

    # METEOR score
    try:
        meteor = meteor_score(
            [reference_text.split()],
            cleaned_candidate.split()
        )
        results["METEOR"] = meteor
    except Exception as e:
        print(f"METEOR error: {str(e)}")
        results["METEOR"] = 0.0

    # BERTScore
    try:
        P, R, F1 = bert_score(
            [cleaned_candidate],
            [reference_text],
            lang="en",
            verbose=False
        )
        results["BERTScore"] = F1.item()
    except Exception as e:
        print(f"BERTScore error: {str(e)}")
        results["BERTScore"] = 0.7  # Default low value
    # LLM-as-judge metrics - simplified implementation since DeepEval might not be available
    try:
        # Use Gemini as the judge if available
        if GEMINI_API_KEY:
            judge_model = GeminiProvider("gemini-1.5-flash-latest")

            # Answer relevancy
            relevancy_prompt = f"""
            On a scale of 0.0 to 1.0, how relevant is the following candidate text to the input?
            Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
            Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
            Provide only a single number between 0.0 and 1.0 with no explanation.
            """
            relevancy_response = judge_model.generate(relevancy_prompt)
            try:
                relevancy_score = float(relevancy_response.strip())
                results["AnswerRelevancy"] = max(0.0, min(1.0, relevancy_score))
            except (ValueError, TypeError):
                results["AnswerRelevancy"] = 0.5

            # Faithfulness
            faithfulness_prompt = f"""
            On a scale of 0.0 to 1.0, how faithful is the candidate text to the original input in terms of factual accuracy?
            Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
            Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
            Provide only a single number between 0.0 and 1.0 with no explanation.
            """
            faithfulness_response = judge_model.generate(faithfulness_prompt)
            try:
                faithfulness_score = float(faithfulness_response.strip())
                results["Faithfulness"] = max(0.0, min(1.0, faithfulness_score))
            except (ValueError, TypeError):
                results["Faithfulness"] = 0.5

            # GEval-style overall quality
            geval_prompt = f"""
            On a scale of 0.0 to 1.0, evaluate the overall quality of the candidate text.
            Consider accuracy, completeness, fluency, and professionalism.
            Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
            Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
            Provide only a single number between 0.0 and 1.0 with no explanation.
            """
            geval_response = judge_model.generate(geval_prompt)
            try:
                geval_score = float(geval_response.strip())
                results["GEval"] = max(0.0, min(1.0, geval_score))
            except (ValueError, TypeError):
                results["GEval"] = 0.5
        else:
            # Default values if no judge model is available
            results["AnswerRelevancy"] = 0.5
            results["Faithfulness"] = 0.5
            results["GEval"] = 0.5
    except Exception as e:
        print(f"LLM-as-judge error: {str(e)}")
        # Default values if the LLM judge fails
        results["AnswerRelevancy"] = 0.5
        results["Faithfulness"] = 0.5
        results["GEval"] = 0.5
    # Calculate normalized and weighted scores
    normalized_scores = {m: normalize_score(m, v) for m, v in results.items()}
    weighted_score = calculate_weighted_score(results)

    # Determine the interpretation
    if weighted_score >= 0.85:
        interpretation = "Outstanding performance (A) - ready for professional use"
    elif weighted_score >= 0.70:
        interpretation = "Strong performance (B) - good quality with minor improvements"
    elif weighted_score >= 0.50:
        interpretation = "Adequate performance (C) - usable but needs refinement"
    elif weighted_score >= 0.30:
        interpretation = "Weak performance (D) - requires significant revision"
    else:
        interpretation = "Poor performance (F) - likely needs complete rewriting"

    return {
        "candidate": cleaned_candidate,
        "metrics": results,
        "normalized": normalized_scores,
        "weighted_score": weighted_score,
        "interpretation": interpretation
    }
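# Minimal standalone usage sketch (assumes GEMINI_API_KEY is configured; the
# exact scores will vary from run to run):
#   provider = GeminiProvider()
#   report = evaluate_text("Some raw text to rewrite.", provider,
#                          PROMPT_TEMPLATES["Precision Storyteller"])
#   print(report["weighted_score"], report["interpretation"])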
def process_input(input_text: str, file_upload, model_choice: str, prompt_choice: str, progress=gr.Progress()) -> Tuple[str, List[List[str]], str]:
    """Process either input text or an uploaded file, with progress tracking."""
    if input_text and file_upload:
        return "Please use either text input or file upload, not both.", [], ""
    if not input_text and not file_upload:
        return "Please provide input text or upload a file.", [], ""

    # Determine the model provider
    if model_choice == "Gemini":
        model_provider = GeminiProvider("gemini-1.5-flash-latest")
    elif model_choice == "Llama-3-70b":
        model_provider = GroqProvider("llama3-70b-8192")
    else:  # Llama-3-8b
        model_provider = GroqProvider("llama3-8b-8192")

    # Check whether the model is available
    if not model_provider.available:
        return f"Error: {model_choice} is not properly configured. Check your API key.", [], ""

    # Get the prompt template
    prompt_template = PROMPT_TEMPLATES[prompt_choice]

    # Process a single text input
    if input_text:
        progress(0.1, desc="Starting evaluation...")
        time.sleep(0.2)
        progress(0.3, desc="Generating rewritten content...")
        time.sleep(0.2)
        progress(0.6, desc="Calculating metrics...")
        result = evaluate_text(input_text, model_provider, prompt_template)
        progress(0.9, desc="Finalizing results...")
        time.sleep(0.2)

        # Format metrics for display
        metrics_table = [
            ["Metric", "Raw Score", "Normalized"],
            ["AnswerRelevancy", f"{result['metrics']['AnswerRelevancy']:.4f}", f"{result['normalized']['AnswerRelevancy']:.4f}"],
            ["Faithfulness", f"{result['metrics']['Faithfulness']:.4f}", f"{result['normalized']['Faithfulness']:.4f}"],
            ["GEval", f"{result['metrics']['GEval']:.4f}", f"{result['normalized']['GEval']:.4f}"],
            ["BERTScore", f"{result['metrics']['BERTScore']:.4f}", f"{result['normalized']['BERTScore']:.4f}"],
            ["ROUGE", f"{result['metrics']['ROUGE']:.4f}", f"{result['normalized']['ROUGE']:.4f}"],
            ["BLEU", f"{result['metrics']['BLEU']:.4f}", f"{result['normalized']['BLEU']:.4f}"],
            ["METEOR", f"{result['metrics']['METEOR']:.4f}", f"{result['normalized']['METEOR']:.4f}"],
            ["Weighted Score", f"{result['weighted_score']:.4f}", "N/A"]
        ]

        return (
            result["candidate"],
            metrics_table,
            f"Hybrid Score: {result['weighted_score']:.4f} - {result['interpretation']}"
        )

    # Process a file upload
    if file_upload:
        progress(0.1, desc="Reading file...")
        time.sleep(0.2)

        # Read the file (assuming a CSV with one column of text)
        try:
            df = pd.read_csv(file_upload.name)
            progress(0.3, desc="Processing entries...")
            time.sleep(0.2)
        except Exception as e:
            return f"Error reading file: {str(e)}", [], ""

        # Assume the first column contains the text
        text_column = df.columns[0]
        results = []
        detailed_results = []

        # Process each entry with progress updates
        for i, row in df.iterrows():
            progress((i + 1) / len(df) * 0.6 + 0.3, desc=f"Processing entry {i+1}/{len(df)}")
            text = str(row[text_column])
            try:
                result = evaluate_text(text, model_provider, prompt_template)
                # Add to results
                results.append(result["weighted_score"])
                # Store detailed results
                detailed_results.append({
                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
                    "weighted_score": result["weighted_score"],
                    "interpretation": result["interpretation"],
                    "candidate": result["candidate"]
                })
            except Exception as e:
                print(f"Error processing entry {i}: {str(e)}")
                results.append(0.0)
                detailed_results.append({
                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
                    "weighted_score": 0.0,
                    "interpretation": "Error processing this entry",
                    "candidate": ""
                })

        progress(0.9, desc="Generating summary...")
        time.sleep(0.2)

        # Create a results dataframe
        results_df = pd.DataFrame(detailed_results)

        # Generate summary statistics
        valid_scores = [s for s in results if s > 0]
        if valid_scores:
            avg_score = sum(valid_scores) / len(valid_scores)
            min_score = min(valid_scores)
            max_score = max(valid_scores)

            if avg_score >= 0.85:
                summary = "Excellent performance across inputs"
            elif avg_score >= 0.70:
                summary = "Good performance with room for minor improvements"
            elif avg_score >= 0.50:
                summary = "Adequate performance but needs refinement"
            else:
                summary = "Significant improvements needed"

            # Format the summary
            summary_text = (
                f"Processed {len(results)} entries ({len(valid_scores)} successful)\n"
                f"Average Hybrid Score: {avg_score:.4f}\n"
                f"Range: {min_score:.4f} - {max_score:.4f}\n\n"
                f"{summary}"
            )

            # Create a metrics table for the summary
            metrics_table = [
                ["Metric", "Value"],
                ["Entries Processed", f"{len(results)}"],
                ["Successful Entries", f"{len(valid_scores)}"],
                ["Average Score", f"{avg_score:.4f}"],
                ["Best Score", f"{max_score:.4f}"],
                ["Worst Score", f"{min_score:.4f}"],
                ["Overall Assessment", summary]
            ]

            return (
                "Batch processing complete. Use the 'Show Details' button to see individual results.",
                metrics_table,
                summary_text
            )
        else:
            return (
                "No successful evaluations. Check your API configuration and input data.",
                [["Error", "All evaluations failed"]],
                "Error: No successful evaluations. Check your API configuration and input data."
            )
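# The batch path above reads the first CSV column as raw text. A hypothetical
# input file could look like:
#   text
#   "First article to rewrite..."
#   "Second article to rewrite..."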
def show_detailed_results(input_text, file_upload, model_choice, prompt_choice, progress=gr.Progress()):
    """Show detailed per-entry results for batch processing."""
    if not file_upload:
        return "No file uploaded for batch processing."

    progress(0.1, desc="Reading file...")
    time.sleep(0.1)
    # Read the file
    df = pd.read_csv(file_upload.name)
    text_column = df.columns[0]

    progress(0.3, desc="Determining model provider...")
    time.sleep(0.1)
    # Determine the model provider
    if model_choice == "Gemini":
        model_provider = GeminiProvider("gemini-1.5-flash-latest")
    elif model_choice == "Llama-3-70b":
        model_provider = GroqProvider("llama3-70b-8192")
    else:  # Llama-3-8b
        model_provider = GroqProvider("llama3-8b-8192")

    progress(0.5, desc="Getting prompt template...")
    time.sleep(0.1)
    # Get the prompt template
    prompt_template = PROMPT_TEMPLATES[prompt_choice]

    progress(0.7, desc="Processing entries...")
    time.sleep(0.1)
    # Process each entry
    results = []
    for i, (_, row) in enumerate(df.iterrows()):
        text = str(row[text_column])
        try:
            result = evaluate_text(text, model_provider, prompt_template)
            results.append({
                "Input Preview": text[:100] + "..." if len(text) > 100 else text,
                "Weighted Score": f"{result['weighted_score']:.4f}",
                "Interpretation": result['interpretation'],
                "Candidate Text": result['candidate']
            })
        except Exception as e:
            print(f"Error processing entry {i}: {str(e)}")
            results.append({
                "Input Preview": text[:100] + "..." if len(text) > 100 else text,
                "Weighted Score": "Error",
                "Interpretation": "Processing error",
                "Candidate Text": ""
            })
        progress(0.7 + (i + 1) / len(df) * 0.3, desc=f"Processing entry {i+1}/{len(df)}")

    progress(1.0, desc="Completed!")
    return gr.Dataframe(value=pd.DataFrame(results))
# Create the Gradio interface
with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# LLM Evaluation Framework for Professional Content Rewriting")
    gr.Markdown("Evaluate the quality of LLM-generated content using multiple metrics with proper normalization.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Input Options")
            input_text = gr.Textbox(
                label="Input Text",
                lines=10,
                placeholder="Enter text to evaluate...",
                elem_id="input-text"
            )
            gr.Markdown("or")
            file_upload = gr.File(
                label="Upload CSV file (single column of text)",
                file_types=[".csv", ".txt"],
                elem_id="file-upload"
            )
            gr.Markdown("### Configuration")
            model_choice = gr.Radio(
                ["Gemini", "Llama-3-70b", "Llama-3-8b"],
                label="Select Model",
                value="Gemini",
                elem_id="model-choice"
            )
            prompt_choice = gr.Radio(
                ["Strategic Narrative Architect", "Precision Storyteller"],
                label="Select Prompt Template",
                value="Strategic Narrative Architect",
                elem_id="prompt-choice"
            )
            submit_btn = gr.Button("Evaluate", variant="primary", size="lg", elem_id="submit-btn")

        with gr.Column(scale=2):
            gr.Markdown("### Rewritten Content")
            candidate_output = gr.Textbox(
                label="Rewritten Content",
                lines=15,
                elem_id="candidate-output"
            )
            gr.Markdown("### Evaluation Metrics")
            metrics_output = gr.Dataframe(
                label="Evaluation Metrics",
                interactive=False,
                elem_id="metrics-output"
            )
            gr.Markdown("### Overall Assessment")
            summary_output = gr.Textbox(
                label="Summary",
                elem_id="summary-output"
            )
            detailed_results_btn = gr.Button("Show Detailed Results (Batch)", visible=False)
            detailed_results = gr.Dataframe(visible=False)

    # Update visibility of the detailed-results button after each evaluation
    def update_detailed_results_visibility(file_upload, summary):
        has_file = file_upload is not None
        has_batch_results = "Processed" in summary and "entries" in summary
        return gr.update(visible=has_file and has_batch_results)

    # Event handlers
    submit_btn.click(
        fn=process_input,
        inputs=[input_text, file_upload, model_choice, prompt_choice],
        outputs=[candidate_output, metrics_output, summary_output]
    ).then(
        fn=update_detailed_results_visibility,
        inputs=[file_upload, summary_output],
        outputs=detailed_results_btn
    )

    detailed_results_btn.click(
        fn=show_detailed_results,
        inputs=[input_text, file_upload, model_choice, prompt_choice],
        outputs=detailed_results
    ).then(
        fn=lambda: gr.update(visible=True),
        outputs=detailed_results
    )

    # Interpretation guide in an accordion
    with gr.Accordion("Interpretation Guide", open=False):
        gr.Markdown("""
### Hybrid Score Interpretation
The Hybrid Score combines multiple evaluation metrics into a single score with proper normalization:
- **0.85+**: Outstanding performance (A) - ready for professional use
- **0.70-0.85**: Strong performance (B) - good quality with minor improvements
- **0.50-0.70**: Adequate performance (C) - usable but needs refinement
- **0.30-0.50**: Weak performance (D) - requires significant revision
- **<0.30**: Poor performance (F) - likely needs complete rewriting

### Key Metrics Explained
| Metric | What It Measures | Why It Matters |
|--------|------------------|----------------|
| **AnswerRelevancy** | Is the output on-topic with the input? | Does the prompt stay focused despite messy input? |
| **Faithfulness** | Are ALL facts preserved correctly? | Does it maintain accuracy when the input has encoding errors? |
| **GEval** | Overall quality assessment by another AI | How professional does the output appear? |
| **BERTScore** | Semantic similarity to the reference | How well does it capture the meaning of the cleaned text? |
| **ROUGE** | Content overlap with the reference | How much key information is preserved? |
| **BLEU** | Phrasing precision | How closely does the wording match a human-quality standard? |
| **METEOR** | Linguistic quality with synonyms | How natural does the cleaned output read? |
""")
# Launch the app
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )