#!/usr/bin/env python3
"""
Run Evaluation Pipeline

This orchestrates the evaluation workflow on existing final_response data:
1. Count tactic occurrences (count_tactics.py)
2. Generate evaluation metrics (evaluate_metrics.py)
3. Compare models (compare_models.py)
4. Generate CSV with simple metrics (generate_metrics_csv.py)

NOTE: This does NOT run the full 3-agent pipeline. Use execute_pipeline.py
separately to generate final_response data first.

Usage:
    python run_evaluation.py [--skip-counting]
"""

import subprocess
import sys
from pathlib import Path
from datetime import datetime
import argparse


def find_project_root(start: Path) -> Path:
    """Find the project root by looking for common markers."""
    for p in [start] + list(start.parents):
        if (p / 'final_response').exists() or (p / 'src').exists() or (p / '.git').exists():
            return p
    return start.parent


class EvaluationRunner:
    """Orchestrates the evaluation workflow"""

    def __init__(self, skip_counting: bool = False):
        self.skip_counting = skip_counting
        current_file = Path(__file__).resolve()
        self.project_root = find_project_root(current_file.parent)
        # Point to the full_pipeline_evaluation directory for scripts
        self.eval_dir = self.project_root / "src" / "full_pipeline_evaluation"
        # Output directory at project root
        self.output_dir = self.project_root / "evaluation_results"
        self.start_time = None

    def print_header(self, step: str, description: str):
        """Print a formatted step header"""
        print("\n" + "=" * 80)
        print(f"STEP {step}: {description}")
        print("=" * 80)

    def run_command(self, description: str, cmd: list) -> bool:
        """Run a command and handle errors"""
        print(f"\n{description}")
        print(f"Command: {' '.join(str(c) for c in cmd)}\n")
        try:
            subprocess.run(cmd, check=True)
            print(f"\n[SUCCESS] {description} completed")
            return True
        except subprocess.CalledProcessError as e:
            print(f"\n[ERROR] {description} failed with exit code {e.returncode}")
            return False
        except Exception as e:
            print(f"\n[ERROR] Unexpected error during {description}: {e}")
            return False

    def step_1_count_tactics(self) -> bool:
        """Step 1: Count tactic occurrences"""
        self.print_header("1/4", "Counting Tactic Occurrences")

        if self.skip_counting:
            print("Skipping tactic counting (--skip-counting flag set)")
            print("Using existing tactic_counts_summary.json")
            return True

        final_response_dir = self.project_root / "final_response"

        # Ensure output directory exists
        self.output_dir.mkdir(exist_ok=True)
        output_file = self.output_dir / "tactic_counts_summary.json"

        if not final_response_dir.exists():
            print(f"[ERROR] final_response directory not found at: {final_response_dir}")
            print("Run execute_pipeline_all_datasets.py first to generate analysis results")
            return False

        # Count response_analysis.json files
        analysis_files = list(final_response_dir.rglob("*_response_analysis.json"))
        if not analysis_files:
            print("[ERROR] No *_response_analysis.json files found in final_response")
            print("Run execute_pipeline_all_datasets.py first to generate analysis results")
            return False

        print(f"Found {len(analysis_files)} analysis files")
        print(f"Output: {output_file}")

        script_path = self.eval_dir / "count_tactics.py"
        return self.run_command(
            "Count tactic occurrences",
            [
                sys.executable,
                str(script_path),
                "--output", str(output_file)
            ]
        )

    def step_2_evaluate_metrics(self) -> bool:
        """Step 2: Generate evaluation metrics for each model"""
        self.print_header("2/4", "Generating Evaluation Metrics")

        tactic_counts_file = self.output_dir / "tactic_counts_summary.json"
        output_file = self.output_dir / "evaluation_report.json"
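
        # Fail fast if step 1 has not produced the tactic counts file yet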
        if not tactic_counts_file.exists():
            print(f"[ERROR] Tactic counts file not found: {tactic_counts_file}")
            print("Run step 1 first or remove --skip-counting flag")
            return False

        print(f"Input: {tactic_counts_file}")
        print(f"Output: {output_file}")
        print("Note: Individual model reports will be saved as evaluation_report_[model_name].json")

        script_path = self.eval_dir / "evaluate_metrics.py"
        return self.run_command(
            "Generate evaluation metrics for each model",
            [
                sys.executable,
                str(script_path),
                "--input", str(tactic_counts_file),
                "--output", str(output_file)
            ]
        )

    def step_3_compare_models(self) -> bool:
        """Step 3: Compare models"""
        self.print_header("3/4", "Comparing Models")

        tactic_counts_file = self.output_dir / "tactic_counts_summary.json"
        output_file = self.output_dir / "model_comparison.json"

        if not tactic_counts_file.exists():
            print(f"[ERROR] Tactic counts file not found: {tactic_counts_file}")
            print("Run step 1 first or remove --skip-counting flag")
            return False

        print(f"Input: {tactic_counts_file}")
        print(f"Output: {output_file}")

        script_path = self.eval_dir / "compare_models.py"
        return self.run_command(
            "Compare models",
            [
                sys.executable,
                str(script_path),
                "--input", str(tactic_counts_file),
                "--output", str(output_file)
            ]
        )

    def step_4_generate_csv(self) -> bool:
        """Step 4: Generate CSV with simple metrics"""
        self.print_header("4/4", "Generating CSV Metrics")

        tactic_counts_file = self.output_dir / "tactic_counts_summary.json"
        output_file = self.output_dir / "model_metrics.csv"

        if not tactic_counts_file.exists():
            print(f"[ERROR] Tactic counts file not found: {tactic_counts_file}")
            print("Run step 1 first or remove --skip-counting flag")
            return False

        print(f"Input: {tactic_counts_file}")
        print(f"Output: {output_file}")

        script_path = self.eval_dir / "generate_metrics_csv.py"
        return self.run_command(
            "Generate CSV with simple metrics (F1, accuracy, precision, recall)",
            [
                sys.executable,
                str(script_path),
                "--input", str(tactic_counts_file),
                "--output", str(output_file)
            ]
        )

    def run(self) -> int:
        """Run the evaluation pipeline"""
        self.start_time = datetime.now()

        print("\n" + "=" * 80)
        print("EVALUATION PIPELINE")
        print("=" * 80)
        print(f"Project Root: {self.project_root}")
        print(f"Evaluation Dir: {self.eval_dir}")
        print(f"Output Dir: {self.output_dir}")
        print(f"Start Time: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")

        # Step 1: Count tactics
        if not self.step_1_count_tactics():
            print("\n[ERROR] Evaluation failed at Step 1")
            return 1

        # Step 2: Evaluate metrics
        if not self.step_2_evaluate_metrics():
            print("\n[ERROR] Evaluation failed at Step 2")
            return 1

        # Step 3: Compare models
        if not self.step_3_compare_models():
            print("\n[ERROR] Evaluation failed at Step 3")
            return 1

        # Step 4: Generate CSV metrics
        if not self.step_4_generate_csv():
            print("\n[ERROR] Evaluation failed at Step 4")
            return 1

        # Success summary
        end_time = datetime.now()
        duration = (end_time - self.start_time).total_seconds()

        print("\n" + "=" * 80)
        print("EVALUATION PIPELINE COMPLETED SUCCESSFULLY")
        print("=" * 80)
        print(f"Duration: {duration:.1f} seconds")
        print("\nOutput Files:")
        print(f" - {self.output_dir / 'tactic_counts_summary.json'}")
        print(f" - {self.output_dir / 'evaluation_report.json'} (summary)")
        print(f" - {self.output_dir / 'evaluation_report_[model_name].json'} (per model)")
        print(f" - {self.output_dir / 'model_comparison.json'}")
        print(f" - {self.output_dir / 'model_metrics.csv'} (simple metrics: F1, accuracy, precision, recall)")
        print("=" * 80 + "\n")

        return 0


def main():
    parser = argparse.ArgumentParser(
        description="Run evaluation pipeline on existing final_response data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run full evaluation (count tactics + evaluate metrics + compare models + generate CSV)
  python run_evaluation.py

  # Skip counting, only evaluate (use existing tactic_counts_summary.json)
  python run_evaluation.py --skip-counting

Note: This does NOT run the 3-agent pipeline. Use execute_pipeline_all_datasets.py
separately to process mordor dataset files.
"""
    )
    parser.add_argument(
        "--skip-counting",
        action="store_true",
        help="Skip counting tactics, use existing tactic_counts_summary.json"
    )

    args = parser.parse_args()

    runner = EvaluationRunner(skip_counting=args.skip_counting)
    exit_code = runner.run()
    sys.exit(exit_code)


if __name__ == "__main__":
    main()