H2H-eval-comparator

Sleeping

App Files Community

rohansampath committed on Feb 27

Commit

30e6a06

verified ·

1 Parent(s): e6f8dd1

Update run_evaluation.py

Browse files

Files changed (1) hide show

run_evaluation.py +24 -27

run_evaluation.py CHANGED Viewed

@@ -119,37 +119,34 @@ def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects,
         comparison_df = pd.DataFrame(comparison_data)
         # Format the report
-        report = (
-            f"### Head-to-Head Comparison Results\n\n"
-            f"#### Model 1: {model1_config['name']}\n"
-            f"* Overall Accuracy: {model1_overall_acc:.3f}\n"
-            f"* Best Performance: {model1_max_subject} ({model1_max_acc:.3f})\n"
-            f"* Worst Performance: {model1_min_subject} ({model1_min_acc:.3f})\n"
-            f"* Evaluation completed in {model1_elapsed_time:.2f} seconds\n\n"
-            f"#### Model 2: {model2_config['name']}\n"
-            f"* Overall Accuracy: {model2_overall_acc:.3f}\n"
-            f"* Best Performance: {model2_max_subject} ({model2_max_acc:.3f})\n"
-            f"* Worst Performance: {model2_min_subject} ({model2_min_acc:.3f})\n"
-            f"* Evaluation completed in {model2_elapsed_time:.2f} seconds\n\n"
-            f"#### Overall Winner: {overall_winner}\n"
-            f"* Margin: {abs(overall_diff):.3f}\n"
         )
-        # Return values that re-enable UI components after completion
-        return {
-            'report': report,
-            'comparison_df': comparison_df,
-            'success': True
-        }
     except Exception as e:
         # Handle errors gracefully
         error_trace = traceback.format_exc()
         error_message = f"### Error during evaluation\n```\n{error_trace}\n```"
-        # Return error information
-        return {
-            'report': error_message,
-            'comparison_df': None,
-            'success': False
-        }

         comparison_df = pd.DataFrame(comparison_data)
         # Format the report
+        return (
+            report,                    # Report markdown
+            comparison_df,             # Results dataframe
+            gr.Button.update(interactive=True),  # Enable eval button
+            gr.Button.update(interactive=False), # Disable cancel button
+            gr.Radio.update(interactive=True),   # Enable subject selection mode
+            gr.Slider.update(interactive=True),  # Enable subjects slider
+            gr.Checkbox.update(interactive=True), # Enable all questions checkbox
+            gr.Slider.update(interactive=True),   # Enable questions slider
+            gr.Dropdown.update(interactive=True), # Enable model dropdown
+            gr.Column.update(visible=True)        # Make table container visible
         )
     except Exception as e:
         # Handle errors gracefully
         error_trace = traceback.format_exc()
         error_message = f"### Error during evaluation\n```\n{error_trace}\n```"
+        # Return error values for all 10 components
+        return (
+            error_message,              # Error message in markdown
+            None,                       # No dataframe
+            gr.Button.update(interactive=True),  # Enable eval button
+            gr.Button.update(interactive=False), # Disable cancel button
+            gr.Radio.update(interactive=True),   # Enable subject selection mode
+            gr.Slider.update(interactive=True),  # Enable subjects slider
+            gr.Checkbox.update(interactive=True), # Enable all questions checkbox
+            gr.Slider.update(interactive=True),   # Enable questions slider
+            gr.Dropdown.update(interactive=True), # Enable model dropdown
+            gr.Column.update(visible=False)       # Hide table container
+        )