Update app.py
Browse files
app.py
CHANGED
|
@@ -41,44 +41,57 @@ context_grounding_data = {
|
|
| 41 |
"Compliance": [0.76, 0.72, 0.52, 0.59, 0.63, 0.34, 0.40, 0.44, 0.43, 0.41, 0.66, 0.51, 0.49, 0.71, 0.71, 0.80, 0.67, 0.62, 0.68, 0.54, 0.46, 0.35, 0.34, 0.81]
|
| 42 |
}
|
| 43 |
|
| 44 |
-
# Function to bold the highest score per column (excluding "Model Name")
|
| 45 |
-
def format_table(df):
|
| 46 |
styled_df = df.copy()
|
| 47 |
numeric_columns = [col for col in df.columns if col != "Model Name"]
|
| 48 |
|
|
|
|
|
|
|
|
|
|
| 49 |
for col in numeric_columns:
|
| 50 |
if col in ["Baseline", "Irrelevant Ctx", "No Ctx", "Ctx Grounding QA", "Ctx Grounding TG", "Ctx Grounding", "Robustness", "Compliance"]:
|
| 51 |
# Convert string values (e.g., "0.95 (0.0)") to float for comparison, or use direct float values
|
| 52 |
-
if any(" (" in str(x) for x in
|
| 53 |
# Handle string values with deltas (e.g., "0.95 (0.0)")
|
| 54 |
-
values = [float(str(x).split(" (")[0]) for x in
|
| 55 |
else:
|
| 56 |
# Handle direct float values
|
| 57 |
-
values =
|
| 58 |
|
| 59 |
max_value = np.max(values)
|
| 60 |
-
styled_df[col] =
|
| 61 |
|
| 62 |
return styled_df
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
# Function to calculate top 3 models based on combined score (average of numeric columns)
|
| 65 |
def get_top_3_models(robustness_df, context_grounding_df):
|
| 66 |
# Combine numeric columns from both datasets
|
| 67 |
numeric_cols_robustness = ["Baseline", "Robustness (Δ)"] # Columns with numeric or string-numeric data
|
| 68 |
numeric_cols_context = ["Irrelevant Ctx", "No Ctx", "Ctx Grounding QA", "Ctx Grounding TG", "Ctx Grounding", "Robustness", "Compliance"] # From context grounding
|
| 69 |
|
| 70 |
-
# Extract numeric values for each column in robustness_df
|
| 71 |
robustness_scores = pd.DataFrame()
|
| 72 |
for col in numeric_cols_robustness:
|
| 73 |
-
|
| 74 |
-
# Handle string values with deltas (e.g., "0.95 (0.0)")
|
| 75 |
-
robustness_scores[col] = robustness_df[col].apply(lambda x: float(str(x).split(" (")[0]) if " (" in str(x) else float(x))
|
| 76 |
-
else:
|
| 77 |
-
# Handle direct float values
|
| 78 |
-
robustness_scores[col] = robustness_df[col].astype(float)
|
| 79 |
|
| 80 |
-
# Extract numeric values for context_grounding_df (all are already float values)
|
| 81 |
-
context_scores =
|
|
|
|
|
|
|
| 82 |
|
| 83 |
# Combine scores by averaging
|
| 84 |
combined_scores = (robustness_scores.mean(axis=1) + context_scores.mean(axis=1)) / 2
|
|
@@ -107,11 +120,11 @@ def create_leaderboard():
|
|
| 107 |
robustness_df = pd.DataFrame(robustness_data)
|
| 108 |
context_grounding_df = pd.DataFrame(context_grounding_data)
|
| 109 |
|
| 110 |
-
# Format tables to bold highest scores
|
| 111 |
-
|
| 112 |
-
|
| 113 |
|
| 114 |
-
# Get top 3 winners
|
| 115 |
winners_df = get_top_3_models(robustness_df, context_grounding_df)
|
| 116 |
|
| 117 |
# Create Gradio interface with a nice theme
|
|
@@ -122,14 +135,14 @@ def create_leaderboard():
|
|
| 122 |
with gr.Column():
|
| 123 |
with gr.Tab("Robustness Results"):
|
| 124 |
gr.DataFrame(
|
| 125 |
-
value=
|
| 126 |
label="Robustness Results",
|
| 127 |
wrap=True,
|
| 128 |
elem_classes=["custom-table"]
|
| 129 |
)
|
| 130 |
with gr.Tab("Context Grounding Results"):
|
| 131 |
gr.DataFrame(
|
| 132 |
-
value=
|
| 133 |
label="Context Grounding Results",
|
| 134 |
wrap=True,
|
| 135 |
elem_classes=["custom-table"]
|
|
|
|
| 41 |
"Compliance": [0.76, 0.72, 0.52, 0.59, 0.63, 0.34, 0.40, 0.44, 0.43, 0.41, 0.66, 0.51, 0.49, 0.71, 0.71, 0.80, 0.67, 0.62, 0.68, 0.54, 0.46, 0.35, 0.34, 0.81]
|
| 42 |
}
|
| 43 |
|
| 44 |
+
# Function to bold the highest score per column (excluding "Model Name") but keep original data for calculations
|
| 45 |
+
def format_table(df, original_df=None):
    """Return a copy of ``df`` with the best score of each score column in bold.

    The highest value of every recognised score column is wrapped in
    Markdown bold markers (``**value**``). Ties are all bolded. Columns that
    are not in the recognised list (for example "Model Name") are returned
    unchanged.

    Args:
        df: DataFrame that provides the set of columns and the base of the
            returned copy.
        original_df: Optional DataFrame that supplies the cell values to
            parse and display. When omitted, ``df`` itself is used.

    Returns:
        A new DataFrame. Neither ``df`` nor ``original_df`` is modified.

    Raises:
        ValueError: If a non-missing cell in a score column cannot be parsed
            as a number.
    """
    score_columns = {
        "Baseline",
        "Irrelevant Ctx",
        "No Ctx",
        "Ctx Grounding QA",
        "Ctx Grounding TG",
        "Ctx Grounding",
        "Robustness",
        "Compliance",
    }

    def _to_number(cell):
        # Parse 0.95, "0.95 (0.0)" or "**0.95**" into a float; missing -> NaN.
        if isinstance(cell, str):
            return float(cell.replace("**", "").split(" (")[0])
        if pd.isna(cell):
            return np.nan
        return float(cell)

    def _is_bold(cell):
        # True when the cell already carries the bold markers.
        return isinstance(cell, str) and cell.startswith("**") and cell.endswith("**")

    # The source frame is only read, so no defensive copy is needed.
    source = df if original_df is None else original_df
    styled_df = df.copy()

    for col in df.columns:
        if col not in score_columns:
            continue

        numbers = source[col].map(_to_number)
        if not numbers.notna().any():
            # Nothing to highlight (empty or all-missing column).
            styled_df[col] = source[col]
            continue

        # Series.max skips NaN, so one missing cell no longer hides the winner.
        best = numbers.max()
        cells = [
            f"**{cell}**" if score == best and not _is_bold(cell) else cell
            for cell, score in zip(source[col], numbers)
        ]
        # dtype=object keeps the mix of raw numbers and bolded strings intact.
        styled_df[col] = pd.Series(cells, index=source.index, dtype=object)

    return styled_df
|
| 66 |
|
| 67 |
+
# Function to extract numeric value from a string (removing bold markup and deltas)
|
| 68 |
+
def extract_numeric(value):
    """Convert a table cell to a float.

    Strings have any ``**`` bold markers removed and only the text before a
    trailing `` (delta)`` suffix is parsed. Missing values yield NaN. Any
    other value is passed to ``float``.
    """
    if not isinstance(value, str):
        # Non-string cells: missing -> NaN, otherwise a plain numeric cast.
        return np.nan if pd.isna(value) else float(value)

    without_bold = value.replace("**", "")
    # Keep only the leading number of cells such as "0.95 (0.0)".
    number_text, _, _ = without_bold.partition(" (")
    return float(number_text)
|
| 79 |
+
|
| 80 |
# Function to calculate top 3 models based on combined score (average of numeric columns)
|
| 81 |
def get_top_3_models(robustness_df, context_grounding_df):
|
| 82 |
# Combine numeric columns from both datasets
|
| 83 |
numeric_cols_robustness = ["Baseline", "Robustness (Δ)"] # Columns with numeric or string-numeric data
|
| 84 |
numeric_cols_context = ["Irrelevant Ctx", "No Ctx", "Ctx Grounding QA", "Ctx Grounding TG", "Ctx Grounding", "Robustness", "Compliance"] # From context grounding
|
| 85 |
|
| 86 |
+
# Extract numeric values for each column in robustness_df, handling bold markup and deltas
|
| 87 |
robustness_scores = pd.DataFrame()
|
| 88 |
for col in numeric_cols_robustness:
|
| 89 |
+
robustness_scores[col] = robustness_df[col].apply(extract_numeric)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
+
# Extract numeric values for context_grounding_df (all are already float values, but use extract_numeric for consistency)
|
| 92 |
+
context_scores = pd.DataFrame()
|
| 93 |
+
for col in numeric_cols_context:
|
| 94 |
+
context_scores[col] = context_grounding_df[col].apply(extract_numeric)
|
| 95 |
|
| 96 |
# Combine scores by averaging
|
| 97 |
combined_scores = (robustness_scores.mean(axis=1) + context_scores.mean(axis=1)) / 2
|
|
|
|
| 120 |
robustness_df = pd.DataFrame(robustness_data)
|
| 121 |
context_grounding_df = pd.DataFrame(context_grounding_data)
|
| 122 |
|
| 123 |
+
# Format tables to bold highest scores, but keep original data for calculations
|
| 124 |
+
formatted_robustness_df = format_table(robustness_df, robustness_df.copy()) # Pass original data for calculations
|
| 125 |
+
formatted_context_grounding_df = format_table(context_grounding_df, context_grounding_df.copy()) # Pass original data for calculations
|
| 126 |
|
| 127 |
+
# Get top 3 winners using the original (unformatted) DataFrames
|
| 128 |
winners_df = get_top_3_models(robustness_df, context_grounding_df)
|
| 129 |
|
| 130 |
# Create Gradio interface with a nice theme
|
|
|
|
| 135 |
with gr.Column():
|
| 136 |
with gr.Tab("Robustness Results"):
|
| 137 |
gr.DataFrame(
|
| 138 |
+
value=formatted_robustness_df,
|
| 139 |
label="Robustness Results",
|
| 140 |
wrap=True,
|
| 141 |
elem_classes=["custom-table"]
|
| 142 |
)
|
| 143 |
with gr.Tab("Context Grounding Results"):
|
| 144 |
gr.DataFrame(
|
| 145 |
+
value=formatted_context_grounding_df,
|
| 146 |
label="Context Grounding Results",
|
| 147 |
wrap=True,
|
| 148 |
elem_classes=["custom-table"]
|