Spaces:
allow multiple metrics + concise the cols

app.py
CHANGED
@@ -31,7 +31,8 @@ def get_task_type(df):
 def fix_df(df):
     # For some reason some metrics and predictions are stored as strings
     for col in ["predictions", "metrics", "choices", "gold", "gold_index"]:
-        df[col] = [ast.literal_eval(x) if isinstance(x, str) else x for x in df[col].values]
+        if col in df.columns:
+            df[col] = [ast.literal_eval(x) if isinstance(x, str) else x for x in df[col].values]
     return df

 def get_run_name_seed(run_name):
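Aside, not part of the commit: a minimal runnable sketch of what fix_df does and what the new "if col in df.columns" guard changes. The toy DataFrame and its values are assumptions for illustration only.

import ast
import pandas as pd

# List/dict cells sometimes come back from storage as strings; fix_df re-parses them.
df = pd.DataFrame({"metrics": ["{'acc': 1.0}", {"acc": 0.0}], "gold": ["[1, 2]", [3]]})

for col in ["predictions", "metrics", "choices", "gold", "gold_index"]:
    if col in df.columns:  # new guard: columns absent for this task no longer raise KeyError
        df[col] = [ast.literal_eval(x) if isinstance(x, str) else x for x in df[col].values]

print(df["metrics"][0]["acc"], df["gold"][0])  # prints: 1.0 [1, 2]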
@@ -119,15 +120,18 @@ def fetch_run_results(repo_name, runs_to_fetch, checkpoint,
     return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict


-def render_table(df, selected_runs, …):
-    if df is None or not selected_runs or not …
+def render_table(df, selected_runs, metric_names):
+    if df is None or not selected_runs or not metric_names:
         return None
-    kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs]
+    kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs for metric_name in metric_names]
     other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
     df = df.drop(columns=other_metrics)
     # widths = get_column_widths(df)
-    df = …
-    …
+    df = shorten_column_names(df, selected_runs, metric_names)
+
+    # Sample 100
+    df = df.sample(n=min(100, len(df)), random_state=42)
+    return df

 def get_column_widths(df):
     column_widths = []
@@ -143,17 +147,26 @@ def get_column_widths(df):
     return column_widths


-def shorten_column_names(…):
+def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
     """
     Turns metric columns (metric_{metric}_{run_name}) into {metric}_i
+    Turns generation_{run_name} into generation_i
     """
-    # …
+    # Handle metric columns
+    # Aggregate columns to rename
+    columns_to_rename = {}
     for idx, run_name in enumerate(run_names):
-        …
-        …
-        …
-        …
-        …
+        for metric_name in metric_names:
+            original_metric_column = f"metric_{metric_name}_{run_name}"
+            if original_metric_column in df.columns:
+                columns_to_rename[original_metric_column] = f"{metric_name}_{idx}"
+
+        original_generation_column = f"generation_{run_name}"
+        if original_generation_column in df.columns:
+            columns_to_rename[original_generation_column] = f"generation_{idx}"
+
+    # Rename columns in a single operation
+    df = df.rename(columns=columns_to_rename)
     return df

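Aside, not part of the commit: a self-contained sketch of the filtering plus shortening that render_table and shorten_column_names now perform together. Run names, metric names, and data below are made up for illustration.

import pandas as pd

def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
    # metric_{metric}_{run} -> {metric}_{i} and generation_{run} -> generation_{i}
    columns_to_rename = {}
    for idx, run_name in enumerate(run_names):
        for metric_name in metric_names:
            original_metric_column = f"metric_{metric_name}_{run_name}"
            if original_metric_column in df.columns:
                columns_to_rename[original_metric_column] = f"{metric_name}_{idx}"
        original_generation_column = f"generation_{run_name}"
        if original_generation_column in df.columns:
            columns_to_rename[original_generation_column] = f"generation_{idx}"
    return df.rename(columns=columns_to_rename)

df = pd.DataFrame({
    "full_prompt": ["p1", "p2"],
    "metric_acc_run-a": [1, 0],
    "metric_f1_run-a": [0.5, 0.7],
    "metric_acc_run-b": [0, 1],
    "generation_run-a": ["x", "y"],
})

# render_table keeps only the selected run/metric combinations ...
selected_runs, metric_names = ["run-a", "run-b"], ["acc"]
kept = [f"metric_{m}_{r}" for r in selected_runs for m in metric_names]
df = df.drop(columns=[c for c in df.columns if c.startswith("metric_") and c not in kept])

# ... then shortens the survivors to {metric}_{run index}.
print(shorten_column_names(df, selected_runs, metric_names).columns.tolist())
# prints: ['full_prompt', 'acc_0', 'acc_1', 'generation_0']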
@@ -192,16 +205,23 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files,

         if task_type == "multiple_choice":
             n_choices = len(df['choices'])
-            return …
+            return [pred[0] for pred in predictions[:n_choices]]

         if task_type == "mixed":
             return predictions[0]

         return predictions
+
+    generative_columns = {
+        f"generation_{run_name}": df.apply(partial(get_choice_predictions, task_type=task_type), axis=1)
+    } if task_type == "generative" or task_type == "mixed" else {}
+
+
+

     prepared_df = pd.DataFrame({
         'full_prompt': df['full_prompt'],
+        **generative_columns,
     })
     # For some reason some metrics are stored as strings
     metrics = df['metrics']
@@ -213,10 +233,13 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files,
     def get_gold_label(df, task_type):
         if task_type == "generative":
             return df['gold']
-        return …
+        return df['gold_index']

     # Prepare the first DataFrame with choices and gold
-    combined_df = dfs[0][['full_prompt'…
+    combined_df = dfs[0][['full_prompt']].set_index('full_prompt')
+    if task_type in ["multiple_choice", "mixed"]:
+        combined_df["choices"] = dfs[0]["choices"].values
+
     combined_df['gold'] = dfs[0].apply(lambda row: get_gold_label(row, task_type), axis=1).values

     # Join all prepared DataFrames
@@ -227,32 +250,9 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files,

     available_metrics = list(set("_".join(col.split('_')[1:-1]) for col in combined_df.columns if col.startswith("metric_")))
     combined_df = combined_df.reset_index()
+    chosen_metrics = available_metrics[:1]

-    return combined_df, …
-
-def render_results_table(df: pd.DataFrame):
-    if df is None or df.empty:
-        return None
-
-    # Select a subset of 100 examples
-    df_subset = df.sample(n=min(100, len(df)), random_state=42)
-
-    # Prepare the data for display
-    display_data = []
-    for _, row in df_subset.iterrows():
-        example_data = {
-            'text': row['example'],
-            'choices': row['choices'],
-            'gold_index': row['gold_index'],
-        }
-        for run in df['run'].unique():
-            run_data = df[(df['run'] == run) & (df['example'] == row['example'])]
-            if not run_data.empty:
-                example_data[f'{run}_prediction'] = run_data['predictions'].values[0]
-                example_data[f'{run}_score'] = run_data['metrics'].values[0]
-        display_data.append(example_data)
-
-    return pd.DataFrame(display_data)
+    return combined_df, render_table(combined_df, runs_to_fetch, chosen_metrics), gr.update(choices=available_metrics, value=chosen_metrics)

 with gr.Blocks() as demo:
     runs_checkpoints = gr.State({})
@@ -275,7 +275,7 @@ with gr.Blocks() as demo:
         checkpoint = gr.Dropdown(choices=[], interactive=True, label="Checkpoint")
         fetch_res = gr.Button("Fetch results")
         task_name = gr.Dropdown(choices=[], interactive=True, label="Task name")
-        …
+        metric_names = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Metric")
         results_df = gr.Dataframe(interactive=False, wrap=True)

         # Run selection
@@ -316,13 +316,13 @@ with gr.Blocks() as demo:
         triggers=[task_name.change],
         fn=load_task_data,
         inputs=[repo, selected_runs, checkpoint, task_name, tasks_files],
-        outputs=[results_df_full, results_df, …
+        outputs=[results_df_full, results_df, metric_names]
     )

     gr.on(
-        triggers=[…
-        fn=…
-        inputs=[results_df_full, selected_runs, …
+        triggers=[metric_names.change],
+        fn=render_table,
+        inputs=[results_df_full, selected_runs, metric_names],
         outputs=[results_df]
     )

|
|
| 31 |
def fix_df(df):
|
| 32 |
# For some reason some metrics and predictions are stored as strings
|
| 33 |
for col in ["predictions", "metrics", "choices", "gold", "gold_index"]:
|
| 34 |
+
if col in df.columns:
|
| 35 |
+
df[col] = [ast.literal_eval(x) if isinstance(x, str) else x for x in df[col].values]
|
| 36 |
return df
|
| 37 |
|
| 38 |
def get_run_name_seed(run_name):
|
|
|
|
| 120 |
return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
|
| 121 |
|
| 122 |
|
| 123 |
+
def render_table(df, selected_runs, metric_names):
|
| 124 |
+
if df is None or not selected_runs or not metric_names:
|
| 125 |
return None
|
| 126 |
+
kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs for metric_name in metric_names]
|
| 127 |
other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
|
| 128 |
df = df.drop(columns=other_metrics)
|
| 129 |
# widths = get_column_widths(df)
|
| 130 |
+
df = shorten_column_names(df, selected_runs, metric_names)
|
| 131 |
+
|
| 132 |
+
# Sample 100
|
| 133 |
+
df = df.sample(n=min(100, len(df)), random_state=42)
|
| 134 |
+
return df
|
| 135 |
|
| 136 |
def get_column_widths(df):
|
| 137 |
column_widths = []
|
|
|
|
| 147 |
return column_widths
|
| 148 |
|
| 149 |
|
| 150 |
+
def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
|
| 151 |
"""
|
| 152 |
Turns metric columns (metric_{metric}_{run_name}) into {metric}_i
|
| 153 |
+
Turns generation_{run_name} into generation_i
|
| 154 |
"""
|
| 155 |
+
# Handle metric columns
|
| 156 |
+
# Aggregate columns to rename
|
| 157 |
+
columns_to_rename = {}
|
| 158 |
for idx, run_name in enumerate(run_names):
|
| 159 |
+
for metric_name in metric_names:
|
| 160 |
+
original_metric_column = f"metric_{metric_name}_{run_name}"
|
| 161 |
+
if original_metric_column in df.columns:
|
| 162 |
+
columns_to_rename[original_metric_column] = f"{metric_name}_{idx}"
|
| 163 |
+
|
| 164 |
+
original_generation_column = f"generation_{run_name}"
|
| 165 |
+
if original_generation_column in df.columns:
|
| 166 |
+
columns_to_rename[original_generation_column] = f"generation_{idx}"
|
| 167 |
+
|
| 168 |
+
# Rename columns in a single operation
|
| 169 |
+
df = df.rename(columns=columns_to_rename)
|
| 170 |
return df
|
| 171 |
|
| 172 |
|
|
|
|
| 205 |
|
| 206 |
if task_type == "multiple_choice":
|
| 207 |
n_choices = len(df['choices'])
|
| 208 |
+
return [pred[0] for pred in predictions[:n_choices]]
|
| 209 |
|
| 210 |
if task_type == "mixed":
|
| 211 |
return predictions[0]
|
| 212 |
|
| 213 |
return predictions
|
| 214 |
+
|
| 215 |
+
generative_columns = {
|
| 216 |
+
f"generation_{run_name}": df.apply(partial(get_choice_predictions, task_type=task_type), axis=1)
|
| 217 |
+
} if task_type == "generative" or task_type == "mixed" else {}
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
|
| 221 |
|
| 222 |
prepared_df = pd.DataFrame({
|
| 223 |
'full_prompt': df['full_prompt'],
|
| 224 |
+
**generative_columns,
|
| 225 |
})
|
| 226 |
# For some reason some metrics are stored as strings
|
| 227 |
metrics = df['metrics']
|
|
|
|
| 233 |
def get_gold_label(df, task_type):
|
| 234 |
if task_type == "generative":
|
| 235 |
return df['gold']
|
| 236 |
+
return df['gold_index']
|
| 237 |
|
| 238 |
# Prepare the first DataFrame with choices and gold
|
| 239 |
+
combined_df = dfs[0][['full_prompt']].set_index('full_prompt')
|
| 240 |
+
if task_type in ["multiple_choice", "mixed"]:
|
| 241 |
+
combined_df["choices"] = dfs[0]["choices"].values
|
| 242 |
+
|
| 243 |
combined_df['gold'] = dfs[0].apply(lambda row: get_gold_label(row, task_type), axis=1).values
|
| 244 |
|
| 245 |
# Join all prepared DataFrames
|
|
|
|
| 250 |
|
| 251 |
available_metrics = list(set("_".join(col.split('_')[1:-1]) for col in combined_df.columns if col.startswith("metric_")))
|
| 252 |
combined_df = combined_df.reset_index()
|
| 253 |
+
chosen_metrics = available_metrics[:1]
|
| 254 |
|
| 255 |
+
return combined_df, render_table(combined_df, runs_to_fetch, chosen_metrics), gr.update(choices=available_metrics, value=chosen_metrics)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
with gr.Blocks() as demo:
|
| 258 |
runs_checkpoints = gr.State({})
|
|
|
|
| 275 |
checkpoint = gr.Dropdown(choices=[], interactive=True, label="Checkpoint")
|
| 276 |
fetch_res = gr.Button("Fetch results")
|
| 277 |
task_name = gr.Dropdown(choices=[], interactive=True, label="Task name")
|
| 278 |
+
metric_names = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Metric")
|
| 279 |
results_df = gr.Dataframe(interactive=False, wrap=True)
|
| 280 |
|
| 281 |
# Run selection
|
|
|
|
| 316 |
triggers=[task_name.change],
|
| 317 |
fn=load_task_data,
|
| 318 |
inputs=[repo, selected_runs, checkpoint, task_name, tasks_files],
|
| 319 |
+
outputs=[results_df_full, results_df, metric_names]
|
| 320 |
)
|
| 321 |
|
| 322 |
gr.on(
|
| 323 |
+
triggers=[metric_names.change],
|
| 324 |
+
fn=render_table,
|
| 325 |
+
inputs=[results_df_full, selected_runs, metric_names],
|
| 326 |
outputs=[results_df]
|
| 327 |
)
|
| 328 |
|