Commit c1765cf · Parent: 99399ee
root committed: fix
app.py
CHANGED
@@ -33,16 +33,6 @@ repo = snapshot_download(
 )
 
 def avg_over_rewardbench_v2(dataframe_core):
-    """
-    Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
-
-    We average over 4 core sections (per prompt weighting):
-    1. Chat: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
-    2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
-    3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
-    4. Reasoning: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
-    5. Prior Sets (0.5 weight): Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
-    """
     domain_cols = ['factuality', 'coconot/safety', 'math', 'precise instruction following']
     new_df = dataframe_core.copy()
 
@@ -165,8 +155,8 @@ def length_bias_check(dataframe):
 
 
 rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
-rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
-prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
+# rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
+# prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
 # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
 
 rewardbench_data_avg = avg_over_rewardbenc_v2(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)
@@ -193,7 +183,7 @@ rewardbench_data = prep_df(rewardbench_data)
 rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
 # adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others
 
-rewardbench_data_length = prep_df(rewardbench_data_length)
+# rewardbench_data_length = prep_df(rewardbench_data_length)
 prefs_data = prep_df(prefs_data)
 
 col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
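
The first hunk removes a docstring that still described the RewardBench v1 sections (Chat, Chat Hard, Safety, Reasoning, Prior Sets), while the function now operates on the four v2 domain columns in domain_cols. Only the first two lines of the body are visible in the hunk, so the following is a minimal sketch of an average over those columns, assuming an unweighted per-domain mean; it is not the actual implementation in app.py.

import pandas as pd

def avg_over_rewardbench_v2_sketch(dataframe_core: pd.DataFrame) -> pd.DataFrame:
    # These two lines appear in the diff; everything after them is assumed.
    domain_cols = ['factuality', 'coconot/safety', 'math', 'precise instruction following']
    new_df = dataframe_core.copy()
    # Assumed continuation: unweighted mean over the four v2 domain columns,
    # rounded for display, keeping model metadata plus the per-domain scores.
    new_df['average'] = new_df[domain_cols].mean(axis=1).round(2)
    keep_cols = [c for c in ['model', 'average'] + domain_cols if c in new_df.columns]
    return new_df[keep_cols]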
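
The retained comment "# adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others" refers to the legacy v1 aggregation described in the removed docstring. As a rough, hypothetical illustration of that weighting (section names and scores below are made up, not taken from app.py):

# Hypothetical illustration of a per-section mean with Prior Sets at half weight.
weights = {'Chat': 1.0, 'Chat Hard': 1.0, 'Safety': 1.0, 'Reasoning': 1.0, 'Prior Sets': 0.5}

def weighted_section_average(section_scores: dict) -> float:
    # Each section contributes score * weight; Prior Sets counts as half a section.
    total = sum(section_scores[name] * w for name, w in weights.items())
    return total / sum(weights.values())

# Example: four sections at 80 and Prior Sets at 60 -> (4 * 80 + 0.5 * 60) / 4.5 ≈ 77.8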