Spaces:
Running
Running
Correcting significance in `tournament_results` with FDR correction method
Browse files
server.py
CHANGED
|
@@ -167,6 +167,30 @@ def check_significance(model_a_path, model_b_path):
|
|
| 167 |
result = check_significance_wait_for_result(result_url)
|
| 168 |
return result
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
class NoneLock:
|
| 171 |
def __init__(self, *args, **kwargs):
|
| 172 |
pass
|
|
@@ -543,6 +567,7 @@ class LeaderboardServer:
|
|
| 543 |
|
| 544 |
with self.var_lock.ro:
|
| 545 |
tournament_results = pre_submit.tournament_results if pre_submit else self.tournament_results
|
|
|
|
| 546 |
|
| 547 |
for competitor_id in tournament_results[submission_id].keys() - {submission_id}: # without self
|
| 548 |
if competitor_id not in self.submission_id_to_data:
|
|
@@ -560,7 +585,7 @@ class LeaderboardServer:
|
|
| 560 |
if to_csv:
|
| 561 |
match_results[task] = tournament_results[submission_id][competitor_id][task]["significant"]
|
| 562 |
else:
|
| 563 |
-
match_task_result_details = dict.fromkeys(["significant", "p_value"]) # order has impact to sorting DataFrame
|
| 564 |
match_task_result_details.update(copy.deepcopy(tournament_results[submission_id][competitor_id][task]))
|
| 565 |
match_task_result_details["significant"] = str(match_task_result_details["significant"]).lower() # originaly bool
|
| 566 |
match_task_result_significant = match_task_result_details["significant"]
|
|
@@ -611,6 +636,20 @@ class LeaderboardServer:
|
|
| 611 |
dataframe = dataframe.style.apply(self._model_tournament_table_highlight_true_and_false, axis=None)
|
| 612 |
return dataframe
|
| 613 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 614 |
def _dataframe_to_csv(self, dataframe, filename):
|
| 615 |
try:
|
| 616 |
if not os.path.isdir(self.DIR_DATAFRAMES_CSV):
|
|
@@ -689,6 +728,7 @@ class LeaderboardServer:
|
|
| 689 |
def _get_leaderboard(self, pre_submit=None, category=None, to_csv=False):
|
| 690 |
with self.var_lock.ro:
|
| 691 |
tournament_results = pre_submit.tournament_results if pre_submit else self.tournament_results
|
|
|
|
| 692 |
category = category if category else self.TASKS_CATEGORY_OVERALL
|
| 693 |
|
| 694 |
if len(tournament_results) == 0:
|
|
|
|
| 167 |
result = check_significance_wait_for_result(result_url)
|
| 168 |
return result
|
| 169 |
|
| 170 |
+
def correct_pvals_for_fdr(model_task_pvals, fdr_alpha=0.05, repeat_on_conn_timeout=10):
    """Post p-values to the remote FDR-correction endpoint and return its reply.

    Args:
        model_task_pvals: P-values to correct; sent as the ``"pvals"`` field
            of the JSON payload.
        fdr_alpha: Value sent as the ``"fdr_alpha"`` field of the payload.
        repeat_on_conn_timeout: Forwarded as the first argument of
            ``check_significance_repeat_on_conn_timeout`` (presumably the
            number of retries on connection timeout; confirm against that
            helper).

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        CheckSignificanceError: If the server answers with 429 (too busy) or
            with any other non-200 status code.
    """
    endpoint = 'https://czechllm.fit.vutbr.cz/benczechmark-leaderboard/compare_significance/correct_pvals_for_fdr'
    payload = {
        "pvals": model_task_pvals,
        "fdr_alpha": fdr_alpha,
    }
    request_timeout = 60 * 5  # handed to requests.post as its `timeout`

    # The helper performs the actual POST on our behalf.
    response = check_significance_repeat_on_conn_timeout(
        repeat_on_conn_timeout,
        requests.post,
        endpoint,
        json=payload,
        timeout=request_timeout,
    )

    # Reject failures first so the success path is a plain return.
    status = response.status_code
    if status == 429:
        raise CheckSignificanceError('Server is too busy. Please try again later.')
    if status != 200:
        raise CheckSignificanceError(f'Failed to submit task. Status code: {status}')

    return response.json()
|
| 193 |
+
|
| 194 |
class NoneLock:
|
| 195 |
def __init__(self, *args, **kwargs):
|
| 196 |
pass
|
|
|
|
| 567 |
|
| 568 |
with self.var_lock.ro:
|
| 569 |
tournament_results = pre_submit.tournament_results if pre_submit else self.tournament_results
|
| 570 |
+
tournament_results = self._correct_significance_in_tournament_results(tournament_results)
|
| 571 |
|
| 572 |
for competitor_id in tournament_results[submission_id].keys() - {submission_id}: # without self
|
| 573 |
if competitor_id not in self.submission_id_to_data:
|
|
|
|
| 585 |
if to_csv:
|
| 586 |
match_results[task] = tournament_results[submission_id][competitor_id][task]["significant"]
|
| 587 |
else:
|
| 588 |
+
match_task_result_details = dict.fromkeys(["significant", "corrected_p_value", "p_value"]) # order has impact to sorting DataFrame
|
| 589 |
match_task_result_details.update(copy.deepcopy(tournament_results[submission_id][competitor_id][task]))
|
| 590 |
match_task_result_details["significant"] = str(match_task_result_details["significant"]).lower() # originaly bool
|
| 591 |
match_task_result_significant = match_task_result_details["significant"]
|
|
|
|
| 636 |
dataframe = dataframe.style.apply(self._model_tournament_table_highlight_true_and_false, axis=None)
|
| 637 |
return dataframe
|
| 638 |
|
| 639 |
+
def _correct_significance_in_tournament_results(self, tournament_results, alpha=0.05):
    """Return a copy of ``tournament_results`` with FDR-corrected significance.

    For every submission and every task in ``self.TASKS_METADATA``, the raw
    ``"p_value"`` entries against all other submissions are sent together to
    ``correct_pvals_for_fdr``. Each match/task entry then gets a
    ``"corrected_p_value"`` and its ``"significant"`` flag is recomputed as
    ``corrected_p_value < alpha``.

    Args:
        tournament_results: Nested mapping indexed as
            ``[submission_id][competitor_id][task]``, where each leaf is a
            dict holding at least ``"p_value"``. It is not modified.
        alpha: Significance level. Used as the threshold for the
            ``"significant"`` flag and forwarded to the correction service
            as ``fdr_alpha`` so both steps use the same level.

    Returns:
        A deep copy of ``tournament_results`` with the corrected fields set.

    Raises:
        CheckSignificanceError: Propagated from ``correct_pvals_for_fdr``
            when the correction service reports a failure.
    """
    corrected_results = copy.deepcopy(tournament_results)

    for submission_id, matches in corrected_results.items():
        # The opponent list does not depend on the task, so build it once
        # per submission; the submission's match against itself is excluded.
        competitors = [competitor_id for competitor_id in matches if competitor_id != submission_id]
        if not competitors:
            # Nothing to correct; avoid one remote request per task.
            continue

        for task in self.TASKS_METADATA:
            model_task_pvals = [matches[competitor_id][task]["p_value"] for competitor_id in competitors]
            corrected_model_task_pvals = correct_pvals_for_fdr(model_task_pvals, fdr_alpha=alpha)

            # Assumes the service returns the corrected values in the same
            # order as the p-values it was sent.
            for competitor_id, task_pval in zip(competitors, corrected_model_task_pvals):
                task_result = matches[competitor_id][task]
                task_result["corrected_p_value"] = task_pval
                task_result["significant"] = bool(task_pval < alpha)

    return corrected_results
|
| 652 |
+
|
| 653 |
def _dataframe_to_csv(self, dataframe, filename):
|
| 654 |
try:
|
| 655 |
if not os.path.isdir(self.DIR_DATAFRAMES_CSV):
|
|
|
|
| 728 |
def _get_leaderboard(self, pre_submit=None, category=None, to_csv=False):
|
| 729 |
with self.var_lock.ro:
|
| 730 |
tournament_results = pre_submit.tournament_results if pre_submit else self.tournament_results
|
| 731 |
+
tournament_results = self._correct_significance_in_tournament_results(tournament_results)
|
| 732 |
category = category if category else self.TASKS_CATEGORY_OVERALL
|
| 733 |
|
| 734 |
if len(tournament_results) == 0:
|