Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -61,7 +61,7 @@ def run_toy_evaluation():
|
|
| 61 |
# 3. MMLU Evaluation call
|
| 62 |
# ---------------------------------------------------------------------------
|
| 63 |
@spaces.GPU(duration=120) # Allow up to 2 minutes for full evaluation
|
| 64 |
-
def run_mmlu_evaluation(all_subjects, num_subjects, num_shots,
|
| 65 |
"""
|
| 66 |
Runs the MMLU evaluation with the specified parameters.
|
| 67 |
|
|
@@ -69,7 +69,8 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, num_examples):
|
|
| 69 |
all_subjects (bool): Whether to evaluate all subjects
|
| 70 |
num_subjects (int): Number of subjects to evaluate (1-57)
|
| 71 |
num_shots (int): Number of few-shot examples (0-5)
|
| 72 |
-
|
|
|
|
| 73 |
"""
|
| 74 |
|
| 75 |
if not model_loaded:
|
|
@@ -81,13 +82,17 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, num_examples):
|
|
| 81 |
# Convert num_subjects to -1 if all_subjects is True
|
| 82 |
if all_subjects:
|
| 83 |
num_subjects = -1
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
# Run evaluation
|
| 86 |
results = evaluate_mmlu(
|
| 87 |
model,
|
| 88 |
tokenizer,
|
| 89 |
num_subjects=num_subjects,
|
| 90 |
-
num_questions=
|
| 91 |
num_shots=num_shots
|
| 92 |
)
|
| 93 |
|
|
@@ -138,13 +143,13 @@ with gr.Blocks() as demo:
|
|
| 138 |
with gr.Row():
|
| 139 |
all_subjects_checkbox = gr.Checkbox(
|
| 140 |
label="Evaluate All Subjects",
|
| 141 |
-
value=
|
| 142 |
info="When checked, evaluates all 57 MMLU subjects"
|
| 143 |
)
|
| 144 |
num_subjects_slider = gr.Slider(
|
| 145 |
minimum=1,
|
| 146 |
maximum=57,
|
| 147 |
-
value=
|
| 148 |
step=1,
|
| 149 |
label="Number of Subjects",
|
| 150 |
info="Number of subjects to evaluate (1-57). They will be loaded in alphabetical order.",
|
|
@@ -155,18 +160,26 @@ with gr.Blocks() as demo:
|
|
| 155 |
num_shots_slider = gr.Slider(
|
| 156 |
minimum=0,
|
| 157 |
maximum=5,
|
| 158 |
-
value=5,
|
| 159 |
step=1,
|
| 160 |
label="Number of Few-shot Examples",
|
| 161 |
info="Number of examples to use for few-shot learning (0-5). They will be loaded in alphabetical order."
|
| 162 |
)
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
minimum=1,
|
| 165 |
-
maximum=
|
| 166 |
-
value=
|
| 167 |
step=1,
|
| 168 |
-
label="
|
| 169 |
-
info="
|
|
|
|
| 170 |
)
|
| 171 |
|
| 172 |
with gr.Row():
|
|
@@ -184,12 +197,31 @@ with gr.Blocks() as demo:
|
|
| 184 |
)
|
| 185 |
|
| 186 |
# Update num_subjects_slider interactivity based on all_subjects checkbox
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
all_subjects_checkbox.change(
|
| 188 |
-
fn=
|
| 189 |
inputs=[all_subjects_checkbox],
|
| 190 |
outputs=[num_subjects_slider]
|
| 191 |
)
|
| 192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
# Connect MMLU evaluation button
|
| 194 |
eval_mmlu_button.click(
|
| 195 |
fn=run_mmlu_evaluation,
|
|
@@ -197,9 +229,10 @@ with gr.Blocks() as demo:
|
|
| 197 |
all_subjects_checkbox,
|
| 198 |
num_subjects_slider,
|
| 199 |
num_shots_slider,
|
| 200 |
-
|
|
|
|
| 201 |
],
|
| 202 |
outputs=results_output
|
| 203 |
)
|
| 204 |
|
| 205 |
-
demo.launch()
|
|
|
|
| 61 |
# 3. MMLU Evaluation call
|
| 62 |
# ---------------------------------------------------------------------------
|
| 63 |
@spaces.GPU(duration=120) # Allow up to 2 minutes for full evaluation
|
| 64 |
+
def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions):
|
| 65 |
"""
|
| 66 |
Runs the MMLU evaluation with the specified parameters.
|
| 67 |
|
|
|
|
| 69 |
all_subjects (bool): Whether to evaluate all subjects
|
| 70 |
num_subjects (int): Number of subjects to evaluate (1-57)
|
| 71 |
num_shots (int): Number of few-shot examples (0-5)
|
| 72 |
+
all_questions (bool): Whether to evaluate all questions per subject
|
| 73 |
+
num_questions (int): Number of examples per subject (1-20 or -1 for all)
|
| 74 |
"""
|
| 75 |
|
| 76 |
if not model_loaded:
|
|
|
|
| 82 |
# Convert num_subjects to -1 if all_subjects is True
|
| 83 |
if all_subjects:
|
| 84 |
num_subjects = -1
|
| 85 |
+
|
| 86 |
+
# Convert num_questions to -1 if all_questions is True
|
| 87 |
+
if all_questions:
|
| 88 |
+
num_questions = -1
|
| 89 |
|
| 90 |
# Run evaluation
|
| 91 |
results = evaluate_mmlu(
|
| 92 |
model,
|
| 93 |
tokenizer,
|
| 94 |
num_subjects=num_subjects,
|
| 95 |
+
num_questions=num_questions,
|
| 96 |
num_shots=num_shots
|
| 97 |
)
|
| 98 |
|
|
|
|
| 143 |
with gr.Row():
|
| 144 |
all_subjects_checkbox = gr.Checkbox(
|
| 145 |
label="Evaluate All Subjects",
|
| 146 |
+
value=False, # Default is unchecked
|
| 147 |
info="When checked, evaluates all 57 MMLU subjects"
|
| 148 |
)
|
| 149 |
num_subjects_slider = gr.Slider(
|
| 150 |
minimum=1,
|
| 151 |
maximum=57,
|
| 152 |
+
value=10, # Default is 10 subjects
|
| 153 |
step=1,
|
| 154 |
label="Number of Subjects",
|
| 155 |
info="Number of subjects to evaluate (1-57). They will be loaded in alphabetical order.",
|
|
|
|
| 160 |
num_shots_slider = gr.Slider(
|
| 161 |
minimum=0,
|
| 162 |
maximum=5,
|
| 163 |
+
value=5, # Default is 5 few-shot examples
|
| 164 |
step=1,
|
| 165 |
label="Number of Few-shot Examples",
|
| 166 |
info="Number of examples to use for few-shot learning (0-5). They will be loaded in alphabetical order."
|
| 167 |
)
|
| 168 |
+
|
| 169 |
+
with gr.Row():
|
| 170 |
+
all_questions_checkbox = gr.Checkbox(
|
| 171 |
+
label="Evaluate All Questions",
|
| 172 |
+
value=False, # Default is unchecked
|
| 173 |
+
info="When checked, evaluates all available questions for each subject"
|
| 174 |
+
)
|
| 175 |
+
num_questions_slider = gr.Slider(
|
| 176 |
minimum=1,
|
| 177 |
+
maximum=20,
|
| 178 |
+
value=10, # Default is 10 questions
|
| 179 |
step=1,
|
| 180 |
+
label="Questions per Subject",
|
| 181 |
+
info="Choose a subset of questions (1-20), or click the checkbox for All Questions",
|
| 182 |
+
interactive=True
|
| 183 |
)
|
| 184 |
|
| 185 |
with gr.Row():
|
|
|
|
| 197 |
)
|
| 198 |
|
| 199 |
# Update num_subjects_slider interactivity based on all_subjects checkbox
|
| 200 |
+
def update_subjects_slider(checked):
    """Build the Gradio update applied to the subjects slider.

    Used as the ``change`` handler of the "Evaluate All Subjects" checkbox,
    with the subjects slider as its only output.

    Args:
        checked: Current state of the "Evaluate All Subjects" checkbox.

    Returns:
        A ``gr.update`` payload. When the box is ticked the slider is pinned
        to 57 (its maximum) and made read-only; otherwise the slider is
        simply made editable again and its current value is left untouched.
    """
    # Guard clause: unchecked only restores interactivity.
    if not checked:
        return gr.update(interactive=True)

    # Checked: show the full subject count and lock the control.
    return gr.update(value=57, interactive=False)
|
| 205 |
+
|
| 206 |
all_subjects_checkbox.change(
|
| 207 |
+
fn=update_subjects_slider,
|
| 208 |
inputs=[all_subjects_checkbox],
|
| 209 |
outputs=[num_subjects_slider]
|
| 210 |
)
|
| 211 |
|
| 212 |
+
# Update num_questions_slider interactivity based on all_questions checkbox
|
| 213 |
+
def update_questions_slider(checked):
    """Build the Gradio update applied to the questions-per-subject slider.

    Used as the ``change`` handler of the "Evaluate All Questions" checkbox,
    with the questions slider as its only output.

    Args:
        checked: Current state of the "Evaluate All Questions" checkbox.

    Returns:
        A ``gr.update`` payload that makes the slider read-only while the
        box is ticked and editable otherwise. The slider value is not changed.
    """
    # The slider is editable exactly when "all questions" is NOT selected.
    return gr.update(interactive=not checked)
|
| 218 |
+
|
| 219 |
+
all_questions_checkbox.change(
|
| 220 |
+
fn=update_questions_slider,
|
| 221 |
+
inputs=[all_questions_checkbox],
|
| 222 |
+
outputs=[num_questions_slider]
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
# Connect MMLU evaluation button
|
| 226 |
eval_mmlu_button.click(
|
| 227 |
fn=run_mmlu_evaluation,
|
|
|
|
| 229 |
all_subjects_checkbox,
|
| 230 |
num_subjects_slider,
|
| 231 |
num_shots_slider,
|
| 232 |
+
all_questions_checkbox,
|
| 233 |
+
num_questions_slider
|
| 234 |
],
|
| 235 |
outputs=results_output
|
| 236 |
)
|
| 237 |
|
| 238 |
+
demo.launch()
|