Added examples in

app.py CHANGED
@@ -318,6 +318,55 @@ if __name__ == "__main__":
 
 # ... rest of your Gradio app setup ...
 
+# Example evaluation metrics data
+EXAMPLE_METRICS = {
+    "Hallucination": {
+        "prompt": DEFAULT_EVAL_PROMPT,  # We'll replace these with actual examples
+        "input": DEFAULT_INPUT,
+        "response": DEFAULT_RESPONSE
+    },
+    "Precision": {
+        "prompt": DEFAULT_EVAL_PROMPT,
+        "input": DEFAULT_INPUT,
+        "response": DEFAULT_RESPONSE
+    },
+    "Recall": {
+        "prompt": DEFAULT_EVAL_PROMPT,
+        "input": DEFAULT_INPUT,
+        "response": DEFAULT_RESPONSE
+    },
+    "Logical coherence": {
+        "prompt": DEFAULT_EVAL_PROMPT,
+        "input": DEFAULT_INPUT,
+        "response": DEFAULT_RESPONSE
+    },
+    "Faithfulness": {
+        "prompt": DEFAULT_EVAL_PROMPT,
+        "input": DEFAULT_INPUT,
+        "response": DEFAULT_RESPONSE
+    }
+}
+
+def set_example_metric(metric_name):
+    if metric_name == "Custom":
+        return [
+            DEFAULT_EVAL_PROMPT,
+            DEFAULT_INPUT,
+            DEFAULT_RESPONSE
+        ]
+
+    metric_data = EXAMPLE_METRICS[metric_name]
+    return [
+        metric_data["prompt"],
+        metric_data["input"],
+        metric_data["response"]
+    ]
+
+# Select random metric at startup
+def get_random_metric():
+    metrics = list(EXAMPLE_METRICS.keys())
+    return set_example_metric(random.choice(metrics))
+
 with gr.Blocks(theme='default', css=CSS_STYLES) as demo:
     judge_id = gr.State(get_new_session_id())
     gr.Markdown(MAIN_TITLE)
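Two things worth noting about this hunk: `set_example_metric` returns a plain three-item list, which Gradio maps positionally onto the `outputs=` list wired to each button further down in the diff, and `get_random_metric` calls `random.choice`, so app.py needs `import random` at the top if it is not already there. A hypothetical check of the return contract (the `DEFAULT_*` constants are the ones app.py already defines):

# Not part of the commit: the return order must match the order of the
# components passed as `outputs=` in the click handlers below.
values = set_example_metric("Hallucination")
assert values == [DEFAULT_EVAL_PROMPT, DEFAULT_INPUT, DEFAULT_RESPONSE]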
@@ -331,6 +380,16 @@ with gr.Blocks(theme='default', css=CSS_STYLES) as demo:
     gr.Markdown(BATTLE_RULES)
     gr.Markdown(EVAL_DESCRIPTION)
 
+    # Add Example Metrics Section
+    with gr.Accordion("Example evaluation metrics", open=True):
+        with gr.Row():
+            custom_btn = gr.Button("Custom", variant="secondary")
+            hallucination_btn = gr.Button("Hallucination")
+            precision_btn = gr.Button("Precision")
+            recall_btn = gr.Button("Recall")
+            coherence_btn = gr.Button("Logical coherence")
+            faithfulness_btn = gr.Button("Faithfulness")
+
     # Eval Prompt and Variables side by side
     with gr.Row():
         # Left column - Eval Prompt
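Stripped down to a single output box, the accordion-of-preset-buttons pattern added here can be run on its own; the sketch below uses made-up preset text and component names, not the ones in app.py:

import gradio as gr

# Illustrative presets; app.py instead uses EXAMPLE_METRICS built from its
# DEFAULT_* constants.
PRESETS = {
    "Hallucination": "Did the response invent unsupported facts?",
    "Precision": "Is every statement in the response relevant?",
}

with gr.Blocks() as sketch:
    with gr.Accordion("Example evaluation metrics", open=True):
        with gr.Row():
            buttons = {name: gr.Button(name) for name in PRESETS}
    prompt_box = gr.Textbox(label="Eval Prompt")
    for name, btn in buttons.items():
        # Bind name via a default argument to dodge late-binding closures.
        btn.click(fn=lambda n=name: PRESETS[n], outputs=[prompt_box])

sketch.launch()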
@@ -582,4 +641,41 @@ with gr.Blocks(theme='default', css=CSS_STYLES) as demo:
         outputs=[leaderboard_table, stats_display]
     )
 
+    # Add click handlers for metric buttons
+    custom_btn.click(
+        fn=lambda: set_example_metric("Custom"),
+        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
+    )
+
+    hallucination_btn.click(
+        fn=lambda: set_example_metric("Hallucination"),
+        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
+    )
+
+    precision_btn.click(
+        fn=lambda: set_example_metric("Precision"),
+        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
+    )
+
+    recall_btn.click(
+        fn=lambda: set_example_metric("Recall"),
+        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
+    )
+
+    coherence_btn.click(
+        fn=lambda: set_example_metric("Logical coherence"),
+        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
+    )
+
+    faithfulness_btn.click(
+        fn=lambda: set_example_metric("Faithfulness"),
+        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
+    )
+
+    # Set random metric at startup
+    demo.load(
+        fn=get_random_metric,
+        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
+    )
+
 demo.launch()
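A possible follow-up, not part of this commit: the six near-identical click handlers could be registered in a loop. Binding the metric name through a default argument matters here, since a bare `lambda: set_example_metric(name)` written inside a loop would capture the loop variable late:

# Sketch assuming the button variables, helper, and output components
# introduced in the diff above.
metric_buttons = [
    (custom_btn, "Custom"),
    (hallucination_btn, "Hallucination"),
    (precision_btn, "Precision"),
    (recall_btn, "Recall"),
    (coherence_btn, "Logical coherence"),
    (faithfulness_btn, "Faithfulness"),
]
metric_outputs = [eval_prompt, variable_rows[0][1], variable_rows[1][1]]
for btn, name in metric_buttons:
    btn.click(fn=lambda n=name: set_example_metric(n), outputs=metric_outputs)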