Update app.py

app.py CHANGED
@@ -13,10 +13,15 @@ load_dotenv()
import gradio as gr
from gen_api_answer import (
    get_model_response,
-    parse_model_response,
+    parse_model_response,
+    alternative_parse_model_response
+)
+
+from random_sample_generation import (
    get_random_human_ai_pair,
+    get_random_human_ai_ground_truth_pair,
    generate_ai_response
-)
+)
from db import add_vote, create_db_connection, get_votes
from utils import Vote
from common import (
@@ -33,6 +38,12 @@ from common import (
    VOTING_HEADER,
    DEFAULT_EVAL_PROMPT_EDITABLE,
    FIXED_EVAL_SUFFIX,
+    DEFAULT_EVAL_CRITERIA,
+    DEFAULT_SCORE_1,
+    DEFAULT_SCORE_2,
+    DEFAULT_SCORE_3,
+    DEFAULT_SCORE_4,
+    DEFAULT_SCORE_5,
)
from leaderboard import (
    get_leaderboard,
@@ -292,9 +303,16 @@ leaderboard_table = gr.Dataframe(
)


-def populate_random_example(request: gr.Request):
+def populate_random_example(request: gr.Request, compatible_mode: bool):
    """Generate a random human-AI conversation example and reset judge outputs."""
-
+    if compatible_mode:
+        # Generate all three components when compatible mode is enabled
+        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
+    else:
+        # Generate only human and AI messages when compatible mode is disabled
+        human_msg, ai_msg = get_random_human_ai_pair()
+        ground_truth_msg = ""
+
    return [
        gr.update(value=human_msg),
        gr.update(value=ai_msg),
@@ -308,6 +326,7 @@ def populate_random_example(request: gr.Request):
        gr.update(interactive=False, variant="primary"), # Reset vote tie
        gr.update(value="*Model: Hidden*"), # Reset model name A
        gr.update(value="*Model: Hidden*"), # Reset model name B
+        gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
    ]


@@ -345,6 +364,14 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 345 |
placeholder="Enter the AI response here..."
|
| 346 |
)
|
| 347 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
with gr.Row():
|
| 349 |
random_btn = gr.Button("🎲", scale=2)
|
| 350 |
send_btn = gr.Button(
|
|
@@ -381,22 +408,86 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 381 |
vote_b = gr.Button("Vote B", variant="primary", interactive=False)
|
| 382 |
with gr.Column(scale=9, min_width=400): # Wider width for critique
|
| 383 |
critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
|
| 384 |
-
|
| 385 |
|
| 386 |
gr.Markdown("<br>")
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
|
|
|
| 394 |
)
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
|
| 401 |
with gr.TabItem("Leaderboard"):
|
| 402 |
with gr.Row():
|
|
@@ -404,7 +495,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 404 |
show_preliminary = gr.Checkbox(
|
| 405 |
label="Reveal preliminary results",
|
| 406 |
value=True, # Checked by default
|
| 407 |
-
info="Show all models, including models with less
|
| 408 |
interactive=True
|
| 409 |
)
|
| 410 |
stats_display = gr.Markdown()
|
|
@@ -412,7 +503,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 412 |
headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
|
| 413 |
datatype=["str", "number", "str", "number", "str", "str", "str"],
|
| 414 |
)
|
| 415 |
-
|
| 416 |
gr.Markdown("""<br>
|
| 417 |
<br>
|
| 418 |
Judge Arena uses Together AI for inference of open-source models. FP8 models are named as -- "Turbo" where the performance of the FP16 reference models is closely matched:
|
|
@@ -444,62 +535,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 444 |
final_prompt_state = gr.State()
|
| 445 |
eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
|
| 446 |
is_editing = gr.State(False) # Track editing state
|
| 447 |
-
|
| 448 |
-
# Update variable inputs based on the eval prompt
|
| 449 |
-
#def update_variables(eval_prompt):
|
| 450 |
-
# variables = parse_variables(eval_prompt)
|
| 451 |
-
# updates = []
|
| 452 |
-
|
| 453 |
-
# for i in range(len(variable_rows)):
|
| 454 |
-
# var_row, var_input = variable_rows[i]
|
| 455 |
-
# if i < len(variables):
|
| 456 |
-
# var_name = variables[i]
|
| 457 |
-
# # Set the number of lines based on the variable name
|
| 458 |
-
# if var_name == "response":
|
| 459 |
-
# lines = 4 # Adjust this number as needed
|
| 460 |
-
# else:
|
| 461 |
-
# lines = 1 # Default to single line for other variables
|
| 462 |
-
# updates.extend(
|
| 463 |
-
# [
|
| 464 |
-
# gr.update(visible=True), # Show the variable row
|
| 465 |
-
# gr.update(
|
| 466 |
-
# label=var_name, visible=True, lines=lines
|
| 467 |
-
# ), # Update label and lines
|
| 468 |
-
# ]
|
| 469 |
-
# )
|
| 470 |
-
# else:
|
| 471 |
-
# updates.extend(
|
| 472 |
-
# [
|
| 473 |
-
# gr.update(visible=False), # Hide the variable row
|
| 474 |
-
# gr.update(value="", visible=False), # Clear value when hidden
|
| 475 |
-
# ]
|
| 476 |
-
# )
|
| 477 |
-
# return updates
|
| 478 |
-
|
| 479 |
-
#eval_prompt.change(
|
| 480 |
-
# fn=update_variables,
|
| 481 |
-
# inputs=eval_prompt,
|
| 482 |
-
# outputs=[item for sublist in variable_rows for item in sublist],
|
| 483 |
-
#)
|
| 484 |
-
|
| 485 |
-
# Regenerate button functionality
|
| 486 |
-
#regenerate_button.click(
|
| 487 |
-
# fn=regenerate_prompt,
|
| 488 |
-
# inputs=[model_a_state, model_b_state, eval_prompt, human_input, ai_response],
|
| 489 |
-
# outputs=[
|
| 490 |
-
# score_a,
|
| 491 |
-
# critique_a,
|
| 492 |
-
# score_b,
|
| 493 |
-
# critique_b,
|
| 494 |
-
# vote_a,
|
| 495 |
-
# vote_b,
|
| 496 |
-
# tie_button_row,
|
| 497 |
-
# model_name_a,
|
| 498 |
-
# model_name_b,
|
| 499 |
-
# model_a_state,
|
| 500 |
-
# model_b_state,
|
| 501 |
-
# ],
|
| 502 |
-
#)
|
| 503 |
|
| 504 |
# Update model names after responses are generated
|
| 505 |
def update_model_names(model_a, model_b):
|
|
@@ -621,39 +657,128 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 621 |
outputs=edit_buttons_row
|
| 622 |
)
|
| 623 |
|
| 624 |
-
#
|
| 625 |
-
def
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
model_a,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 636 |
model_b,
|
|
|
|
| 637 |
final_prompt,
|
| 638 |
-
|
|
|
|
| 639 |
|
| 640 |
-
# Parse the responses
|
| 641 |
-
|
| 642 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 643 |
|
| 644 |
# Only append "/ 5" if using the default prompt
|
| 645 |
-
if editable_prompt.strip() == DEFAULT_EVAL_PROMPT_EDITABLE.strip():
|
| 646 |
-
|
| 647 |
-
|
| 648 |
|
| 649 |
-
# Update the last_submission state
|
| 650 |
-
last_submission.value = {"prompt":
|
| 651 |
|
| 652 |
return (
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
gr.update(interactive=True, variant="primary"), # vote_a
|
| 658 |
gr.update(interactive=True, variant="primary"), # vote_b
|
| 659 |
gr.update(interactive=True, variant="primary"), # vote_tie
|
|
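The compatible-mode branch above calls `alternative_parse_model_response`, which this diff only imports; its implementation lives in `gen_api_answer.py` and is not shown here. As a rough illustration only, assuming it mirrors `parse_model_response` by returning a `(score, critique)` pair and that the judge follows the "Feedback: ... [RESULT] N" format the prompt requests, such a parser could look like this (hypothetical name and behavior, not the repository's actual code):

```python
import re

def alternative_parse_model_response_sketch(output: str):
    """Illustrative sketch: split a Prometheus-style reply into (score, critique).

    Assumes the judge followed the requested format
    "Feedback: (critique) [RESULT] (integer 1-5)"; the real helper in
    gen_api_answer.py may handle edge cases differently.
    """
    # Pull the integer that follows the [RESULT] tag, if any
    match = re.search(r"\[RESULT\]\s*(\d+)", output)
    score = match.group(1) if match else "N/A"
    # Everything before [RESULT], minus the "Feedback:" prefix, is the critique
    critique = re.split(r"\[RESULT\]", output)[0]
    critique = re.sub(r"^\s*Feedback:\s*", "", critique).strip()
    return score, critique
```

For example, `alternative_parse_model_response_sketch("Feedback: Clear and grounded. [RESULT] 4")` would return `("4", "Clear and grounded.")`.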
@@ -662,18 +787,26 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 662 |
final_prompt,
|
| 663 |
gr.update(value="*Model: Hidden*"),
|
| 664 |
gr.update(value="*Model: Hidden*"),
|
| 665 |
-
gr.update(
|
| 666 |
-
value="Regenerate judges",
|
| 667 |
-
variant="secondary",
|
| 668 |
-
interactive=True
|
| 669 |
-
),
|
| 670 |
gr.update(value="🎲"), # random_btn
|
| 671 |
)
|
| 672 |
|
| 673 |
# Update the click handler to use the editable prompt
|
| 674 |
send_btn.click(
|
| 675 |
fn=submit_and_store,
|
| 676 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 677 |
outputs=[
|
| 678 |
score_a,
|
| 679 |
critique_a,
|
|
@@ -692,64 +825,10 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 692 |
],
|
| 693 |
)
|
| 694 |
|
| 695 |
-
# Update the input change handlers to also disable regenerate button
|
| 696 |
-
# def handle_input_changes(prompt, *variables):
|
| 697 |
-
# """Enable send button and manage regenerate button based on input changes"""
|
| 698 |
-
# last_inputs = last_submission.value
|
| 699 |
-
# current_inputs = {"prompt": prompt, "variables": variables}
|
| 700 |
-
# inputs_changed = last_inputs != current_inputs
|
| 701 |
-
# return [
|
| 702 |
-
# gr.update(interactive=True), # send button always enabled
|
| 703 |
-
# gr.update(
|
| 704 |
-
# interactive=not inputs_changed
|
| 705 |
-
# ), # regenerate button disabled if inputs changed
|
| 706 |
-
# ]
|
| 707 |
-
|
| 708 |
-
# Update the change handlers for prompt and variables
|
| 709 |
-
#eval_prompt.change(
|
| 710 |
-
# fn=handle_input_changes,
|
| 711 |
-
# inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
|
| 712 |
-
# outputs=[send_btn, regenerate_button],
|
| 713 |
-
#)
|
| 714 |
-
|
| 715 |
-
# for _, var_input in variable_rows:
|
| 716 |
-
# var_input.change(
|
| 717 |
-
# fn=handle_input_changes,
|
| 718 |
-
# inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
|
| 719 |
-
# outputs=[send_btn, regenerate_button],
|
| 720 |
-
# )
|
| 721 |
-
|
| 722 |
-
# Add click handlers for metric buttons
|
| 723 |
-
#outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
|
| 724 |
-
|
| 725 |
-
#custom_btn.click(fn=lambda: set_example_metric("Custom"), outputs=outputs_list)
|
| 726 |
-
|
| 727 |
-
#hallucination_btn.click(
|
| 728 |
-
# fn=lambda: set_example_metric("Hallucination"), outputs=outputs_list
|
| 729 |
-
#)
|
| 730 |
-
|
| 731 |
-
#precision_btn.click(fn=lambda: set_example_metric("Precision"), outputs=outputs_list)
|
| 732 |
-
|
| 733 |
-
#recall_btn.click(fn=lambda: set_example_metric("Recall"), outputs=outputs_list)
|
| 734 |
-
|
| 735 |
-
#coherence_btn.click(
|
| 736 |
-
# fn=lambda: set_example_metric("Logical_Coherence"), outputs=outputs_list
|
| 737 |
-
#)
|
| 738 |
-
|
| 739 |
-
#faithfulness_btn.click(
|
| 740 |
-
# fn=lambda: set_example_metric("Faithfulness"), outputs=outputs_list
|
| 741 |
-
#)
|
| 742 |
-
|
| 743 |
-
# Set default metric at startup
|
| 744 |
-
demo.load(
|
| 745 |
-
#fn=lambda: set_example_metric("Hallucination"),
|
| 746 |
-
#outputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
|
| 747 |
-
)
|
| 748 |
-
|
| 749 |
# Add random button handler
|
| 750 |
random_btn.click(
|
| 751 |
fn=populate_random_example,
|
| 752 |
-
inputs=[],
|
| 753 |
outputs=[
|
| 754 |
human_input,
|
| 755 |
ai_response,
|
|
@@ -763,6 +842,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 763 |
vote_tie,
|
| 764 |
model_name_a,
|
| 765 |
model_name_b,
|
|
|
|
| 766 |
]
|
| 767 |
)
|
| 768 |
|
|
@@ -810,10 +890,149 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 810 |
|
| 811 |
# Update the demo.load to include the random example population
|
| 812 |
demo.load(
|
| 813 |
-
fn=populate_random_example,
|
| 814 |
inputs=[],
|
| 815 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 816 |
)
|
| 817 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
if __name__ == "__main__":
|
| 819 |
demo.launch()
|
|
|
|
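The `show_compatible_edit_buttons` handler added in the hunk above depends on the ordering of its `inputs` list: live components and their `*_previous` State values are interleaved, so even indices carry current values and odd indices the last saved ones. A minimal standalone illustration of that slicing, using made-up values rather than anything from this repository:

```python
# Hypothetical values, interleaved as (current, previous) pairs like the inputs list above
values = ("New criteria", "Old criteria", "Score 1 text", "Score 1 text")
current, previous = values[::2], values[1::2]
# The Save/Cancel row becomes visible as soon as any current value differs from its saved one
print(any(c != p for c, p in zip(current, previous)))  # True
```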