remove an obsolete model

- app.py (+121 -54)
- context_window.json (+0 -1)

app.py CHANGED

@@ -403,19 +403,20 @@ def get_leaderboard_data(feedback_entry=None):
     pagerank_result = evalica.pagerank(
         feedback_df["left"], feedback_df["right"], feedback_df["winner"]
     )
+
     # Calculate consistency score as a pandas Series aligned with other metrics
+    is_result = pd.Series(
+        "N/A", index=elo_result.scores.index
+    )  # Initialize with "N/A" using the same index
 
     # Loop through models and update values
     for model in is_result.index:
         # Filter self-matches for this model
         self_matches = feedback_df[
+            (feedback_df["left"] == model) & (feedback_df["right"] == model)
         ]
         totals = len(self_matches)
+
         if totals:
             # Count non-draw outcomes (wins or losses)
             draws = self_matches[self_matches["winner"] == evalica.Winner.Draw].shape[0]
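
The hunk above only shows the draw count being computed; below is a minimal, self-contained sketch of the consistency metric it suggests (share of draws among a model's self-matches). The draw-rate formula and the plain string labels are assumptions; the real table stores evalica.Winner values and is indexed like elo_result.scores.

import pandas as pd

# Toy feedback table; the real app stores evalica.Winner values in "winner".
feedback_df = pd.DataFrame(
    {
        "left":   ["m1", "m1", "m2", "m2", "m1"],
        "right":  ["m1", "m1", "m2", "m2", "m2"],
        "winner": ["draw", "left", "draw", "draw", "right"],
    }
)

# "N/A" for models that never face themselves, mirroring the committed code.
consistency = pd.Series("N/A", index=["m1", "m2"], dtype=object)
for model in consistency.index:
    self_matches = feedback_df[
        (feedback_df["left"] == model) & (feedback_df["right"] == model)
    ]
    totals = len(self_matches)
    if totals:
        draws = (self_matches["winner"] == "draw").sum()
        consistency[model] = round(draws / totals, 2)  # assumed definition of the score

print(consistency)  # m1 -> 0.5, m2 -> 1.0
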
@@ -681,23 +682,36 @@ with gr.Blocks() as app:
         # Here we default to fail open, but you can change as needed.
         return True
 
+    def disable_first_submit_ui():
+        """First function to immediately disable UI elements"""
+        return (
+            # [0] guardrail_message: hide
+            gr.update(visible=False),
+            # [1] shared_input: disable but keep visible
+            gr.update(interactive=False),
+            # [2] repo_url: disable but keep visible
+            gr.update(interactive=False),
+            # [3] send_first: disable and show loading state
+            gr.update(interactive=False, value="Processing..."),
+        )
+
     # Function to update model titles and responses
     def update_model_titles_and_responses(
         repo_url, user_input, models_state, conversation_state
     ):
         # Guardrail check first
         if not repo_url and not guardrail_check_se_relevance(user_input):
+            # Return updates to show the guardrail message and re-enable UI
             return (
                 # [0] guardrail_message: Show guardrail message
                 gr.update(
                     value="### Oops! Try asking something about software engineering. Thanks!",
                     visible=True,
                 ),
+                # [1] shared_input: clear and re-enable
+                gr.update(value="", interactive=True, visible=True),
+                # [2] repo_url: clear and re-enable
+                gr.update(value="", interactive=True, visible=True),
                 # [3] user_prompt_md: clear and hide
                 gr.update(value="", visible=False),
                 # [4] response_a_title: clear and hide
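
Both disable_first_submit_ui and the guardrail branch rely on Gradio's convention that a handler returns one value (or gr.update) per component in the event's outputs list, in the same order, which is what the [0]…[12] index comments track. A minimal sketch with hypothetical component names:

import gradio as gr

def reset_form():
    # One update per output component, in order.
    return (
        gr.update(value="", interactive=True),                 # -> textbox
        gr.update(interactive=False, value="Processing..."),   # -> button
    )

with gr.Blocks() as demo:
    textbox = gr.Textbox(label="Prompt")
    button = gr.Button("Submit")
    button.click(fn=reset_form, inputs=[], outputs=[textbox, button])
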
@@ -712,8 +726,8 @@ with gr.Blocks() as app:
                 gr.update(visible=False),
                 # [9] vote_panel: hide
                 gr.update(visible=False),
+                # [10] send_first: re-enable button with original text
+                gr.update(visible=True, interactive=True, value="Submit"),
                 # [11] feedback: enable the selection
                 gr.update(interactive=True),
                 # [12] models_state: pass state as-is
@@ -760,10 +774,10 @@ with gr.Blocks() as app:
             return (
                 # [0] guardrail_message: hide
                 gr.update(visible=False),
+                # [1] shared_input: re-enable and clear
+                gr.update(value="", interactive=True, visible=True),
+                # [2] repo_url: re-enable and clear
+                gr.update(value="", interactive=True, visible=True),
                 # [3] user_prompt_md: hide
                 gr.update(value="", visible=False),
                 # [4] response_a_title: hide
@@ -778,8 +792,8 @@ with gr.Blocks() as app:
                 gr.update(visible=False),
                 # [9] vote_panel: hide
                 gr.update(visible=False),
+                # [10] send_first: re-enable with original text
+                gr.update(visible=True, interactive=True, value="Submit"),
                 # [11] feedback: disable
                 gr.update(interactive=False),
                 # [12] models_state: pass state as-is
@@ -806,10 +820,10 @@ with gr.Blocks() as app:
         return (
             # [0] guardrail_message: hide (since no guardrail issue)
             gr.update(visible=False),
+            # [1] shared_input: re-enable but hide
+            gr.update(interactive=True, visible=False),
+            # [2] repo_url: re-enable but hide
+            gr.update(interactive=True, visible=False),
             # [3] user_prompt_md: display the user's query
             gr.update(value=f"**Your Query:**\n\n{user_input}", visible=True),
             # [4] response_a_title: show title for Model A
@@ -824,8 +838,8 @@ with gr.Blocks() as app:
             gr.update(visible=True),
             # [9] vote_panel: show vote panel
             gr.update(visible=True),
+            # [10] send_first: hide the submit button but restore label
+            gr.update(visible=False, value="Submit"),
             # [11] feedback: enable the feedback selection
             gr.update(interactive=True),
             # [12] models_state: pass updated models_state
@@ -915,9 +929,20 @@ with gr.Blocks() as app:
 
     # First round handling
     send_first.click(
+        fn=hide_thanks_message,
+        inputs=[],
+        outputs=[thanks_message]
     ).then(
+        fn=disable_first_submit_ui,  # First disable UI
+        inputs=[],
+        outputs=[
+            guardrail_message,
+            shared_input,
+            repo_url,
+            send_first  # Just the essential UI elements to update immediately
+        ]
+    ).then(
+        fn=update_model_titles_and_responses,  # Then do the actual processing
         inputs=[repo_url, shared_input, models_state, conversation_state],
         outputs=[
             guardrail_message,
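
This wiring is the main app.py change in the diff: send_first.click first runs a cheap function that disables the inputs and relabels the button, and only the chained .then() step invokes the slow handler, so the user gets immediate feedback. A runnable sketch of the same pattern with hypothetical component names:

import time
import gradio as gr

def disable_ui():
    # Runs immediately, so the button greys out before the slow call starts.
    return gr.update(interactive=False, value="Processing...")

def slow_task(prompt):
    time.sleep(3)  # stand-in for the real model call
    return f"Echo: {prompt}", gr.update(interactive=True, value="Submit")

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    result = gr.Markdown()
    submit = gr.Button("Submit")
    submit.click(fn=disable_ui, inputs=[], outputs=[submit]).then(
        fn=slow_task, inputs=[prompt], outputs=[result, submit]
    )

demo.launch()
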
@@ -941,6 +966,15 @@ with gr.Blocks() as app:
         ],
     )
 
+    def disable_model_a_ui():
+        """First function to immediately disable model A UI elements"""
+        return (
+            # [0] model_a_input: disable
+            gr.update(interactive=False),
+            # [1] model_a_send: disable and show loading state
+            gr.update(interactive=False, value="Processing...")
+        )
+
     # Handle subsequent rounds
     def handle_model_a_send(user_input, models_state, conversation_state):
         try:
@@ -952,10 +986,8 @@ with gr.Blocks() as app:
                 response,
                 conversation_state,
                 gr.update(visible=False),
+                gr.update(value="", interactive=True),  # Clear and enable model_a_input
+                gr.update(interactive=False, value="Send to Model A"),  # Reset button text
             )
         except TimeoutError as e:
             # Disable inputs when timeout occurs
@@ -963,12 +995,19 @@ with gr.Blocks() as app:
                 gr.update(value=""),  # Clear response
                 conversation_state,
                 gr.update(visible=True),  # Show the timeout popup
+                gr.update(interactive=True),  # Re-enable model_a_input
+                gr.update(interactive=True, value="Send to Model A"),  # Re-enable model_a_send button
             )
         except Exception as e:
             raise gr.Error(str(e))
+    def disable_model_b_ui():
+        """First function to immediately disable model B UI elements"""
+        return (
+            # [0] model_b_input: disable
+            gr.update(interactive=False),
+            # [1] model_b_send: disable and show loading state
+            gr.update(interactive=False, value="Processing...")
+        )
     def handle_model_b_send(user_input, models_state, conversation_state):
         try:
             response = chat_with_models(
|
| 1019 |
conversation_state,
|
| 1020 |
gr.update(visible=False),
|
| 1021 |
+
gr.update(value="", interactive=True), # Clear and enable model_b_input
|
| 1022 |
+
gr.update(interactive=False, value="Send to Model B"), # Reset button text
|
|
|
|
|
|
|
| 1023 |
)
|
| 1024 |
except TimeoutError as e:
|
| 1025 |
# Disable inputs when timeout occurs
|
|
|
|
@@ -990,14 +1027,21 @@ with gr.Blocks() as app:
                 gr.update(value=""),  # Clear response
                 conversation_state,
                 gr.update(visible=True),  # Show the timeout popup
+                gr.update(interactive=True),  # Re-enable model_b_input
+                gr.update(interactive=True, value="Send to Model B"),  # Re-enable model_b_send button
             )
         except Exception as e:
             raise gr.Error(str(e))
 
     model_a_send.click(
+        fn=disable_model_a_ui,  # First disable UI
+        inputs=[],
+        outputs=[
+            model_a_input,
+            model_a_send
+        ]
+    ).then(
+        fn=handle_model_a_send,  # Then do the actual processing
         inputs=[model_a_input, models_state, conversation_state],
         outputs=[
             response_a,
@@ -1008,7 +1052,14 @@ with gr.Blocks() as app:
         ],
     )
     model_b_send.click(
+        fn=disable_model_b_ui,  # First disable UI
+        inputs=[],
+        outputs=[
+            model_b_input,
+            model_b_send
+        ]
+    ).then(
+        fn=handle_model_b_send,  # Then do the actual processing
         inputs=[model_b_input, models_state, conversation_state],
         outputs=[
             response_b,
@@ -1050,19 +1101,35 @@ with gr.Blocks() as app:
 
         # Adjust output count to match the interface definition
         return (
+            gr.update(
+                value="", interactive=True, visible=True
+            ),  # [0] Clear shared_input textbox
+            gr.update(
+                value="", interactive=True, visible=True
+            ),  # [1] Clear repo_url textbox
+            gr.update(
+                value="", visible=False
+            ),  # [2] Hide user_prompt_md markdown component
+            gr.update(
+                value="", visible=False
+            ),  # [3] Hide response_a_title markdown component
+            gr.update(
+                value="", visible=False
+            ),  # [4] Hide response_b_title markdown component
+            gr.update(value=""),  # [5] Clear Model A response markdown component
+            gr.update(value=""),  # [6] Clear Model B response markdown component
+            gr.update(visible=False),  # [7] Hide multi_round_inputs row
+            gr.update(visible=False),  # [8] Hide vote_panel row
+            gr.update(
+                value="Submit", interactive=True, visible=True
+            ),  # [9] Reset send_first button
+            gr.update(
+                value="Can't Decide", interactive=True
+            ),  # [10] Reset feedback radio selection
+            get_leaderboard_data(feedback_entry),  # [11] Updated leaderboard data
+            gr.update(
+                visible=True
+            ),  # [12] Show the thanks_message markdown component
         )
 
     # Update the click event for the submit feedback button
context_window.json CHANGED

@@ -22,7 +22,6 @@
     "grok-3-beta": 1000000,
     "grok-3-mini-fast-beta": 1000000,
     "grok-3-mini-beta": 1000000,
-    "llama-3.1-8b": 128000,
     "llama-3.1-405b": 128000,
     "llama-3.3-70b": 128000,
     "llama4-scout-instruct-basic": 10000000,
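
context_window.json maps model names to maximum context lengths in tokens; removing "llama-3.1-8b" is the obsolete-model cleanup named in the commit title. How app.py consumes this file is not part of the diff; a small assumed-usage sketch that fails loudly when a model no longer has an entry:

import json

with open("context_window.json") as f:
    context_windows = json.load(f)  # model name -> max context length in tokens

def max_context_for(model_name):
    try:
        return context_windows[model_name]
    except KeyError:
        raise ValueError(
            f"{model_name} has no context_window.json entry; it may have been removed as obsolete"
        ) from None

print(max_context_for("llama-3.1-405b"))  # 128000
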