Spaces:
Running
on
Zero
Running
on
Zero
Refine vote options
Browse files
app.py
CHANGED
|
@@ -7,18 +7,56 @@ import time
|
|
| 7 |
import numpy as np
|
| 8 |
from utils.data_loader import get_random_example
|
| 9 |
from utils.models import generate_summaries, model_names
|
| 10 |
-
from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
|
| 11 |
from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
|
| 12 |
from utils.vote_logger import save_vote_details
|
| 13 |
from utils.shared import generation_interrupt
|
| 14 |
|
| 15 |
feedback_options = {
|
| 16 |
-
"left": [
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
}
|
| 23 |
|
| 24 |
def weighted_sample_without_replacement(population, weights, k=2):
|
|
@@ -248,7 +286,7 @@ def show_loading_state():
|
|
| 248 |
gr.update(visible=False), # feedback_section
|
| 249 |
gr.update(interactive=False), # submit_button
|
| 250 |
gr.update(visible=False), # results_reveal_area
|
| 251 |
-
gr.update(interactive=
|
| 252 |
None # Reset selected_winner
|
| 253 |
]
|
| 254 |
|
|
@@ -268,6 +306,15 @@ def update_ui_for_new_context(example):
|
|
| 268 |
False
|
| 269 |
]
|
| 270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
def cleanup_on_disconnect():
|
| 272 |
print(f"Browser disconnected. Cleaning up resources...")
|
| 273 |
generation_interrupt.set()
|
|
@@ -321,13 +368,14 @@ with gr.Blocks(theme=gr.themes.Default(
|
|
| 321 |
show_results_state = gr.State(False)
|
| 322 |
results_agg = gr.State(load_leaderboard_data())
|
| 323 |
show_full_context = gr.State(False)
|
|
|
|
| 324 |
faq_expanded = gr.State(False) # State for FAQ toggle
|
| 325 |
|
| 326 |
with gr.Tabs() as tabs:
|
| 327 |
with gr.TabItem("Arena", id="arena-tab"):
|
| 328 |
-
gr.Markdown("#
|
| 329 |
gr.Markdown("""
|
| 330 |
-
🏟️ This arena evaluates how well
|
| 331 |
|
| 332 |
📝 Instructions:
|
| 333 |
- **Click the "Get a Question" button** to load a random question with context
|
|
@@ -372,6 +420,8 @@ with gr.Blocks(theme=gr.themes.Default(
|
|
| 372 |
# Model comparison section - initially hidden
|
| 373 |
with gr.Column(visible=False, elem_id="model-section") as model_section:
|
| 374 |
gr.Markdown("---")
|
|
|
|
|
|
|
| 375 |
gr.Markdown("### 🔍 Compare Models - Are these Grounded, Complete Answers or Correct Rejections?", elem_classes="section-heading")
|
| 376 |
|
| 377 |
with gr.Row(elem_id="summary-containers"):
|
|
@@ -395,6 +445,15 @@ with gr.Blocks(theme=gr.themes.Default(
|
|
| 395 |
autoscroll=False,
|
| 396 |
elem_id="summary-b-display"
|
| 397 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
|
| 399 |
# Voting section - initially hidden
|
| 400 |
with gr.Column(visible=False, elem_id="voting-section") as voting_section:
|
|
@@ -431,7 +490,7 @@ with gr.Blocks(theme=gr.themes.Default(
|
|
| 431 |
|
| 432 |
with gr.TabItem("Leaderboard", id="leaderboard-tab"):
|
| 433 |
gr.Markdown("# SLM RAG Leaderboard", elem_classes="orange-title")
|
| 434 |
-
gr.
|
| 435 |
|
| 436 |
with gr.Group(elem_id="leaderboard-info"):
|
| 437 |
gr.Markdown("""### About Elo Ratings
|
|
@@ -458,6 +517,13 @@ The Elo rating system provides a more accurate ranking than simple win rates:
|
|
| 458 |
inputs=[current_example, show_full_context],
|
| 459 |
outputs=[show_full_context, context_display, context_toggle_btn]
|
| 460 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
|
| 462 |
# Initialize UI to empty state on load
|
| 463 |
demo.load(
|
|
@@ -497,6 +563,11 @@ The Elo rating system provides a more accurate ranking than simple win rates:
|
|
| 497 |
inputs=[current_example],
|
| 498 |
outputs=[query_display, context_description, context_display,
|
| 499 |
context_toggle_btn, show_full_context]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
).then(
|
| 501 |
# IMPORTANT: Explicitly hide FAQ here
|
| 502 |
fn=hide_faq_section,
|
|
@@ -541,6 +612,11 @@ The Elo rating system provides a more accurate ranking than simple win rates:
|
|
| 541 |
inputs=[current_example],
|
| 542 |
outputs=[query_display, context_description, context_display,
|
| 543 |
context_toggle_btn, show_full_context]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
).then(
|
| 545 |
# IMPORTANT: Explicitly hide FAQ here too
|
| 546 |
fn=hide_faq_section,
|
|
|
|
| 7 |
import numpy as np
|
| 8 |
from utils.data_loader import get_random_example
|
| 9 |
from utils.models import generate_summaries, model_names
|
| 10 |
+
from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html, toggle_reference_answer
|
| 11 |
from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
|
| 12 |
from utils.vote_logger import save_vote_details
|
| 13 |
from utils.shared import generation_interrupt
|
| 14 |
|
| 15 |
feedback_options = {
|
| 16 |
+
"left": [
|
| 17 |
+
"Model A: Answers the question completely",
|
| 18 |
+
"Model A: Information is accurate and correct",
|
| 19 |
+
"Model A: Stays on topic and relevant",
|
| 20 |
+
"Model A: Clear and well-written response",
|
| 21 |
+
"Model A: Appropriately says 'I don't know' without enough info",
|
| 22 |
+
"Model A: Asks helpful follow-up questions when unclear"
|
| 23 |
+
],
|
| 24 |
+
"right": [
|
| 25 |
+
"Model B: Answers the question completely",
|
| 26 |
+
"Model B: Information is accurate and correct",
|
| 27 |
+
"Model B: Stays on topic and relevant",
|
| 28 |
+
"Model B: Clear and well-written response",
|
| 29 |
+
"Model B: Appropriately says 'I don't know' without enough info",
|
| 30 |
+
"Model B: Asks helpful follow-up questions when unclear"
|
| 31 |
+
],
|
| 32 |
+
"tie": [
|
| 33 |
+
"Model A: Answers the question completely",
|
| 34 |
+
"Model A: Information is accurate and correct",
|
| 35 |
+
"Model A: Stays on topic and relevant",
|
| 36 |
+
"Model A: Clear and well-written response",
|
| 37 |
+
"Model A: Appropriately says 'I don't know' without enough info",
|
| 38 |
+
"Model A: Asks helpful follow-up questions when unclear",
|
| 39 |
+
"Model B: Answers the question completely",
|
| 40 |
+
"Model B: Information is accurate and correct",
|
| 41 |
+
"Model B: Stays on topic and relevant",
|
| 42 |
+
"Model B: Clear and well-written response",
|
| 43 |
+
"Model B: Appropriately says 'I don't know' without enough info",
|
| 44 |
+
"Model B: Asks helpful follow-up questions when unclear"
|
| 45 |
+
],
|
| 46 |
+
"neither": [
|
| 47 |
+
"Model A: Incomplete or missing key information",
|
| 48 |
+
"Model A: Contains incorrect or made-up information",
|
| 49 |
+
"Model A: Goes off-topic or irrelevant",
|
| 50 |
+
"Model A: Poorly written or confusing",
|
| 51 |
+
"Model A: Should have admitted uncertainty without enough info",
|
| 52 |
+
"Model A: Should have asked clarifying questions but didn't",
|
| 53 |
+
"Model B: Incomplete or missing key information",
|
| 54 |
+
"Model B: Contains incorrect or made-up information",
|
| 55 |
+
"Model B: Goes off-topic or irrelevant",
|
| 56 |
+
"Model B: Poorly written or confusing",
|
| 57 |
+
"Model B: Should have admitted uncertainty without enough info",
|
| 58 |
+
"Model B: Should have asked clarifying questions but didn't"
|
| 59 |
+
]
|
| 60 |
}
|
| 61 |
|
| 62 |
def weighted_sample_without_replacement(population, weights, k=2):
|
|
|
|
| 286 |
gr.update(visible=False), # feedback_section
|
| 287 |
gr.update(interactive=False), # submit_button
|
| 288 |
gr.update(visible=False), # results_reveal_area
|
| 289 |
+
gr.update(interactive=True), # random_question_btn - KEEP ACTIVE during inference
|
| 290 |
None # Reset selected_winner
|
| 291 |
]
|
| 292 |
|
|
|
|
| 306 |
False
|
| 307 |
]
|
| 308 |
|
| 309 |
+
def reset_reference_section():
|
| 310 |
+
"""Reset reference answer section to hidden state when loading new question"""
|
| 311 |
+
return [
|
| 312 |
+
False, # Reset show_reference_answer state to False
|
| 313 |
+
gr.update(visible=False), # Hide reference content (like FAQ)
|
| 314 |
+
gr.update(value="▶ Show Reference Answer"), # Reset button text (like FAQ)
|
| 315 |
+
gr.update(value="") # Clear reference content
|
| 316 |
+
]
|
| 317 |
+
|
| 318 |
def cleanup_on_disconnect():
|
| 319 |
print(f"Browser disconnected. Cleaning up resources...")
|
| 320 |
generation_interrupt.set()
|
|
|
|
| 368 |
show_results_state = gr.State(False)
|
| 369 |
results_agg = gr.State(load_leaderboard_data())
|
| 370 |
show_full_context = gr.State(False)
|
| 371 |
+
show_reference_answer = gr.State(False) # NEW: State for reference answer toggle
|
| 372 |
faq_expanded = gr.State(False) # State for FAQ toggle
|
| 373 |
|
| 374 |
with gr.Tabs() as tabs:
|
| 375 |
with gr.TabItem("Arena", id="arena-tab"):
|
| 376 |
+
gr.Markdown("# SLM RAG Arena - Compare and Find The Best Sub-5B Models for RAG")
|
| 377 |
gr.Markdown("""
|
| 378 |
+
🏟️ This arena evaluates how well small language models (under 5B) answer questions based on document contexts.
|
| 379 |
|
| 380 |
📝 Instructions:
|
| 381 |
- **Click the "Get a Question" button** to load a random question with context
|
|
|
|
| 420 |
# Model comparison section - initially hidden
|
| 421 |
with gr.Column(visible=False, elem_id="model-section") as model_section:
|
| 422 |
gr.Markdown("---")
|
| 423 |
+
|
| 424 |
+
# NEW: Model comparison header (simple)
|
| 425 |
gr.Markdown("### 🔍 Compare Models - Are these Grounded, Complete Answers or Correct Rejections?", elem_classes="section-heading")
|
| 426 |
|
| 427 |
with gr.Row(elem_id="summary-containers"):
|
|
|
|
| 445 |
autoscroll=False,
|
| 446 |
elem_id="summary-b-display"
|
| 447 |
)
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
# NEW: Reference Answer Toggle (exactly like FAQ style)
|
| 451 |
+
with gr.Row(elem_id="reference-toggle-row"):
|
| 452 |
+
reference_toggle_btn = gr.Button("▶ Show Reference Answer", elem_classes=["faq-toggle-button"])
|
| 453 |
+
|
| 454 |
+
# Reference Answer Content - initially hidden (exactly like FAQ)
|
| 455 |
+
with gr.Row(visible=False, elem_id="reference-content") as reference_content:
|
| 456 |
+
reference_answer_display = gr.Markdown("", elem_classes="faq-text")
|
| 457 |
|
| 458 |
# Voting section - initially hidden
|
| 459 |
with gr.Column(visible=False, elem_id="voting-section") as voting_section:
|
|
|
|
| 490 |
|
| 491 |
with gr.TabItem("Leaderboard", id="leaderboard-tab"):
|
| 492 |
gr.Markdown("# SLM RAG Leaderboard", elem_classes="orange-title")
|
| 493 |
+
gr.HTML('View performance statistics for all models ranked by Elo rating. <br><br><a href="https://docs.google.com/forms/d/e/1FAIpQLSeUZoy43MlpK8-tJS4a6n5Q8PAKf-8Twdui5ybU18t0e2UuVA/viewform" class="form-link" target="_blank" rel="noopener noreferrer">Submit a new model request</a>')
|
| 494 |
|
| 495 |
with gr.Group(elem_id="leaderboard-info"):
|
| 496 |
gr.Markdown("""### About Elo Ratings
|
|
|
|
| 517 |
inputs=[current_example, show_full_context],
|
| 518 |
outputs=[show_full_context, context_display, context_toggle_btn]
|
| 519 |
)
|
| 520 |
+
|
| 521 |
+
# NEW: Reference answer toggle functionality (exactly like FAQ)
|
| 522 |
+
reference_toggle_btn.click(
|
| 523 |
+
fn=toggle_reference_answer,
|
| 524 |
+
inputs=[show_reference_answer, current_example],
|
| 525 |
+
outputs=[show_reference_answer, reference_content, reference_toggle_btn, reference_answer_display]
|
| 526 |
+
)
|
| 527 |
|
| 528 |
# Initialize UI to empty state on load
|
| 529 |
demo.load(
|
|
|
|
| 563 |
inputs=[current_example],
|
| 564 |
outputs=[query_display, context_description, context_display,
|
| 565 |
context_toggle_btn, show_full_context]
|
| 566 |
+
).then(
|
| 567 |
+
# NEW: Reset reference section when loading new question
|
| 568 |
+
fn=reset_reference_section,
|
| 569 |
+
inputs=[],
|
| 570 |
+
outputs=[show_reference_answer, reference_content, reference_toggle_btn, reference_answer_display]
|
| 571 |
).then(
|
| 572 |
# IMPORTANT: Explicitly hide FAQ here
|
| 573 |
fn=hide_faq_section,
|
|
|
|
| 612 |
inputs=[current_example],
|
| 613 |
outputs=[query_display, context_description, context_display,
|
| 614 |
context_toggle_btn, show_full_context]
|
| 615 |
+
).then(
|
| 616 |
+
# NEW: Reset reference section when trying another question
|
| 617 |
+
fn=reset_reference_section,
|
| 618 |
+
inputs=[],
|
| 619 |
+
outputs=[show_reference_answer, reference_content, reference_toggle_btn, reference_answer_display]
|
| 620 |
).then(
|
| 621 |
# IMPORTANT: Explicitly hide FAQ here too
|
| 622 |
fn=hide_faq_section,
|